Turning a PDF into a dataframe using pdf_data() from pdftools

50 views Asked by At

Dear StackOverflow community,

Please consider this PDF file as an example.

The file contains metadata on several variables of my datasets, and I am trying to scrape it to build a dataframe storing this information (name, description, input variable, type, etc.) as systematically as possible. The final product I'm trying to obtain is:

structure(list(name = c("VARNAME1", "VARNAME3", "VARNAME4", "VARNAME8"
), description = c("The description of the variable is here. Sometimes the description mentions variables like VARNAME6.", 
"Another variable description is here.", "Another variable description is here.", 
"Another variable description is here."), input = c("VARNAME2", 
"VARNAME10", "VARNAME15", "VARNAME123"), type = c("Categorical", 
"Categorical", "Numerical", "Categorical"), categories = c("01 = First category, 02 = Second category, 03 = Third category", 
"01 = Category 1, 02 = Category 2, 03 = Category 3, 04 = Category 4, 05 = Category 5", 
NA, "01 = Yes, 02 = No, 03 = I don't know/Missing"), format = c(NA, 
NA, "Integer", NA), minimum = c(NA, NA, "XX", NA), maximum = c(NA, 
NA, "XXX", NA)), class = "data.frame", row.names = c(NA, -4L))

Using the function pdf_text() from the pdftools package, I assumed that every word written in uppercase on a line of its own was a variable name, so I stored the index of every match in a vector and worked between element i and element i+1 to isolate each piece of metadata separately. However, there are cases like VARNAME1 in the example, where the description doesn't fit on one line and the last line of the sentence contains only a variable name (e.g., VARNAME6 here).

I must now use the function pdf_data() from the same pdftools package, as it gives information on the font size (bigger for variable names) and boldness (bold for variable names and descriptions) thanks to the option font_info = TRUE. But I'm not sure I understand how I can make this work, as the output of the function is a list of n dataframes (where n = number of pages), with each observation being one word/textbox.

Here's the output of the pdf_data function for my example pdf:

list(structure(list(width = c(68L, 18L, 55L, 10L, 16L, 40L, 7L, 
25L, 55L, 16L, 55L, 47L, 44L, 17L, 61L, 25L, 38L, 5L, 57L, 16L, 
9L, 31L, 19L, 10L, 37L, 21L, 42L, 35L, 42L, 25L, 42L, 12L, 12L, 
12L, 68L, 41L, 40L, 55L, 7L, 25L, 25L, 38L, 5L, 63L, 16L, 9L, 
28L, 12L, 12L, 12L, 12L, 12L, 43L, 6L, 43L, 6L, 43L, 6L, 43L, 
6L, 43L, 6L, 68L, 41L, 40L, 55L, 7L, 25L, 25L, 38L, 5L, 63L, 
50L, 38L, 38L, 35L, 50L, 12L, 52L, 18L, 68L, 41L, 40L, 55L, 25L, 
25L, 38L, 5L, 69L, 12L, 16L), height = c(14L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 14L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 14L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 14L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), x = c(70L, 70L, 
92L, 150L, 163L, 182L, 225L, 236L, 265L, 323L, 342L, 401L, 451L, 
498L, 70L, 70L, 99L, 141L, 149L, 70L, 90L, 102L, 303L, 325L, 
338L, 303L, 327L, 303L, 341L, 303L, 331L, 76L, 76L, 76L, 70L, 
70L, 114L, 157L, 216L, 226L, 70L, 99L, 141L, 149L, 70L, 90L, 
102L, 76L, 76L, 76L, 76L, 76L, 305L, 352L, 305L, 352L, 305L, 
352L, 305L, 352L, 305L, 352L, 70L, 70L, 114L, 157L, 216L, 226L, 
70L, 99L, 141L, 149L, 70L, 123L, 70L, 112L, 70L, 124L, 70L, 126L, 
70L, 70L, 114L, 157L, 216L, 70L, 99L, 141L, 149L, 178L, 303L), 
    y = c(73L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 
    99L, 99L, 99L, 115L, 139L, 139L, 139L, 139L, 163L, 163L, 
    163L, 187L, 187L, 187L, 202L, 202L, 217L, 217L, 232L, 232L, 
    202L, 217L, 232L, 272L, 298L, 298L, 298L, 298L, 298L, 322L, 
    322L, 322L, 322L, 345L, 345L, 345L, 375L, 390L, 406L, 421L, 
    436L, 375L, 375L, 390L, 390L, 406L, 406L, 421L, 421L, 436L, 
    436L, 478L, 504L, 504L, 504L, 504L, 504L, 528L, 528L, 528L, 
    528L, 551L, 551L, 575L, 575L, 599L, 599L, 623L, 623L, 671L, 
    697L, 697L, 697L, 697L, 721L, 721L, 721L, 721L, 745L, 745L
    ), space = c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
    TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, 
    FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, 
    TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
    TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, 
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, 
    FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, 
    TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, 
    TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, 
    TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE), text = c("VARNAME1", 
    "The", "description", "of", "the", "variable", "is", "here.", 
    "Sometimes", "the", "description", "mentions", "variables", 
    "like", "VARNAME6.", "Input", "variable", "=", "VARNAME2", 
    "List", "of", "codes:", "N/A", "or", "Missing", "First", 
    "category", "Second", "category", "Third", "category", "01", 
    "02", "03", "VARNAME3", "Another", "variable", "description", 
    "is", "here.", "Input", "variable", "=", "VARNAME10", "List", 
    "of", "codes", "01", "02", "03", "04", "05", "Category", 
    "1", "Category", "2", "Category", "3", "Category", "4", "Category", 
    "5", "VARNAME4", "Another", "variable", "description", "is", 
    "here.", "Input", "variable", "=", "VARNAME15", "Numerical", 
    "variable", "Format:", "Integer", "Minimum:", "XX", "Maximum:", 
    "XXX", "VARNAME8", "Another", "variable", "description", 
    "here.", "Input", "variable", "=", "VARNAME123", "01", "Yes"
    ), font_name = c("BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", 
    "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri"), font_size = c(14.04, 12, 12, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14.04, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14.04, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 14.04, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -91L)), structure(list(
    width = c(12L, 12L, 14L, 3L, 26L, 68L), height = c(12L, 12L, 
    12L, 12L, 12L, 12L), x = c(76L, 76L, 303L, 303L, 308L, 337L
    ), y = c(73L, 88L, 73L, 88L, 88L, 88L), space = c(FALSE, 
    FALSE, FALSE, TRUE, TRUE, FALSE), text = c("02", "03", "No", 
    "I", "don’t", "know/Missing"), font_name = c("BCDFEE+Calibri", 
    "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", 
    "BCDFEE+Calibri"), font_size = c(12, 12, 12, 12, 12, 12)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -6L)))

I am of course open to other methods of scraping PDFs; however, I'm just a beginner in R and have never had any experience with any other software/language. Any help would be appreciated!

0

There are 0 answers