Dear StackOverflow community,
Please consider this PDF file as an example.
The file contains metadata on several variables in my datasets, and I am trying to scrape it to build a data frame that stores this information (name, description, input variable, type, etc.) as systematically as possible. The final result I'm trying to obtain is:
structure(list(name = c("VARNAME1", "VARNAME3", "VARNAME4", "VARNAME8"
), description = c("The description of the variable is here. Sometimes the description mentions variables like VARNAME6.",
"Another variable description is here.", "Another variable description is here.",
"Another variable description is here."), input = c("VARNAME2",
"VARNAME10", "VARNAME15", "VARNAME123"), type = c("Categorical",
"Categorical", "Numerical", "Categorical"), categories = c("01 = First category, 02 = Second category, 03 = Third category",
"01 = Category 1, 02 = Category 2, 03 = Category 3, 04 = Category 4, 05 = Category 5",
NA, "01 = Yes, 02 = No, 03 = I don't know/Missing"), format = c(NA,
NA, "Integer", NA), minimum = c(NA, NA, "XX", NA), maximum = c(NA,
NA, "XXX", NA)), class = "data.frame", row.names = c(NA, -4L))
Using the function `pdf_text()` from the package `pdftools`, I assumed that every uppercase word standing alone on a single line was a variable name, so I stored the index of every match in a vector and worked between element i and element i+1 to isolate each piece of metadata separately. However, there are cases like VARNAME1 in the example, where the description doesn't fit on one line and the wrapped last line contains only a variable name (e.g., VARNAME6 here).
I must now use the function `pdf_data()` from the same package `pdftools`, as it gives information on the font size (bigger for variable names) and boldness (bold for variable names and descriptions) thanks to the option `font_info = TRUE`. But I'm not sure I understand how I can make this work, as the output of the function is a list of n data frames (where n = number of pages), with each observation being one word/text box.
Here's the output of the `pdf_data()` function for my example PDF:
list(structure(list(width = c(68L, 18L, 55L, 10L, 16L, 40L, 7L,
25L, 55L, 16L, 55L, 47L, 44L, 17L, 61L, 25L, 38L, 5L, 57L, 16L,
9L, 31L, 19L, 10L, 37L, 21L, 42L, 35L, 42L, 25L, 42L, 12L, 12L,
12L, 68L, 41L, 40L, 55L, 7L, 25L, 25L, 38L, 5L, 63L, 16L, 9L,
28L, 12L, 12L, 12L, 12L, 12L, 43L, 6L, 43L, 6L, 43L, 6L, 43L,
6L, 43L, 6L, 68L, 41L, 40L, 55L, 7L, 25L, 25L, 38L, 5L, 63L,
50L, 38L, 38L, 35L, 50L, 12L, 52L, 18L, 68L, 41L, 40L, 55L, 25L,
25L, 38L, 5L, 69L, 12L, 16L), height = c(14L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 14L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 14L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 14L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), x = c(70L, 70L,
92L, 150L, 163L, 182L, 225L, 236L, 265L, 323L, 342L, 401L, 451L,
498L, 70L, 70L, 99L, 141L, 149L, 70L, 90L, 102L, 303L, 325L,
338L, 303L, 327L, 303L, 341L, 303L, 331L, 76L, 76L, 76L, 70L,
70L, 114L, 157L, 216L, 226L, 70L, 99L, 141L, 149L, 70L, 90L,
102L, 76L, 76L, 76L, 76L, 76L, 305L, 352L, 305L, 352L, 305L,
352L, 305L, 352L, 305L, 352L, 70L, 70L, 114L, 157L, 216L, 226L,
70L, 99L, 141L, 149L, 70L, 123L, 70L, 112L, 70L, 124L, 70L, 126L,
70L, 70L, 114L, 157L, 216L, 70L, 99L, 141L, 149L, 178L, 303L),
y = c(73L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L, 99L,
99L, 99L, 99L, 115L, 139L, 139L, 139L, 139L, 163L, 163L,
163L, 187L, 187L, 187L, 202L, 202L, 217L, 217L, 232L, 232L,
202L, 217L, 232L, 272L, 298L, 298L, 298L, 298L, 298L, 322L,
322L, 322L, 322L, 345L, 345L, 345L, 375L, 390L, 406L, 421L,
436L, 375L, 375L, 390L, 390L, 406L, 406L, 421L, 421L, 436L,
436L, 478L, 504L, 504L, 504L, 504L, 504L, 528L, 528L, 528L,
528L, 551L, 551L, 575L, 575L, 599L, 599L, 623L, 623L, 671L,
697L, 697L, 697L, 697L, 721L, 721L, 721L, 721L, 745L, 745L
), space = c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE,
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE,
TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE,
TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE,
TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE,
TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE), text = c("VARNAME1",
"The", "description", "of", "the", "variable", "is", "here.",
"Sometimes", "the", "description", "mentions", "variables",
"like", "VARNAME6.", "Input", "variable", "=", "VARNAME2",
"List", "of", "codes:", "N/A", "or", "Missing", "First",
"category", "Second", "category", "Third", "category", "01",
"02", "03", "VARNAME3", "Another", "variable", "description",
"is", "here.", "Input", "variable", "=", "VARNAME10", "List",
"of", "codes", "01", "02", "03", "04", "05", "Category",
"1", "Category", "2", "Category", "3", "Category", "4", "Category",
"5", "VARNAME4", "Another", "variable", "description", "is",
"here.", "Input", "variable", "=", "VARNAME15", "Numerical",
"variable", "Format:", "Integer", "Minimum:", "XX", "Maximum:",
"XXX", "VARNAME8", "Another", "variable", "description",
"here.", "Input", "variable", "=", "VARNAME123", "01", "Yes"
), font_name = c("BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold",
"BCDEEE+Calibri-Bold", "BCDEEE+Calibri-Bold", "BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri"), font_size = c(14.04, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14.04,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14.04, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 14.04, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -91L)), structure(list(
width = c(12L, 12L, 14L, 3L, 26L, 68L), height = c(12L, 12L,
12L, 12L, 12L, 12L), x = c(76L, 76L, 303L, 303L, 308L, 337L
), y = c(73L, 88L, 73L, 88L, 88L, 88L), space = c(FALSE,
FALSE, FALSE, TRUE, TRUE, FALSE), text = c("02", "03", "No",
"I", "don’t", "know/Missing"), font_name = c("BCDFEE+Calibri",
"BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri", "BCDFEE+Calibri",
"BCDFEE+Calibri"), font_size = c(12, 12, 12, 12, 12, 12)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L)))
I am of course open to other methods of scraping PDFs; however, I'm just a beginner in R and have never had any experience with any other software or language. Any help would be appreciated!