Splitting mutliple columns of table extracted from pdf into multiple columns

37 views Asked by At

I extracted a table from a pdf using

pdf_file <- "UBI Kenya paper.pdf"
for(i in 59:68) {
  table_i <- extract_tables(pdf_file, pages = i) 
  tname <- paste0("table_E.", i-58, "_full")
  assign(tname,table_i)
}

I didn't use as.data.frame because it returned an error message. Instead I did this:

table_E.9_full <- data.table::as.data.table(table_E.9_full)
table_E.9 <- table_E.9_full[-c(10:16),]

This gives me a table which looks like this: enter image description here where the Enterprises and Net Revenues columns are combined into one for every category (Retail, Manufacturing, Transportation, Services) separated by a space.

when what I want is it to look like the original here: enter image description here

How do I split rows 2-9 into two columns each for V2-V5, so that columns titles would be "Retail Trade - # Enterprises", "Retail Trade - Net Revenues", "Manufacturing - # Enterprises", "Manufacturing - Net Revenues", etc., with the correct values in the correct columns?

dput(head(table_E.9))

returns:

structure(list(V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"
), V2 = c("Retail Trade", "# Enterprises Net Revenues", "(1) (2)", 
"3.89�\u0088\u0097�\u0088\u0097�\u0088\u0097 1601.42�\u0088\u0097", 
"[1.28] [824.74]", "2.34�\u0088\u0097�\u0088\u0097 464.60�\u0088\u0097"
), V3 = c("Manufacturing", "# Enterprises Net Revenues", "(3) (4)", 
"0.02 51.90", "[.27] [120.79]", "0.03 17.82"), V4 = c("Transportation", 
"# Enterprises Net Revenues", "(5) (6)", "0.53�\u0088\u0097 100.76", 
"[.29] [85.37]", "-0.12 -3.85"), V5 = c("Services", "# Enterprises Net Revenues", 
"(7) (8)", "0.23 198.64", "[.33] [205.6]", "-0.04 70.95")), row.names = c(NA, 
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001716fd35930>)
1

There are 1 answers

0
int 21h -- Glory to Ukraine -- On BEST ANSWER

Below you will find the full quick and dirty solution.

However, if you need something more robust, you should optimize your scraping routine. Since the question was how to split multiple columns, the answer is - to use the dplyr function separate_wider_delim.

library(tidyverse)

df <- structure(list(V1 = c("", "", "", "Long Term Arm", "", "Short Term Arm"
), V2 = c("Retail Trade", "# Enterprises Net Revenues", "(1) (2)", 
          "3.89*** 601.42*", "[1.28] [824.74]", "2.34** 464.6*"
), V3 = c("Manufacturing", "# Enterprises Net Revenues", "(3) (4)", 
          "0.02 51.90", "[.27] [120.79]", "0.03 17.82"),
V4 = c("Transportation", "# Enterprises Net Revenues", "(5) (6)", "0.53* 100.76", 
"[.29] [85.37]", "-0.12 -3.85"),
V5 = c("Services", "# Enterprises Net Revenues", 
       "(7) (8)", "0.23 198.64", "[.33] [205.6]", "-0.04 70.95")),
row.names = c(NA, -6L), class = c("data.table", "data.frame"))

df_9 <- df[-3:-1,] |>
  separate_wider_delim(cols = V2:V5, names=LETTERS[1:2],
                       delim=" ", names_sep = "")

dfn <- df[1:3,] |>
  mutate(across(V2:V5, ~str_replace_all(.x, "s ", "s_")))|>
  mutate(across(V2:V5, ~str_replace_all(.x, "\\) ", "\\)_")))|>
separate_wider_delim(cols = V2:V5, names=LETTERS[1:2], names_sep = "",
                     delim="_", too_few = "align_start") 

df_names <- c("")
for (j in 2:9) {
  if(is.na(dfn[1,j])) {
    df_names <- c(df_names, paste(dfn[1,j-1], dfn[2,j],dfn[3,j]))
  } else {
    df_names <- c(df_names, paste(dfn[1,j],dfn[2,j],dfn[3,j]))
    }
}
names(df_9) <- df_names


> df_9|>as.data.frame()
                 Retail Trade # Enterprises (1) Retail Trade Net Revenues (2) Manufacturing # Enterprises (3) Manufacturing Net Revenues (4)
1  Long Term Arm                        3.89***                       601.42*                            0.02                          51.90
2                                        [1.28]                      [824.74]                           [.27]                       [120.79]
3 Short Term Arm                         2.34**                        464.6*                            0.03                          17.82
  Transportation # Enterprises (5) Transportation Net Revenues (6) Services # Enterprises (7) Services Net Revenues (8)
1                            0.53*                          100.76                       0.23                    198.64
2                            [.29]                         [85.37]                      [.33]                   [205.6]
3                            -0.12                           -3.85                      -0.04                     70.95
>