Looping multiple excel files with changing names using R

731 views Asked by At

I have multiple excel files named "Copy of 2003_BY_HR.xls", "Copy of 2004_BY_HR.xls"..."Copy of 2010_BY_HR.xls" etc. The file naming is unique and only the year changes in the name. The worksheets also have the same name. I am using readxl package to read in the data. I am able to read and output the data for each file by simply changing the year. I would like to efficiently achieve this automatically instead of manually changing the file name and re-running the script. My piece of script that works is shown below.

setwd("Data")
library(readxl)

# I read in the first file
dataf<-"Copy of 2003_BY_HR.xls"
ICA <- read_excel(dataf, 
              sheet = "ICA_HR_2003")
ET  <- read_excel(dataf, 
              sheet = "ET_HR_2003", na ="0")

# I read the data from first sheet and retrieve variable of interest
Grain_ICA         <- ICA$`Grain ICA`
Rice_ICA          <- ICA$`Rice ICA`
Cotton_ICA        <- ICA$`Cotton ICA`

# I read the data from second sheet and retrieve variable of interest
Grain_ET         <-ET$`Grain ET WA`
Rice_ET          <-ET$`Rice ET WA`
Cotton_ET        <-ET$`Cotton ET WA`

# I compute the results and save to a single file by appending
ET_grain  <- sum(Grain_ICA * Grain_ET * 1000, na.rm=T)
ET_rice   <- sum(Rice_ICA * Rice_ET *1000, na.rm=T)
ET_cotton <- sum(Cotton_ICA * Cotton_ET *1000, na.rm=T)

result <- data.frame( ET_grain,ET_rice,ET_cotton)
colnames(result) <- c("Grain","Rice","Cotton")
write.table(result, file = "ET.csv", append=T, sep=',', 
        row.names=F, 
        col.names=F)
2

There are 2 answers

1
Arktik On BEST ANSWER

You should use list.files to make a list of input files, you can use patterns to match file names. Then you put all your stuff above into a single function, and then use lapply to apply that function to the list of file names.

setwd("Data")
library(readxl)

# I read in the first file
files.lst <- list.files("./", pattern = "Copy of .*\\.xls")
files.lst


MyFun <- function(x) {
  dataf <- x
  ICA <- read_excel(dataf, 
                    sheet = "ICA_HR_2003")
  ET  <- read_excel(dataf, 
                    sheet = "ET_HR_2003", na ="0")

  # I read the data from first sheet and retrieve variable of interest
  Grain_ICA         <- ICA$`Grain ICA`
  Rice_ICA          <- ICA$`Rice ICA`
  Cotton_ICA        <- ICA$`Cotton ICA`

  # I read the data from second sheet and retrieve variable of interest
  Grain_ET         <-ET$`Grain ET WA`
  Rice_ET          <-ET$`Rice ET WA`
  Cotton_ET        <-ET$`Cotton ET WA`

  # I compute the results and save to a single file by appending
  ET_grain  <- sum(Grain_ICA * Grain_ET * 1000, na.rm=T)
  ET_rice   <- sum(Rice_ICA * Rice_ET *1000, na.rm=T)
  ET_cotton <- sum(Cotton_ICA * Cotton_ET *1000, na.rm=T)

  colnames(result) <- c("Grain","Rice","Cotton")
  result.file <- paste0(basename(dataf), ".result.csv")
  write.table(result, file = result.file, append=T, sep=',', 
              row.names=F, 
              col.names=F)

}

res <- lapply(files.lst, MyFun)

Generally, I would create a list objects and return results from the function rather than saving files inside that function. data.table is a great package, it has a method called rbindlist that can be used to convert your results list into a singe table, especially if they have the same columns. Easy to use with ggplot as well.

0
Jakub.Novotny On

I created a vector in which there need to be all years that are used as part of the names of the files. Then I changed what you do to be a function in which you can specify the input year. Finally, I loop through all years in the created vector:

setwd("Data")
library(readxl)

# here you need to list all years that appear in the names of the files
years <- c("2003", "2004")


myFunction <- function(.year){
  # I read in the first file
  dataf <- paste0("Copy of ", .year, "_BY_HR.xls")
  ICA <- read_excel(dataf, 
                    sheet = paste0("ICA_HR_", .year)
  ET  <- read_excel(dataf, 
                    sheet = paste0("ET_HR_", .year), na ="0")
  
  # I read the data from first sheet and retrieve variable of interest
  Grain_ICA         <- ICA$`Grain ICA`
  Rice_ICA          <- ICA$`Rice ICA`
  Cotton_ICA        <- ICA$`Cotton ICA`
  
  # I read the data from second sheet and retrieve variable of interest
  Grain_ET         <-ET$`Grain ET WA`
  Rice_ET          <-ET$`Rice ET WA`
  Cotton_ET        <-ET$`Cotton ET WA`
  
  # I compute the results and save to a single file by appending
  ET_grain  <- sum(Grain_ICA * Grain_ET * 1000, na.rm=T)
  ET_rice   <- sum(Rice_ICA * Rice_ET *1000, na.rm=T)
  ET_cotton <- sum(Cotton_ICA * Cotton_ET *1000, na.rm=T)
  
  result <- data.frame( ET_grain,ET_rice,ET_cotton)
  colnames(result) <- c("Grain","Rice","Cotton")
  write.table(result, file = "ET.csv", append=T, sep=',', 
              row.names=F, 
              col.names=F)
}

# this loops through all the years
for (year in seq_along(years)) {
  myFunction(year)
}