I am creating a panel data frame. It is a panel of schools. To this panel I want to merge the first closest weather station, then the second, third, etc until the 10th closest one. I wrote a loop that does this for different variables: maximum temperature, minimum temperature, precipitation, etc. The issue that I am having is that it seems that I am unnecessarily allocating memory somewhere inside this loop since I run out of memory.

I know I have enough memory to create the panel since I did it once already without the loop. I am working on windows on 64 bit with 8gb of RAM. I have a sample of 7800 schools, and 800 weather stations for the 2010-2015 period.

This is a reproducible example with only 5 schools, 10 weather stations and 2 months of data and matching only the 3 closest stations. The real example is 7800 schools, 800 weather stations, 5 years of data and matching the 10 closest stations.

library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                      ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                      ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))

years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")

#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
  dist<-paste0("Dist_",i)
  dist<-get(dist)
  dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
  matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
  matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]

  matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
  matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
  matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
  matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
  names(matr)[6]<-paste0(i,"_T_1")
  Sys.sleep(0.1)
  print(i)

  for(n in 2:3) {
    matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
    matr3<-expand.grid(matr2$ID_School,years)

    names(matr3)<-c("ID_School","Date")
    matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
    matr2<-merge(matr3,matr2,by="ID_School")
    rm(matr3)
    Sys.sleep(0.1)
    print(i)

    matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
    matr2<-matr2[c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
    matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")

    names(matr2)[6]<-paste0(i,"_T_",n)
    matr<-merge(matr,matr2[,c("Cod_Merge",
                              paste0("ID_Station_",n),
                              paste0("Dist_",n),
                              paste0(i,"_T_",n))],
                by="Cod_Merge",all.x=T)
    Sys.sleep(0.1)
    print(i)
  }
  assign(paste0("Mat_Dist_",i),matr)
}

Any help would be greatly appreciated.

Solution

For anyone who is interested, I was missing a couple of commas inside the 2nd loop:

library(data.table)
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                          ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                          ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
                      ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
                      ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))

years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")

#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
  dist<-paste0("Dist_",i)
  dist<-get(dist)
  dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
  matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
  matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]

  matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
  matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
  matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
  matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
  names(matr)[6]<-paste0(i,"_T_1")
  Sys.sleep(0.1)
  print(i)

  for(n in 2:3) {
    matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
    matr3<-expand.grid(matr2$ID_School,years)

    names(matr3)<-c("ID_School","Date")
    matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
    matr2<-merge(matr3,matr2,by="ID_School")
    rm(matr3)
    Sys.sleep(0.1)
    print(i)

    matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
    matr2<-matr2[,c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
    matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
    matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")

    names(matr2)[6]<-paste0(i,"_T_",n)
    matr<-merge(matr,matr2[,c("Cod_Merge",
                              paste0("ID_Station_",n),
                              paste0("Dist_",n),
                              paste0(i,"_T_",n))],
                by="Cod_Merge",all.x=T)
    Sys.sleep(0.1)
    print(i)
  }
  assign(paste0("Mat_Dist_",i),matr)
}

1 Answers

0
Evan Friedland On Best Solutions

It seems that all your code would need to do is find the 10 closest stations to each school, and then you simply subset the station data to the school (don't know anything about your dates).

Your final data frame should be way better and easier to use -- probably instead of 3 separate wide data frames it should look like this:

set.seed(1)  # FAKE DATA
final <- data.frame(ID_School = rep(LETTERS[1],10), ID_Station = sample(1:100,10), 
                   Closeness_Rank = 1:10, Distance = 10*(1:10) + sample(-5:5,10), 
                   Temp.Max = sample(70:100,10), Temp.Min = sample(30:69,10), 
                   Precipitation = sample(20:30,10)/100)
final
#   ID_School ID_Station Closeness_Rank Distance Temp.Max Temp.Min Precipitation
#1          A         27              1        7       98       49          0.29
#2          A         37              2       16       76       53          0.26
#3          A         57              3       31       88       48          0.27
#4          A         89              4       38       73       36          0.24
#5          A         20              5       50       77       59          0.23
#6          A         86              6       65       80       68          0.28
#7          A         97              7       72       70       57          0.20
#8          A         62              8       79       79       33          0.21
#9          A         58              9       94       90       64          0.22
#10         A          6             10      103       96       42          0.30

Without knowing how you measure your distances for the station and school data or other information, I can't help you get to this format but if you provide more information I would be happy to help.

EDIT:

This method seems to be very slow as I'm not really using data.tables correctly but hopefully it should give you some ideas. I've generated the fake data in a way that might be useful for you for explaining your question in the future. My method is to only build the FINAL output, a day-school data.table of weather data derived from averaging the closest 10 stations that have data weighted by inverse distance.

The process is super slow, at ~ 7800 school weather's calculated in 5 minutes for a single day... so 6 and half days to complete for 5 years -- but no memory issues! This is the kind of code you would post and ask if someone can improve the speed.

# Starting from the beginning
set.seed(100)
library(data.table)

n_station <- 800
n_school <- 7800
station_info <- data.frame(ID_Station = 1:n_station, 
           xcoord = sample(-10000:10000,n_station), 
           ycoord = sample(-10000:10000,n_station))

school_info <- data.frame(ID_School = 1:n_school, 
           xcoord = sample(-10000:10000,n_school), 
           ycoord = sample(-10000:10000,n_school))

# save list of ~20 closest stations by school, 
# and always use 10 of the closest where measurements are available
x <- 20 
L <- vector('list', nrow(school_info)) # always initialize for speed
for(i in 1:nrow(school_info)){
    distances <- sqrt((school_info[i,"xcoord"] - station_info[,"xcoord"])^2 + 
                      (school_info[i,"ycoord"] - station_info[,"ycoord"])^2)
    L[[i]] <- cbind.data.frame(ID_School = rep(school_info[i,"ID_School"],x),
                               ID_Station = station_info[ which(order(distances) <= x), 
                                                         "ID_Station"],
                               Distance_Rank = 1:x,
                               Distance = sort(distances)[1:x])
}
L[[1]]
#        ID_School ID_Station Distance_Rank  Distance
# 1:         1          2             1  127.2242
# 2:         1         32             2  365.7896
# 3:         1         92             3  573.0428
# 4:         1        141             4  763.5837
# 5:         1        151             5 1003.4127

For 5 years of daily Fake Weather Data:

days <- seq.Date(as.Date("2010-01-01"),as.Date("2015-12-31"),by="1 day")
d <- length(days)
S <- vector('list', nrow(station_info))
for(i in 1:nrow(station_info)){
  S[[i]] <- data.frame(ID_Station = rep(station_info[i,"ID_Station"],d),
                       Temp.Max = sample(70:100,d,T),
                       Temp.Min = sample(30:69,d,T), 
                       Precipitation = sample(20:30,d,T)/100,
                       date = days)
  # maybe remove some dates at random
  if(sample(c(T,F),1)) S[[i]] <- S[[i]][-sample(1:d,1),]
}
station_data <- as.data.table(do.call(rbind,S))
station_data
#        ID_Station Temp.Max Temp.Min Precipitation       date
#     1:          1       88       55          0.23 2010-01-01
#     2:          1       73       57          0.24 2010-01-02
#     3:          1       93       33          0.29 2010-01-03
#     4:          1       81       52          0.27 2010-01-04
#     5:          1       82       48          0.24 2010-01-05
#    ---                                                      
#291610:        800       86       31          0.28 2010-12-27
#291611:        800       98       57          0.22 2010-12-28
#291612:        800       71       50          0.26 2010-12-29
#291613:        800       83       35          0.26 2010-12-30
#291614:        800       71       34          0.23 2010-12-31

The algorithm:

size <- length(days) * n_school
#OUT <- data.table(ID_School = integer(size),
#                  date = as.Date(x = integer(size), origin = "1970-01-01"),
#                  wtd_Temp.Max= numeric(size),
#                  wtd_Temp.Min= numeric(size),
#                  wtd_Precip= numeric(size))
OUT <- vector('list',size) # faster

unique_school <- unique(school_data$ID_School) # will be length(n_school)
#length(L) is the same as length(unique(school)= n_school)

count = 0
for(i in 1:length(days)){
  t1 <- Sys.time()
  temp_weather_data = station_data[date==days[i],]
  m <- merge(school_data, temp_weather_data, "ID_Station")
setkey(m, ID_School) # the key is ID_School
  for(j in 1:length(unique_school)){
    count = count + 1
    # assuming within the closest 20 stations, at least 10 have data every day
    r <- m[.(j),][1:10] # find schools j in key
    invd <- 1/r$Distance
    sum.invd <- sum(invd)
    OUT[[count]] <- data.table(ID_School = unique_school[j], 
                               date = days[i], 
                               wtd_Temp.Max = sum(invd * r$Temp.Max)/sum.invd,
                               wtd_Temp.Min = sum(invd * r$Temp.Min)/sum.invd,
                               wtd_Precip = sum(invd * r$Precipitation)/sum.invd)
  if(j %% 100 == 0) cat(as.character(days[i]),".....",unique_school[j],"...\n")
  }
  cat(Sys.time()-t1)
}

Which gives the final output:

do.call(rbind,OUT)
#    ID_School       date wtd_Temp.Max wtd_Temp.Min wtd_Precip
# 1:         1 2010-01-01     88.64974     44.07872  0.2757571
# 2:         2 2010-01-01     83.34549     46.80225  0.2511073
# 3:         3 2010-01-01     85.32834     48.62004  0.2347837
# 4:         4 2010-01-01     82.95667     48.01814  0.2576482
# 5:         5 2010-01-01     87.88982     44.45357  0.2527794
# ---