interpolate and extrapolate missing columns in a data frame

2k views Asked by At

I have a data frame containing data for every 4 or 5 years. I wish to interpolate data for years not in the data frame and extrapolation data for columns at either end of the data frame.

I have been able to perform the interpolation with the code below. The only issue is that the middle columns get repeated and one copy must be removed. Is there a more efficient way of conducting the interpolation? I am also unsure how to attack the extrapolation. The actual data set contains 12 years (columns) of available data.

Thank you for any advice.

my.data <- read.table(text = '
    y1980  y1985  y1990
     0.10   0.20   0.40
     1.00   2.00   4.00
    10.00  20.00  40.00
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

desired.result <- read.table(text = '
    y1978 y1979 y1980 y1981 y1982 y1983 y1984 y1985 y1986 y1987 y1988 y1989 y1990 y1991 y1992
     0.06  0.08  0.10  0.12  0.14  0.16  0.18  0.20  0.24  0.28  0.32  0.36  0.40  0.44  0.48
     0.60  0.80  1.0   1.2   1.4   1.6   1.8   2.0   2.4   2.8   3.2   3.6   4.0   4.4   4.8
     6     8    10    12    14    16    18    20    24    28    32    36    40    44    48
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
desired.result

# reshape data to form two columns
new.data  <- reshape(my.data, direction="long", 
                     varying = list(seq(1,(ncol(my.data)-1),1), seq(2,(ncol(my.data)-0),1)), 
                     v.names=c("v1", "v2"))

# interpolate every row
interpol  <- t(apply(new.data[,2:3], 1, function(x) approx(x, n = 6)$y))
new.data2 <- data.frame(time = new.data$time, interpol, id = new.data$id)

# reform row:column structure 
my.data2  <- reshape(new.data2, idvar="id", timevar = "time", direction = "wide")

# middle columns are repeated and must be removed
my.data3  <- my.data2[, !names(my.data2) %in% c("X1.2")]
my.data3

    id X1.1  X2.1  X3.1  X4.1  X5.1 X6.1  X2.2  X3.2  X4.2  X5.2 X6.2
1.1  1  0.1  0.12  0.14  0.16  0.18  0.2  0.24  0.28  0.32  0.36  0.4
2.1  2  1.0  1.20  1.40  1.60  1.80  2.0  2.40  2.80  3.20  3.60  4.0
3.1  3 10.0 12.00 14.00 16.00 18.00 20.0 24.00 28.00 32.00 36.00 40.0

A possible alternative for interpolation that does not work:

sapply( seq(1, (ncol(my.data)-1), 1), function(i) {approx(c(my.data[,i], my.data[,i+1]), n = 6)$y } )
2

There are 2 answers

0
Ferdinand.kraft On BEST ANSWER

Here is one alternative formulation.

First a useful function:

tvseq <- function(...)t(Vectorize(seq.default)(...))

Now for interpolation:

years <- as.numeric(gsub("y","",names(my.data)))

d <- diff(years)

L <- lapply(seq(d), function(i) tvseq(from=my.data[,i], to=my.data[,i+1], length.out=d[i]+1)[,-1])

result <- cbind(my.data[,1], do.call(cbind, L))
colnames(result) <- paste0("y",min(years):max(years))

Result:

> result
     y1980 y1981 y1982 y1983 y1984 y1985 y1986 y1987 y1988 y1989 y1990
[1,]   0.1  0.12  0.14  0.16  0.18   0.2  0.24  0.28  0.32  0.36   0.4
[2,]   1.0  1.20  1.40  1.60  1.80   2.0  2.40  2.80  3.20  3.60   4.0
[3,]  10.0 12.00 14.00 16.00 18.00  20.0 24.00 28.00 32.00 36.00  40.0

To add extrapolation, use this:

ylow <- 1978:(min(years)-1)
low <- tvseq(to=result[,1], by=result[,2]-result[,1], length.out=length(ylow)+1)[,1:length(ylow)]
colnames(low) <- paste0("y",ylow)

yhigh <- (max(years)+1):1992
high <- tvseq(from=result[,ncol(result)], by=result[,ncol(result)]-result[,ncol(result)-1], length.out=length(yhigh)+1)[,-1]
colnames(high) <- paste0("y",yhigh)

cbind(low, result, high)

Result:

     y1978 y1979 y1980 y1981 y1982 y1983 y1984 y1985 y1986 y1987 y1988 y1989 y1990 y1991 y1992
[1,]  0.06  0.08   0.1  0.12  0.14  0.16  0.18   0.2  0.24  0.28  0.32  0.36   0.4  0.44  0.48
[2,]  0.60  0.80   1.0  1.20  1.40  1.60  1.80   2.0  2.40  2.80  3.20  3.60   4.0  4.40  4.80
[3,]  6.00  8.00  10.0 12.00 14.00 16.00 18.00  20.0 24.00 28.00 32.00 36.00  40.0 44.00 48.00
0
Henrik On

An alternative for inter- and extrapolation:

library(zoo)
df <- data.frame(t(my.data))
df$yr <- as.numeric(substring(rownames(df), first = 2))
z1 <- zoo(df, order.by = df$yr, frequency = 1)
t1 <- as.ts(x = z1)
t2 <- na.approx(t1)
future <- apply(t2, 2, function(x) tail(x, 1) + diff(tail(x, 2)) * 1:2)
past <- apply(t2, 2, function(x) head(x, 1) - diff(head(x, 2)) * 1:2)
t3 <- rbind(past, t2, future)
t3 <- t3[order(t3[ , "yr"]), ]
t4 <- t(t3)[1:3, ]
colnames(t4) <- paste0("y", t3[ , "yr"])
t4