I have a dataframe including all observation of distances (column DIST.y) and altitude (Z) and corrected altitude by moving average (ma_Z) encompassed within an interval fixed by the column LimAm and LimAv for a given point ID.x for a given distance Dist.x. My purpose do a regression of the DIST.y and the ma_Z for each group of ID.x. Here is the code I use:
df_sl %>%
do({
mod = lm(ma_Z ~ DIST.y, data = .)
data.frame(int = coef(mod)[1], slope = coef(mod)[2])
})
This code works properly.
However, I would like to fill the NA in the column ma_Z by using an interpolation with the funtion zoo::na.approx() before doing the regression.
Unfortunately, for some of the groups ID.x in the dataframe I have a error message saying there is not enough non-NA values for the code to works properly (in this example, the groups ID.x = "188473" and "188473").
I do not understand why I have this error message for the ID.x "188473" and "188474" having two non-NA values. While ID.x "9383" and "9384" does not trigger the error message but only have one non-NA value each.
df_sl <- structure(list(ID.x = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 9383, 9383, 9384, 9384,
188473, 188473, 188473, 188473, 188474, 188474, 188474, 188474
), DIST.x = c(0, 0, 0, 0, 0, 0, 0, 0, 11.515675154, 11.515675154,
11.515675154, 11.515675154, 11.515675154, 11.515675154, 11.515675154,
11.515675154, 21.823439218, 21.823439218, 21.823439218, 21.823439218,
21.823439218, 21.823439218, 21.823439218, 21.823439218, 21.988363443,
21.988363443, 21.988363443, 21.988363443, 21.988363443, 21.988363443,
21.988363443, 21.988363443, 32.961571068, 32.961571068, 32.961571068,
32.961571068, 32.961571068, 32.961571068, 32.961571068, 32.961571068,
43.934778692, 43.934778692, 43.934778692, 43.934778692, 43.934778692,
43.934778692, 43.934778692, 43.934778692, 0, 0, 13.891845289,
13.891845289, 0, 0, 0, 0, 0, 0, 0, 0), ID.y = c(1, 2, 3, 4, 5,
6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2,
3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,
8, 9383, 9384, 9383, 9384, 188473, 188474, 188475, 188476, 188473,
188474, 188475, 188476), LimAm = c(-375, -363.484324846, -353.176560782,
-353.011636557, -342.038428932, -331.065221308, -330.991532192,
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557,
-342.038428932, -331.065221308, -330.991532192, -320.018324568,
-375, -363.484324846, -353.176560782, -353.011636557, -342.038428932,
-331.065221308, -330.991532192, -320.018324568, -375, -363.484324846,
-353.176560782, -353.011636557, -342.038428932, -331.065221308,
-330.991532192, -320.018324568, -375, -363.484324846, -353.176560782,
-353.011636557, -342.038428932, -331.065221308, -330.991532192,
-320.018324568, -375, -363.484324846, -353.176560782, -353.011636557,
-342.038428932, -331.065221308, -330.991532192, -320.018324568,
-375, -361.108154711, -375, -361.108154711, -375, -375, -375,
-362.193750234, -375, -375, -375, -362.193750234), LimAv = c(375,
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692,
419.008467808, 429.981675432, 375, 386.515675154, 396.823439218,
396.988363443, 407.961571068, 418.934778692, 419.008467808, 429.981675432,
375, 386.515675154, 396.823439218, 396.988363443, 407.961571068,
418.934778692, 419.008467808, 429.981675432, 375, 386.515675154,
396.823439218, 396.988363443, 407.961571068, 418.934778692, 419.008467808,
429.981675432, 375, 386.515675154, 396.823439218, 396.988363443,
407.961571068, 418.934778692, 419.008467808, 429.981675432, 375,
386.515675154, 396.823439218, 396.988363443, 407.961571068, 418.934778692,
419.008467808, 429.981675432, 375, 388.891845289, 375, 388.891845289,
375, 375, 375, 387.806249766, 375, 375, 375, 387.806249766),
DIST.y = c(0, 11.515675154, 21.823439218, 21.988363443, 32.961571068,
43.934778692, 44.008467808, 54.981675432, 0, 11.515675154,
21.823439218, 21.988363443, 32.961571068, 43.934778692, 44.008467808,
54.981675432, 0, 11.515675154, 21.823439218, 21.988363443,
32.961571068, 43.934778692, 44.008467808, 54.981675432, 0,
11.515675154, 21.823439218, 21.988363443, 32.961571068, 43.934778692,
44.008467808, 54.981675432, 0, 11.515675154, 21.823439218,
21.988363443, 32.961571068, 43.934778692, 44.008467808, 54.981675432,
0, 11.515675154, 21.823439218, 21.988363443, 32.961571068,
43.934778692, 44.008467808, 54.981675432, 0, 13.891845289,
0, 13.891845289, 0, 0, 0, 12.806249766, 0, 0, 0, 12.806249766
), Z = c(193.07513428, 193.15454102, 192.17289734, 192.17289734,
190.82974243, 190.63618469, 190.63618469, 189.45043945, 193.07513428,
193.15454102, 192.17289734, 192.17289734, 190.82974243, 190.63618469,
190.63618469, 189.45043945, 193.07513428, 193.15454102, 192.17289734,
192.17289734, 190.82974243, 190.63618469, 190.63618469, 189.45043945,
193.07513428, 193.15454102, 192.17289734, 192.17289734, 190.82974243,
190.63618469, 190.63618469, 189.45043945, 193.07513428, 193.15454102,
192.17289734, 192.17289734, 190.82974243, 190.63618469, 190.63618469,
189.45043945, 193.07513428, 193.15454102, 192.17289734, 192.17289734,
190.82974243, 190.63618469, 190.63618469, 189.45043945, 353.19342041,
353.02838135, 353.19342041, 353.02838135, 344.16003418, 344.16003418,
344.16003418, 344.1892395, 344.16003418, 344.16003418, 344.16003418,
344.1892395), ma_Z = c(193.07513428, NA, 192.800857546667,
NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428,
NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875,
NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325,
NA, 191.0687522875, NA, 193.07513428, NA, 192.800857546667,
NA, 192.0825195325, NA, 191.0687522875, NA, 193.07513428,
NA, 192.800857546667, NA, 192.0825195325, NA, 191.0687522875,
NA, 193.07513428, NA, 192.800857546667, NA, 192.0825195325,
NA, 191.0687522875, NA, 353.19342041, NA, 353.19342041, NA,
344.16003418, NA, 344.16003418, NA, 344.16003418, NA, 344.16003418,
NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(ID.x = c(1,
2, 3, 4, 5, 6, 9383, 9384, 188473, 188474), .rows = structure(list(
1:8, 9:16, 17:24, 25:32, 33:40, 41:48, 49:50, 51:52, 53:56,
57:60), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L), .drop = TRUE))
df_sl %>% group_by(ID.x) %>% mutate(ma_Z = zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2))
And the error that I get :
Error in `mutate()`:
ℹ In argument: `ma_Z = ifelse(...)`.
ℹ In group 9: `ID.x = 188473`.
Caused by error in `approx()`:
! need at least two non-NA values to interpolate
I still need the groups with NA values for the linear regression (it works even if there is only one non-NA values). Also the full dataset is composed of millions of observations. I cannot allow myself to check all of it.
I had the idea to use a ifelse() function to bypass the interpolation if the amount of non-NA is lower than 2. However the error is the same. If I rise the threshold for the condition (>2), there is no errors but it gives an unique value for all observations of each group (No interpolation then)...
df_sl %>%
group_by(ID.x) %>%
mutate(ma_Z = ifelse(
length(na.omit(ma_Z)) > 1,
zoo::na.approx(ma_Z, DIST.y, na.rm = FALSE, rule = 2),
ma_Z)
)
How could I fix the error for the whole dataset and obtain a proper interpolation in the ma_Z column?
Is this what you want?