Inaccurate, Random Probabilities in Raster::Predict Cforest calculation with factor variables using type="prob"

104 views Asked by At

I am having an issue with the raster::predict function related generating probability predictions of Presence with a cforest object.

When I generate a classification output, I obtain a correct output - however, when adding type="prob" to obtain probabilities, the output is a strange banding that doesn't correspond to the classification output. I have attached two photos here: Correct output based on classification predict and Banded, incorrect output based on type="prob" predict that details these issues.

As you will notice, the checkerboard pattern on the output places random areas of high and low probability randomly, whereas the classification output creates blocks of presence (1) and absence (0) where they would be expected.

Below is a snippit of code, with training data and a segment of a raster brick used for prediction.

I would greatly appreciate if anyone has insight into how I can obtain valid probabilities using this function.

Am I missing something important in the predfun function?

Thank you in advance!

Here is the example data:

library(raster)
training = structure(list(
 Presence = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"), 
 Elevation = c(3937.63541666689, 384.003472222217, 401.357638888879, 226.43749999999, 21.3055555555572, 305.399305555546, 38.3402777777742, 347.302083333335, 168.156250000001, 700.708333333328, 1034.2013888889, 1033.78125, 1426.99305555577, 912.874999999952, 665.854166666672, 657.187499999983, 1181.97916666667, 696.062499999948, 976.812500000002, 1017.98263888889), 
 Region = structure(c(3L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 4L, 4L, 4L, 3L, 3L, 1L, 4L, 2L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
 Protected = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("0","1"), class = "factor"), 
 Population = structure(c(17L, 3L, 4L, 1L, 13L, 2L, 5L, 2L, 9L, 13L, 6L, 5L, 5L, 7L, 8L, 14L, 1L, 13L, 7L, 1L), .Label = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"), class = "factor"), 
 Mean_Diunal_Range = c(126,151, 152, 153, 138, 125, 137, 158, 129, 137, 170, 172, 115,151, 150, 178, 149, 146, 158, 165), 
 Isothermality = c(31,49, 46, 49, 54, 51, 59, 48, 47, 62, 54, 54, 61, 37, 47, 63,61, 75, 55, 57), 
 Mean_Temp_Wettest_Q = c(-87, 338, 348, 135,236, 193, 305, 322, 247, 253, 249, 247, 210, 105, 313, 252,238, 267, 250, 255), 
 Mean_Temp_Driest_Q = c(64, 231, 253,230, 324, 194, 263, 261, 328, 236, 133, 130, 158, 315, 188,232, 173, 248, 166, 151), 
 Precip_Wettest_M = c(119, 30, 18,3, 16, 8, 22, 12, 11, 165, 62, 59, 71, 13, 11, 201, 85, 110,121,91), 
 Precip_Dryest_m = c(2, 0, 0, 0, 0, 0, 0, 0, 0,0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0), 
 Precip_Seasonality = c(81,133, 136, 65, 106, 65, 126, 85, 101, 110, 77, 76, 111, 85,111, 125, 116, 123, 106, 93), 
 Precip_Warmest_Q = c(19, 55,29, 0, 0, 6, 49, 23, 1, 152, 148, 146, 56, 0, 15, 143, 63,140, 174, 195), 
 Precip_Coldest_Q = c(201, 0, 1, 8, 14, 14,5, 6, 17, 0, 9, 10, 0, 29, 0, 0, 2, 0, 1, 2), 
 Agriculture_Date = c(9000,3000, 3000, 7200, 7600, 3500, 3500, 3500, 7600, 5000, 1000,1700, 1250, 9500, 4000, 2700, 1250, 4000, 1000, 1000), 
 distance_to_highways = c(0.697535828668298,3.21145962385929, 4.70553674201054, 1.26025341029057, 0.0306302671744576,1.29795751420045, 0.341335382947427, 2.01515485126633, 1.90935593837289,1.00778111672525, 0.0841242925270876, 0.0135315745860043,2.39038097221274, 1.49284290327056, 1.21019159581485, 5.71817373942967,1.64045219527117, 0.121375728043842, 0.535675418474612, 1.08581690073317)), 
row.names = c(NA, -20L), class = c("data.table", "data.frame"))

##Create Prediction Raster Brick

## Create Data Frame for Raster Brick
data_segment <- structure(list(
 Elevation = c(1187.9, 1173.5, 1158.2, 1143.3, 1125.6, 1112.4, 1232.7, 1203.3, 1176.7, 1156.8, 1138.9, 1140.8, 1249.9, 1216.9, 1193.2, 1171.4, 1153.5, 1157.6, 1261.5, 1233.2, 1208.2, 1185.6, 1175.5, 1184.2, 1289.1, 1256.4, 1240.7, 1212.2, 1208.1, 1220.9, 1304.6, 1297.2, 1286.6, 1249.2, 1231, 1256.1, 1341.5, 1329.3, 1328.5, 1291.8, 1250.2, 1288.6, 1379.2, 1322.8, 1305.6, 1293.7, 1234.9, 1260.5, 1354.5, 1274.5, 1257.7, 1281.1, 1230.8, 1196.3, 1288.5, 1237.4, 1221.5, 1241.6, 1212.7, 1162.3, 1199.4, 1185.4, 1194.5, 1216.1, 1207.4, 1149.8, 1150.6, 1157, 1176, 1176.1, 1155.6, 1138, 1145, 1131.9, 1121.1, 1116.3, 1109.4, 1116.2), 
 Region = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), 
 Protected = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
 Population = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 11, 11, 12, 12, 12, 15, 11, 11, 12, 12, 12, 15, 11, 11, 12, 12, 12, 16, 11, 11, 12, 12, 12, 16, 11, 11, 12, 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12), 
 Mean_Diunal_Range = c(10.5, 10.5, 10.6, 10.7, 10.8, 10.9, 10.4, 10.5, 10.5, 10.5, 10.6, 10.6, 10.3, 10.4, 10.5, 10.5, 10.5, 10.4, 10.2, 10.3, 10.3, 10.4, 10.3, 10.4, 10.2, 10.2, 10.2, 10.2, 10.3, 10.3, 10.3, 10.2, 10.1, 10.2, 10.3, 10.3, 10.3, 10.2, 10.2, 10.2, 10.2, 10.1, 10.2, 10.3, 10.2, 10.1, 10.1, 10.1, 10.1, 10.2, 10.2, 10, 10, 10.1, 10.1, 10.1, 10, 9.9, 9.9, 10, 10, 10, 9.9, 9.9, 9.9, 9.9, 10, 9.9, 9.8, 9.9, 9.8, 9.8, 10.1, 10.1, 10, 9.8, 9.8, 9.7), 
 Isothermality = c(68.5, 68.7, 69.3, 69.4, 69.6, 69.7, 68.4, 68.5, 68.8, 68.7, 69.2, 69.2, 67.8, 68.1, 68.4, 68.6, 68.6, 68.3, 67.4, 67.4, 67.6, 67.9, 68, 68.1, 67.2, 66.8, 66.9, 67.3, 67.5, 67.3, 67.2, 66.4, 66.6, 66.9, 66.7, 67, 66.7, 66.7, 66.5, 66.4, 66.5, 66.2, 65.7, 66.3, 66, 65.7, 65.5, 65.7, 65.4, 65.1, 65.1, 65.2, 65.2, 65.3, 64.2, 64.5, 64.5, 64.1, 64.6, 64.7, 63.8, 64, 64.1, 63.7, 63.8, 64.2, 63.4, 63.7, 63.4, 63.6, 63.5, 63.7, 63.8, 64.1, 63.5, 63, 63.2, 63.1), 
 Mean_Temp_Wettest_Q = c(24.5, 24.5, 24.7, 24.8, 25, 25, 24.2, 24.4, 24.5, 24.7, 24.8, 24.8, 24.1, 24.3, 24.5, 24.6, 24.7, 24.8, 24, 24.2, 24.4, 24.5, 24.6, 24.7, 24, 24.1, 24.2, 24.4, 24.5, 24.5, 23.9, 24, 24, 24.3, 24.5, 24.4, 23.8, 23.8, 23.9, 24, 24.3, 24.1, 23.5, 23.9, 24, 24, 24.3, 24.3, 23.7, 24.1, 24.2, 24.1, 24.4, 24.6, 24, 24.4, 24.4, 24.3, 24.5, 24.8, 24.5, 24.6, 24.5, 24.5, 24.5, 24.9, 24.8, 24.8, 24.6, 24.7, 24.8, 24.9, 24.8, 25, 25, 25, 25, 25), 
 Mean_Temp_Driest_Q = c(22.4, 22.4, 22.5, 22.7, 22.8, 22.9, 22.1, 22.3, 22.4, 22.5, 22.7, 22.7, 22, 22.2, 22.4, 22.5, 22.6, 22.6, 21.9, 22.1, 22.3, 22.4, 22.5, 22.5, 21.8, 22, 22, 22.2, 22.3, 22.3, 21.7, 21.7, 21.8, 22.1, 22.2, 22.1, 21.5, 21.5, 21.6, 21.8, 22, 21.8, 21.2, 21.6, 21.7, 21.7, 22, 21.9, 21.3, 21.8, 21.9, 21.8, 22, 22.2, 21.6, 21.9, 22, 21.9, 22, 22.3, 22, 22.1, 22, 22, 22, 22.4, 22.2, 22.2, 22.1, 22.2, 22.2, 22.4, 22.3, 22.8, 22.4, 22.5, 22.5, 22.5),
 Precip_Wettest_M = c(161, 160, 161, 161, 159, 158, 163, 162, 163, 163, 162, 161, 166, 165, 164, 164, 163, 163, 169, 167, 166, 166, 166, 166, 172, 169, 169, 168, 169, 169, 173, 172, 173, 171, 170, 170, 174, 176, 176, 174, 171, 173, 179, 175, 175, 175, 171, 173, 180, 176, 174, 176, 172, 171, 181, 177, 176, 177, 174, 170, 177, 176, 176, 177, 175, 170, 175, 175, 177, 178, 175, 174, 176, 175, 175, 175, 173, 174), 
 Precip_Dryest_m = c(6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7), 
 Precip_Seasonality = c(100.8, 101.7, 103, 105.2, 105.9, 107.2, 99.8, 101.8, 103.4, 105.5, 106.3, 106.8, 101.1, 101.8, 103.4, 105.1, 106.2, 106.8, 100.8, 102, 103.1, 105.3, 106.6, 107.6, 101.4, 102.5, 103.8, 105.3, 106.9, 107.2, 100.8, 102.2, 102.7, 104.9, 106.3, 106.6, 100.1, 101.3, 102.6, 104.1, 105.7, 106.3, 98.8, 101.3, 101.9, 103.6, 104.8, 105.8, 99.2, 102, 102.6, 103.5, 104.5, 106.4, 100.1, 102.4, 103.8, 104.2, 105, 106.8, 100.7, 103, 103.6, 104.1, 104.7, 105.8, 101.2, 102.2, 103.9, 104.6, 105.6, 106.7, 101.3, 102.5, 103.4, 104.2, 105.5, 106.7), 
 Precip_Warmest_Q = c(97, 96, 94, 89, 86, 84, 100, 97, 94, 90, 87, 86, 100, 99, 95, 91, 88, 87, 102, 99, 96, 92, 89, 87, 102, 98, 96, 92, 91, 90, 105, 100, 98, 95, 92, 92, 106, 104, 101, 97, 93, 93, 109, 101, 101, 97, 92, 93, 108, 100, 97, 97, 93, 89, 105, 99, 96, 96, 93, 89, 101, 98, 96, 95, 94, 89, 98, 97, 96, 94, 91, 89, 99, 96, 94, 93, 90, 88), 
 Precip_Coldest_Q = c(23, 22, 21, 21, 21, 19, 25, 22, 22, 21, 21, 20, 24, 23, 21, 21, 21, 20, 25, 23, 22, 21, 21, 21, 25, 23, 22, 21, 21, 21, 25, 24, 25, 22, 21, 21, 27, 27, 25, 23, 22, 22, 30, 27, 26, 25, 24, 23, 30, 27, 26, 26, 24, 23, 30, 27, 26, 26, 25, 21, 28, 26, 26, 26, 25, 24, 28, 26, 26, 26, 25, 24, 28, 27, 26, 26, 24, 24), 
 Agriculture_Date = c(4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000, 4000), 
 distance_to_highways = c(0.1, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.2, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.1, 0, 0, 0.1, 0.1, 0.1, 0.1, 0, 0, 0, 0.1, 0.1, 0.2, 0, 0, 0, 0.1, 0.1, 0.2, 0, 0, 0, 0.1, 0.1, 0.2)), 
class = "data.frame", row.names = c(NA, -78L))

## Create RasterBrick and assign values from dataframe
segment_brick <- brick(nrow=13, ncol=6, xmn=39, xmx=39.25, ymn=3.833333, ymx=4.375,crs="+proj=longlat +datum=WGS84 +no_defs", nl=15)
values(segment_brick) <-  as.matrix(data_segment)

Now the modeling:

library(party)
##Fit Cforest Model
cforest_example = party::cforest(Presence ~ ., data = training,
            control = cforest_unbiased(ntree=10,mtry=3,trace=TRUE))


##Create factor object for raster predict
f = list(levels(training$Protected),levels(training$Region),levels(training$Population))

##Rename factor lists to match original data
names(f) <- c("Protected", "Region", "Population")

##create a wrapper function for Cforest predict in raster package
predfun <- function(m, d, ...) predict(m, newdata=d, ...)

##Predict Raster with no Probabilities - results in correct prediction
Attempt_Processed = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=predfun,progress="text",na.rm=TRUE)
plot(Attempt_Processed)

##Predict Raster with Probabilities - results in odd banded image
Attempt_Processed_prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=predfun,progress="text",na.rm=TRUE,type="prob")
plot(Attempt_Processed_prob)

Correct output based on classification predict

Banded, incorrect output based on type="prob" predict

1

There are 1 answers

0
Robert Hijmans On BEST ANSWER

The way to go about this is to have a look what the model's predict function returns

p <- predict(cforest_example, newdata=training[1:3,], OOB=TRUE, type="prob")
p
#$`1`
#     Presence.0 Presence.1
#[1,]  0.5461538  0.4538462
#
#$`2`
#     Presence.0 Presence.1
#[1,]  0.5461538  0.4538462
#
#$`3`
#     Presence.0 Presence.1
#[1,]  0.5461538  0.4538462

It returns a list, which gets unlisted by raster::predict

unlist(p)
#       11        12        21        22        31        32 
#0.5461538 0.4538462 0.5461538 0.4538462 0.5461538 0.4538462 

and that explains the banding pattern

You can fix this with a prediction function like this

pfun <- function(m, d, ...) {
    p <- predict(m, newdata=d, ...)
    matrix(unlist(p), ncol=2, byrow=TRUE)
}
    
pfun(cforest_example, training[1:3,], OOB=TRUE, type="prob")
#          [,1]      [,2]
#[1,] 0.5461538 0.4538462
#[2,] 0.5461538 0.4538462
#[3,] 0.5461538 0.4538462

Which you can now use with the RasterBrick

prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=pfun, index=1:2, na.rm=TRUE,type="prob")
plot(prob)
prob
#class      : RasterBrick 
#dimensions : 13, 6, 78, 2  (nrow, ncol, ncell, nlayers)
#resolution : 0.04166667, 0.04166669  (x, y)
#extent     : 39, 39.25, 3.833333, 4.375  (xmin, xmax, ymin, ymax)
#crs        : +proj=longlat +datum=WGS84 +no_defs 
#source     : memory
#names      :   layer.1,   layer.2 
#min values : 0.5461538, 0.4538462 
#max values : 0.5461538, 0.4538462 

As you only have 2 probabilities, you could simplify the function to return only the probability for one class (absence in this case):

pfun <- function(m, d, ...) {
    p <- predict(m, newdata=d, ...)
    matrix(unlist(p), ncol=2, byrow=TRUE)[,1]
}
prob = raster::predict(segment_brick, cforest_example, OOB=TRUE, factors=f, fun=pfun, na.rm=TRUE,type="prob")