Negative confidence interval for geom_smooth() in ggplot

59 views Asked by At

I have an experiment where participants complete a questionnaire (each participant has a sumpdi score for this) and an experiment with two types of (within-participant) condition (0 and 25).

I want to plot mean task response across sumpdi scores and conditions, but I am having trouble depicting confidence intervals.

I first aggregate the data:

 agg_forplot<- aggregate(Response ~ sumpdi+ Condition, df_forplot,
 FUN=mean)

I then create a plot in ggplot2:

Plot_C<-ggplot(agg_forplot, aes(x=sumpdi, y=Response, colour=Condition))+ geom_smooth(method='lm', se=T, fill='light blue')+geom_point()+
     ylab("Mean % 'Tone Present' Responses")+ xlab('Summary PDI Score') + labs(color= 'Condition') +
      scale_color_manual(labels = c("No tone", "25%"), values= c("#5CC0C0","#FF88FF"))+  mytheme

Plot_C

Initial Plot

The 'se' setting in geom_smooth() generates a negative value for the confidence interval around summary PDI score=0. However, it is obviously not possible to get a negative percentage.

Why does this happen and how can I correct this? I have tried adding the argument +ylim(0,1) but this removes the confidence interval completely from this section of the graph.

Edit Here is the output of dput(agg_forplot) to show my data:

structure(list(sumpdi = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 11), Condition = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L), levels = c("0", "25", "50", "75"), class = "factor"), 
    Response = c(0.0636323497219099, 0.0851942941465814, 0.0885739075300753, 
    0.0917113758765922, 0.0329839417565963, 0.0784596540124087, 
    0.102004605958778, 0.0945945945945946, 0.0762726196516151, 
    0.0422535211267606, 0.263157894736842, 0.276791358215455, 
    0.343362166227905, 0.314720315633304, 0.264764719482896, 
    0.23228019576997, 0.291160062315599, 0.238023716545795, 0.540540540540541, 
    0.36038961038961, 0.117647058823529, 0.533333333333333)), row.names = c(NA, 
-22L), class = "data.frame")

I have also since noticed that generating means by aggregating across Pt_ID prevents negative confidence interval values (see plot below), but I am unsure why this is:

agg_forplot<- aggregate(Response ~ sumpdi+ Condition+ Pt_ID, df_forplot, FUN=mean)

Edited plot with Pt_ID-aggregated data

The output for dput(agg_forplot):

structure(list(sumpdi = c(5, 5, 6, 6, 4, 4, 3, 3, 6, 6, 0, 0, 
6, 6, 0, 0, 5, 5, 0, 0, 0, 0, 6, 6, 3, 3, 2, 2, 3, 3, 3, 3, 0, 
0, 5, 5, 0, 0, 3, 3, 8, 8, 2, 2, 2, 2, 4, 4, 7, 7, 3, 3, 0, 0, 
6, 6, 0, 0, 5, 5, 3, 3, 3, 3, 1, 1, 5, 5, 6, 6, 5, 5, 4, 4, 5, 
5, 0, 0, 1, 1, 2, 2, 8, 8, 2, 2, 11, 11, 4, 4, 5, 5, 9, 9, 5, 
5, 0, 0, 1, 1, 1, 1, 0, 0), Condition = structure(c(1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L), levels = c("0", "25", "50", "75"), class = "factor"), 
    Pt_ID = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 
    6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 
    12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 
    18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 
    24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 
    30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 
    36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 
    42L, 43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 
    48L, 49L, 49L, 50L, 50L, 51L, 51L, 52L, 52L), levels = c("1006", 
    "1007", "1013", "1014", "1017", "1020", "1021", "1022", "1024", 
    "1026", "1028", "1032", "1033", "1034", "1036", "1037", "1039", 
    "1040", "1041", "1042", "1043", "1045", "1046", "1047", "1051", 
    "1053", "1055", "1056", "1057", "1058", "1059", "1060", "1061", 
    "1063", "1064", "1065", "1066", "1069", "1071", "1072", "1073", 
    "1074", "1078", "1079", "1085", "1086", "1087", "1088", "1089", 
    "1090", "1092", "1093"), class = "factor"), Response = c(0.0882352941176471, 
    0.108108108108108, 0.230769230769231, 0.25, 0.117647058823529, 
    0.303030303030303, 0.0303030303030303, 0.24390243902439, 
    0.175675675675676, 0.472222222222222, 0.115942028985507, 
    0.166666666666667, 0.0289855072463768, 0.024390243902439, 
    0.0526315789473684, 0.0882352941176471, 0.0133333333333333, 
    0.307692307692308, 0.0571428571428571, 0.189189189189189, 
    0, 0.0571428571428571, 0.114285714285714, 0.27027027027027, 
    0.0571428571428571, 0.354838709677419, 0.0294117647058824, 
    0.129032258064516, 0.0571428571428571, 0.515151515151515, 
    0.0579710144927536, 0.0294117647058823, 0.027027027027027, 
    0.846153846153846, 0.0714285714285714, 0.263157894736842, 
    0.145161290322581, 0.131578947368421, 0.150684931506849, 
    0.255813953488372, 0.174603174603175, 0.333333333333333, 
    0.222222222222222, 0.34375, 0.0422535211267606, 0.0769230769230769, 
    0.0135135135135135, 0.114285714285714, 0.0945945945945946, 
    0.540540540540541, 0.0657894736842105, 0.315789473684211, 
    0.0273972602739726, 0.266666666666667, 0.0675675675675676, 
    0.233333333333333, 0.0410958904109589, 0.142857142857143, 
    0.0138888888888889, 0.166666666666667, 0.0833333333333333, 
    0.2, 0.228571428571429, 0.205882352941176, 0.0441176470588235, 
    0.0857142857142857, 0.028169014084507, 0.375, 0.0163934426229508, 
    0.193548387096774, 0.391304347826087, 0.5, 0, 0.194444444444444, 
    0.0625, 0.333333333333333, 0.0307692307692308, 0.457142857142857, 
    0.128571428571429, 0.205128205128205, 0.0933333333333333, 
    0.473684210526316, 0, 0.378378378378378, 0.0769230769230769, 
    0.542857142857143, 0.263157894736842, 0.533333333333333, 
    0, 0.36, 0.05, 0.3125, 0.0422535211267606, 0.117647058823529, 
    0, 0.257142857142857, 0.0151515151515152, 0.235294117647059, 
    0.157894736842105, 0.696969696969697, 0.0277777777777778, 
    0.424242424242424, 0.2, 0.378378378378378)), row.names = c(NA, 
-104L), class = "data.frame")
1

There are 1 answers

2
Allan Cameron On

The essential problem you have is that you are trying to do linear regression on proportion data. This doesn't work because the errors are assumed to be gaussian and therefore the model fit will have nonsensical confidence intervals.

The best way to do this would be via logistic regression if you have the original binary data, but your data seems to come in proportion format. You could use beta regression, but AFAIK the betareg package doesn't allow confidence intervals.

One other option is to transform your response variable into a logistic scale, fit the model, get the predictions, then reverse the transform:

library(tidyverse)

df <- agg_forplot %>%
  group_by(sumpdi, Condition) %>%
  summarize(Response = mean(Response), .groups = 'drop')

mod <- lm(Response ~ sumpdi*Condition, within(df, Response <- qlogis(Response)))
pred_df <- data.frame(sumpdi = rep(seq(0, 11, length = 200), 2),
                      Condition = rep(c('0', '25'), each = 200))

preds <- predict(mod, pred_df, se.fit = TRUE)

pred_df$Response <- plogis(preds$fit)
pred_df$upper <- plogis(preds$fit + 1.96 * preds$se.fit)
pred_df$lower <- plogis(preds$fit - 1.96 * preds$se.fit)

ggplot(df, aes(sumpdi, Response, colour = Condition, group = Condition)) + 
  geom_ribbon(data = pred_df, aes(ymax = upper, ymin = lower,
                                  fill = after_scale(alpha(colour, 0.2))),
              linetype = 0) +
  geom_line(data = pred_df) +
  geom_point() +
  xlab('Summary PDI Score') + 
  scale_color_manual(labels = c("No tone", "25%"), 
                     values= c("orangered","deepskyblue4")) + 
  scale_y_continuous("Mean % 'Tone Present' Responses", limits = c(0, 0.6)) +
  theme_classic()

enter image description here