#install.packages("remotes")
#remotes::install_github("mbrussell/stats4nr")
library(stats4nr)
library(tidyverse)
library(patchwork)
data(fishing)
summary(fishing)
#> nofish livebait camper persons
#> Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
#> 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:2.000
#> Median :0.000 Median :1.000 Median :1.000 Median :2.000
#> Mean :0.296 Mean :0.864 Mean :0.588 Mean :2.528
#> 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:4.000
#> Max. :1.000 Max. :1.000 Max. :1.000 Max. :4.000
#> child count
#> Min. :0.000 Min. : 0.000
#> 1st Qu.:0.000 1st Qu.: 0.000
#> Median :0.000 Median : 0.000
#> Mean :0.684 Mean : 3.296
#> 3rd Qu.:1.000 3rd Qu.: 2.000
#> Max. :3.000 Max. :149.000
glm1<-glm(count~persons + child + camper,family = "poisson", data = fishing)
ModelOutputs<-data.frame(Fitted=fitted(glm1),
Residuals=resid(glm1))
p3<-ggplot(ModelOutputs)+
geom_point(aes(x=Fitted,y=Residuals))+
theme_classic()+
labs(y="Residuals",x="Fitted Values")
p4<-ggplot(ModelOutputs) +
stat_qq(aes(sample=Residuals))+
stat_qq_line(aes(sample=Residuals))+
theme_classic()+
labs(y="Sample Quartiles",x="Theoretical Quartiles")
p3+p4
Zero Inflated GLMs
Overcoming Zero-Inflated Data with GLMs
So in the common issues with GLMs tutorial we talked about zero inflated data and how this type of data will cause a issues, often diagnosed by residuals plots. Here we will have some examples of discovering zero inflated data then how to model more appropriately using zero inflated distributions often called ZIP models. But first a little bit of theory and thought.
Cause of the Zeros?
As we have mentioned throughout the tutorials, the cause of data allows us best to model it and discover or describe natural phenomena. This structure data allows us to effectively create models that capture the causal links between factors. Likewise, with high levels of zeros we want to know why there are lots of zeros? Are the Zeros True Zeros? or are they false zeros?
A false zero is often called non-detection, as with the experimental fish example, if we have a net with holes bigger than a certain size of fish we will get lots of zeros for that fish, even if they were in the net but just swam through before we counted the sample.
A true zero would be there were non of that fish there. This is almost impossible to know for sure but if we suspect the zeros are true zeros (we didn’t use a net, we used a bucket and nothing could escape if it was there) then we can model those zeros as well as the counts we do get in a very similar way.
We need to decide if we want to model the process of the zeros occuring or not. This will be case specific.
Zero Inflated Poisson Data
What better way to explore poisson models than with fish data. We will use the remotes package to install a package from github called stats4nr. Within this package there is a fishing data set of fish caught in counts with livebait, whether they came in a camper, number of persons in the group and number of children in the group.
As expected this isn’t very good at all, with much of the residuals all around zero, maybe there are too many zeros? Lets look at the distribution of count data.
ggplot(fishing)+
geom_bar(aes(count),fill="darkcyan")+
labs(x="Count of Fish Caught", y="Count")+
theme_classic()
Well there are a lot of zeros there! But remember poisson will break down when the variance isn’t proportional to the expected value, so we cal look at the variance and the mean and see how similar they are.
Definitely not. This shows extreme over dispersion. We have discussed using a Negative Binomial model for this situation. But as the name of tutorial shows we are looking at Zero-Inflated models. But lets try the negative binomial anyway..
library(MASS)
glm2<- glm.nb(count ~ persons + child + camper,
data = fishing)
ModelOutputs<-data.frame(Fitted=fitted(glm2),
Residuals=resid(glm2))
p3<-ggplot(ModelOutputs)+
geom_point(aes(x=Fitted,y=Residuals))+
theme_classic()+
labs(y="Residuals",x="Fitted Values")
p4<-ggplot(ModelOutputs) +
stat_qq(aes(sample=Residuals))+
stat_qq_line(aes(sample=Residuals))+
theme_classic()+
labs(y="Sample Quartiles",x="Theoretical Quartiles")
p3+p4
It does go some of the way, but there are still issues around the zero values in the Homogeneity of variance plot! Okay lets get on task then, we will use the zeroinfl() function from the pscl package. We will need to extract the fitted and residuals to plot ourselves as the check_model() function doesn’t support this glm type yet.
#install.packages("pscl")
library(pscl)
glm3 <- zeroinfl(count ~ persons + child + camper| persons+child+camper,
data = fishing, dist = "poisson")
ModelOutputs<-data.frame(Fitted=fitted(glm3),
Residuals=resid(glm3))
p1<-ggplot(ModelOutputs)+
geom_point(aes(x=Fitted,y=Residuals))+
theme_classic()+
labs(y="Residuals",x="Fitted Values")
p2<-ggplot(ModelOutputs) +
stat_qq(aes(sample=Residuals))+
stat_qq_line(aes(sample=Residuals))+
theme_classic()+
labs(y="Sample Quartiles",x="Theoretical Quartiles")
p1+p2
This is definitely still not great.
Zero Inflated Simulated Data
For a full example we will actually make up some data. Lets pretend we have count of fish inside and outside of a Marine Protected Area (MPA). We will also pretend our MPA is unbelieveablly good at providing protection to the species of fish we are looking at. But we have some unknown situation that means that sometimes we have high levels of zero records.
We will first make our factors, Time and MPA, then we will create poisson data where lambda is influenced by both the MPA and Time factors. Lets visualise each factor so we can see how we have created the data and what our data look like. Don’t forget we still need to add in the zero-inflation. We will create lambda from a linear equation using b0, b1, b2 and b3 as the intercept and the effects of Time, MPA, and interaction of Time and MPA.
library(patchwork)
n <- 5000
MPA <- sample(c(0,1), size = n, replace = TRUE)
Time <- c(1:100)
b0<-log(2) # Intercept
b1<-log(1.1) # Effect of Time
b2<-log(1.2) # Effect of MPA
b3<-log(1.3) # Effect of interaction between Time and MPA
lambda<-exp(b0 + b1 * log(Time) + b2 * (MPA==1) + b3 * (MPA==1) * log(Time))
y_sim <- rpois(n = n, lambda = lambda)
df<-data.frame(MPA=as.factor(MPA),
Time=Time,
Count=y_sim)
p6<-ggplot(df)+
geom_bar(aes(x=MPA),
fill="darkcyan")+
theme_classic()+
labs(y="Count",x="MPA")
p7<-ggplot(df)+
geom_bar(aes(x=Time),
fill="darkcyan")+
theme_classic()+
labs(y="Count",x="Time")
p8<-ggplot(df)+
geom_bar(aes(x=Count),
fill="darkcyan")+
theme_classic()+
labs(y="Count",x="Fish Count")
p6+p7+p8
ggplot(df)+
geom_point(aes(x=Time,y=Count,colour=MPA))+
theme_classic()+
scale_colour_manual(values=c("darkgoldenrod","darkcyan"))+
labs(y="Fish Count",x="Time")
Now lets create a high number of zeros, which are going to more likely to occur (with a probability of 0.1) if it is inside the MPA but not difference with time.
df1<-df %>%
mutate(Count_Zeros=Count*rbinom(MPA, size = 1, prob=0.7))
ggplot(df1)+
geom_point(aes(x=Time,y=Count_Zeros,colour=MPA))+
theme_classic()+
scale_colour_manual(values=c("darkgoldenrod","darkcyan"))+
labs(y="Fish Count",x="Time")
Now lets try model this.
glm4 <- glm(Count_Zeros ~ MPA*Time,
data = df1, family = "poisson")
ModelOutputs<-data.frame(Fitted=fitted(glm4),
Residuals=resid(glm4))
p1<-ggplot(ModelOutputs)+
geom_point(aes(x=Fitted,y=Residuals))+
theme_classic()+
labs(y="Residuals",x="Fitted Values")
p2<-ggplot(ModelOutputs) +
stat_qq(aes(sample=Residuals))+
stat_qq_line(aes(sample=Residuals))+
theme_classic()+
labs(y="Sample Quartiles",x="Theoretical Quartiles")
p1+p2
Well this model looks pretty bad, not surprisingly.
glm5 <- zeroinfl(Count_Zeros ~ MPA*Time|MPA,
data = df1, dist = "poisson")
ModelOutputs<-data.frame(Fitted=fitted(glm5),
Residuals=resid(glm5))
p1<-ggplot(ModelOutputs)+
geom_point(aes(x=Fitted,y=Residuals))+
theme_classic()+
labs(y="Residuals",x="Fitted Values")
p2<-ggplot(ModelOutputs) +
stat_qq(aes(sample=Residuals))+
stat_qq_line(aes(sample=Residuals))+
theme_classic()+
labs(y="Sample Quartiles",x="Theoretical Quartiles")
p1+p2
This looks better, again not perfect but pretty good. A more comprehensive example will come when we do GLMMs.