Count data models: PhD publications

# data from Long (1997)
# see http://data.princeton.edu/wws509/stata/overdispersion.html for Stata analysis

a basic histogram of articles published

hist(PhdPubs$articles, breaks=0:19, col="pink", xlim=c(0,20),
     xlab="Number of Articles")

Fit the poisson model

phd.pois <- glm(articles ~ ., data=PhdPubs, family=poisson)
library(car) # Companion to Applied Regression # Companion to Applied Regression
Anova(phd.pois)

## Analysis of Deviance Table (Type II tests)
## 
## Response: articles
##             LR Chisq Df Pr(>Chisq)    
## female          17.1  1    3.6e-05 ***
## married          6.6  1       0.01 *  
## kid5            22.1  1    2.6e-06 ***
## phdprestige      1.0  1       0.32    
## mentor         126.8  1    < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

interpreting coefficients

round(cbind(beta = coef(phd.pois),
            expbeta = exp(coef(phd.pois)),
            pct = 100 * (exp(coef(phd.pois)) - 1)), 3)

##               beta expbeta    pct
## (Intercept)  0.266   1.304  30.43
## female      -0.224   0.799 -20.10
## married      0.157   1.170  17.04
## kid5        -0.185   0.831 -16.88
## phdprestige  0.025   1.026   2.57
## mentor       0.025   1.026   2.56

check for interactions

phd.pois1 <- update(phd.pois, . ~ .^2)
Anova(phd.pois1)

## Analysis of Deviance Table (Type II tests)
## 
## Response: articles
##                     LR Chisq Df Pr(>Chisq)    
## female                  14.5  1    0.00014 ***
## married                  6.2  1    0.01277 *  
## kid5                    19.5  1    9.8e-06 ***
## phdprestige              1.0  1    0.32655    
## mentor                 128.1  1    < 2e-16 ***
## female:married           0.3  1    0.60995    
## female:kid5              0.1  1    0.72929    
## female:phdprestige       0.2  1    0.63574    
## female:mentor            0.0  1    0.91260    
## married:kid5                  0               
## married:phdprestige      1.7  1    0.19153    
## married:mentor           1.2  1    0.28203    
## kid5:phdprestige         0.2  1    0.68523    
## kid5:mentor              2.8  1    0.09290 .  
## phdprestige:mentor       3.8  1    0.05094 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(phd.pois, phd.pois1, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: articles ~ female + married + kid5 + phdprestige + mentor
## Model 2: articles ~ female + married + kid5 + phdprestige + mentor + female:married + 
##     female:kid5 + female:phdprestige + female:mentor + married:kid5 + 
##     married:phdprestige + married:mentor + kid5:phdprestige + 
##     kid5:mentor + phdprestige:mentor
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       909       1634                       
## 2       900       1618  9     15.2    0.086 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

LRstats(phd.pois, phd.pois1)

## Likelihood summary table:
##            AIC  BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois  3313 3342     1634 909     <2e-16 ***
## phd.pois1 3316 3388     1618 900     <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## model diagnostics

op <- par(mfrow=c(1,2), mar=c(4,4,2,1)+.1)
plot(phd.pois, which=c(1,5))

par(op)

### component-plus-residual plot for non-linearity

crPlot(phd.pois, "mentor", pch=16, lwd=4, id = list(n=2))

## [1] 913 911 328 803

better version

op <- par(mar=c(4,4,2,1)+.1)
crPlot(phd.pois, "mentor", pch=16, lwd=4, id = list(n=2), 
    cex.lab=1.4, xlab="mentor publications" )

## [1] 913 911 328 803

text(70, 2.5, paste("b =",round( coef(phd.pois)["mentor"], 3)), col="red", cex=1.5)

par(op)

### Influence plot

influencePlot(phd.pois, id = list(n=2))

##     StudRes     Hat  CookD
## 328   -3.73 0.19336 0.3676
## 803   -1.03 0.08696 0.0149
## 913    5.38 0.00356 0.0434
## 914    5.54 0.01000 0.1098
## 915    5.15 0.03550 0.2833

Effect plots

library(effects) # Effect Displays for Linear, Generalized Linear, and Other Models
plot(allEffects(phd.pois))

Overdispersion

mean and variance

with(PhdPubs, c(mean=mean(articles), 
                var=var(articles), 
                ratio=var(articles)/mean(articles)))

##  mean   var ratio 
##  1.69  3.71  2.19

estimate of phi = dispersion parameter

with(phd.pois, deviance/df.residual)

## [1] 1.8

sum(residuals(phd.pois, type = "pearson")^2)/phd.pois$df.residual

## [1] 1.83

Fit the quasi-poisson model

phd.qpois <- glm(articles ~ ., data=PhdPubs, family=quasipoisson)

Fit the negative-binomial model

phd.nbin  <- glm.nb(articles ~ ., data=PhdPubs)

Estimate of theta

summary(phd.nbin)$theta

## [1] 2.27

## compare models

LRstats(phd.pois, phd.qpois, phd.nbin)

## Likelihood summary table:
##            AIC  BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois  3313 3342     1634 909     <2e-16 ***
## phd.qpois                    909               
## phd.nbin  3135 3169     1004 909      0.015 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## compare standard errors

phd.SE <- sqrt(cbind(
  pois = diag(vcov(phd.pois)),
  qpois = diag(vcov(phd.qpois)),
  nbin = diag(vcov(phd.nbin))))
round(phd.SE, 4)

##               pois  qpois   nbin
## (Intercept) 0.0996 0.1348 0.1327
## female      0.0546 0.0738 0.0726
## married     0.0613 0.0829 0.0819
## kid5        0.0401 0.0543 0.0528
## phdprestige 0.0253 0.0342 0.0343
## mentor      0.0020 0.0027 0.0032

phd.coef <- cbind(
  pois = coef(phd.pois),
  qpois = coef(phd.qpois),
  nbin = coef(phd.nbin))
round(phd.coef,3)

##               pois  qpois   nbin
## (Intercept)  0.266  0.266  0.213
## female      -0.224 -0.224 -0.216
## married      0.157  0.157  0.153
## kid5        -0.185 -0.185 -0.176
## phdprestige  0.025  0.025  0.029
## mentor       0.025  0.025  0.029

round(cbind(beta = coef(phd.nbin),
            expbeta = exp(coef(phd.nbin)),
            pct = 100 * (exp(coef(phd.nbin)) - 1)), 3)

##               beta expbeta    pct
## (Intercept)  0.213   1.237  23.73
## female      -0.216   0.806 -19.45
## married      0.153   1.165  16.51
## kid5        -0.176   0.838 -16.17
## phdprestige  0.029   1.030   2.98
## mentor       0.029   1.029   2.91

zero-inflated and hurdle models

if (!require(countreg)) install.packages("countreg", repos="http://R-Forge.R-project.org")
# library(countreg) # Count Data Regression

plot zero articles

plot(factor(articles==0) ~ mentor, data=PhdPubs, 
    ylevels=2:1, ylab="Zero articles", 
    breaks=quantile(mentor, probs=seq(0,1,.2)), cex.lab=1.25)

Fit ZI and hurdle models

phd.zip <- zeroinfl(articles ~ ., data=PhdPubs, dist="poisson")
phd.znb <- zeroinfl(articles ~ ., data=PhdPubs, dist="negbin")

phd.hp  <- hurdle(articles ~ ., data=PhdPubs, dist="poisson")
phd.hnb <- hurdle(articles ~ ., data=PhdPubs, dist="negbin")

show coefficients

phd.zip

## 
## Call:
## zeroinfl(formula = articles ~ ., data = PhdPubs, dist = "poisson")
## 
## Count model coefficients (poisson with log link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##      0.5992      -0.2088       0.1062      -0.1427       0.0070       0.0178  
## 
## Zero-inflation model coefficients (binomial with logit link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##    -0.56332      0.10816     -0.35558      0.21974     -0.00537     -0.13313

phd.hp

## 
## Call:
## hurdle(formula = articles ~ ., data = PhdPubs, dist = "poisson")
## 
## Count model coefficients (truncated poisson with log link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##     0.63938     -0.22824      0.09899     -0.14173     -0.00279      0.01859  
## 
## Zero hurdle model coefficients (binomial with logit link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##      0.1784      -0.2508       0.3277      -0.2854       0.0428       0.0792

compare models

LRstats(phd.pois, phd.nbin, phd.zip, phd.znb, phd.hp, phd.hnb, 
        sortby="BIC")

## Likelihood summary table:
##           AIC  BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois 3313 3342     3301 909     <2e-16 ***
## phd.hp   3235 3292     3211 903     <2e-16 ***
## phd.zip  3234 3291     3210 903     <2e-16 ***
## phd.hnb  3131 3194     3105 902     <2e-16 ***
## phd.znb  3126 3188     3100 902     <2e-16 ***
## phd.nbin 3135 3169     3121 909     <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

examine the ZIP model in more detail

NB: only mentor is important in the zero model

coeftest(phd.zip)

## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)  0.59918    0.11861    5.05  5.3e-07 ***
## count_female      -0.20879    0.06353   -3.29   0.0011 ** 
## count_married      0.10623    0.07097    1.50   0.1348    
## count_kid5        -0.14271    0.04744   -3.01   0.0027 ** 
## count_phdprestige  0.00700    0.02981    0.23   0.8145    
## count_mentor       0.01785    0.00233    7.65  5.3e-14 ***
## zero_(Intercept)  -0.56332    0.49405   -1.14   0.2545    
## zero_female        0.10816    0.28173    0.38   0.7011    
## zero_married      -0.35558    0.31796   -1.12   0.2637    
## zero_kid5          0.21974    0.19658    1.12   0.2639    
## zero_phdprestige  -0.00537    0.14118   -0.04   0.9697    
## zero_mentor       -0.13313    0.04643   -2.87   0.0042 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(phd.znb)

## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value Pr(>|t|)    
## count_(Intercept)   0.3655     0.1399    2.61   0.0091 ** 
## count_female       -0.1946     0.0758   -2.57   0.0104 *  
## count_married       0.1015     0.0843    1.20   0.2291    
## count_kid5         -0.1516     0.0542   -2.80   0.0053 ** 
## count_phdprestige   0.0156     0.0345    0.45   0.6511    
## count_mentor        0.0244     0.0035    6.97  5.9e-12 ***
## zero_(Intercept)   -0.2943     1.2827   -0.23   0.8186    
## zero_female         0.6580     0.8557    0.77   0.4421    
## zero_married       -1.4741     0.9176   -1.61   0.1085    
## zero_kid5           0.6241     0.4410    1.42   0.1574    
## zero_phdprestige   -0.0113     0.2888   -0.04   0.9689    
## zero_mentor        -0.8765     0.3152   -2.78   0.0055 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

phd.zip1 <- zeroinfl(articles ~ .| mentor, data=PhdPubs, dist="poisson")
phd.znb1 <- zeroinfl(articles ~ .| mentor, data=PhdPubs, dist="negbin")

summary(phd.zip1)

## 
## Call:
## zeroinfl(formula = articles ~ . | mentor, data = PhdPubs, dist = "poisson")
## 
## Pearson residuals:
##    Min     1Q Median     3Q    Max 
## -2.295 -0.871 -0.262  0.540  7.243 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.58819    0.10996    5.35  8.8e-08 ***
## female      -0.21791    0.05879   -3.71  0.00021 ***
## married      0.13597    0.06602    2.06  0.03945 *  
## kid5        -0.16262    0.04338   -3.75  0.00018 ***
## phdprestige  0.00671    0.02729    0.25  0.80574    
## mentor       0.01804    0.00229    7.88  3.3e-15 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6865     0.2059   -3.33  0.00086 ***
## mentor       -0.1303     0.0404   -3.23  0.00125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 16 
## Log-likelihood: -1.61e+03 on 8 Df

LRstats(phd.pois, phd.nbin, phd.zip, phd.znb, phd.hp, phd.hnb, phd.zip1, phd.znb1, 
    sortby="BIC")

## Likelihood summary table:
##           AIC  BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois 3313 3342     3301 909     <2e-16 ***
## phd.hp   3235 3292     3211 903     <2e-16 ***
## phd.zip  3234 3291     3210 903     <2e-16 ***
## phd.zip1 3227 3266     3211 907     <2e-16 ***
## phd.hnb  3131 3194     3105 902     <2e-16 ***
## phd.znb  3126 3188     3100 902     <2e-16 ***
## phd.nbin 3135 3169     3121 909     <2e-16 ***
## phd.znb1 3124 3168     3106 906     <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

effect plot for important terms in the NBIN model

plot(allEffects(phd.nbin)[c(1,3,5)], rows=1, cols=3)

plot the zero model

phd.zero <- glm((articles==0) ~ mentor, data=PhdPubs, family=binomial)
plot(allEffects(phd.zero), main="Mentor effect on not publishing")