Modelos de Regressão para Dados de Contagem com o R

github.com/leg-ufpr/MRDCr

Função Densidade

Se uma variável aleatória \(Y\) tem distribuição de probabilidades Poisson generalizada, então sua função de probabilidade é

\[ f(y) = \begin{cases} \theta (\theta + \gamma y)^{y - 1} \exp\{-(\theta + \gamma y)\}, & y = 0, 1, 2, \ldots \\ 0, & y > m \text{ quando } \gamma < 0 \end{cases} \]

# Função densidade na parametrização original.
dpg0 <- function(y, theta, gamma, m = 4) {
    if (gamma < 0) {
        m <- max(c(m, floor(-theta/gamma)))
        if (gamma < max(c(-1, -theta/m))) {
            m <- 0
        } else {
            m <- as.integer(y <= m)
        }
    } else {
        m <- 1
    }
    z <- theta + gamma * y
    k <- exp(lfactorial(y))
    fy <- m * theta * z^(y - 1) * exp(-z)/k
    return(fy)
}

# Caso Poisson (gamma == 0).
y <- 0:30
theta <- 10
gamma <- 0

fy <- dpg0(y = y, theta = theta, gamma = gamma)
plot(fy ~ y, type = "h")
lines(y + 0.3, dpois(y, lambda = theta), type = "h", col = 2)

Recursos interativos com o rpanel

library(rpanel)

react <- function(panel){
    with(panel,
    {
        m <- THETA/(1 - GAMMA)
        s <- sqrt(THETA/(1 - GAMMA)^3)
        from <- floor(max(c(0, m - 5 * s)))
        to <- ceiling(max(c(YMAX, m + 5 * s)))
        y <- from:to
        py <- dpg0(y = y, theta = THETA, gamma = GAMMA)
        if (POIS) {
            pz <- dpois(y, lambda = m)
        } else {
            pz <- 0
        }
        if (any(!is.finite(py))) {
            plot(x = NULL, y = NULL,
                 xlim = c(0, 1), ylim = c(0, 1),
                 axes = FALSE, ann = FALSE)
            text(x = 0.5, y = 0.5, labels = "Valores não finitos")
        } else {
            plot(py ~ y, type = "h",
                 ylim = c(0, max(c(py, pz))),
                 xlab = expression(y),
                 ylab = expression(f(y)))
            mtext(side = 3,
                  text = substitute(sum(f(y)) == s,
                                    list(s = round(sum(py), 5))))
            if (EX) {
                abline(v = m, col = 2)
            }
            if (POIS) {
                lines(y + 0.3, pz, type = "h", col = 3)
            }
        }
    })
    panel
}

panel <- rp.control(title = "Poisson Generalizada",
                    size = c(300, 100), YMAX = 30, m = 4)
rp.slider(panel = panel, action = react,
          variable = THETA, title = "theta",
          from = 0.1, to = 30,
          initval = 5, resolution = 0.1,
          showvalue = TRUE)
rp.slider(panel = panel, action = react,
          variable = GAMMA, title = "gamma",
          from = -1, to = 0.99,
          initval = 0, resolution = 0.01,
          showvalue = TRUE)
rp.checkbox(panel = panel,
            variable = EX, action = react, title = "E(Y)",
            labels = "Mostrar o valor esperado?")
rp.checkbox(panel = panel,
            variable = POIS, action = react, title = "Poisson",
            labels = "Adicionar a distribução Poisson?")
rp.do(panel = panel, action = react)

O espaço paramétrico de \(\theta\) e \(\gamma\)

#-----------------------------------------------------------------------
# Gráfico do espaço paramétrico de theta x gamma.

library(latticeExtra)

y <- 0:50
fun <- Vectorize(vectorize.args = c("theta", "gamma"),
                 FUN = function(theta, gamma) {
                     sum(dpg0(y = y, theta = theta, gamma = gamma))
                 })

grid <- list(theta = seq(0.1, 10, by = 0.1),
             gamma = seq(-0.98, 0.98, by = 0.02))
grid$sum <- with(grid, outer(theta, gamma, fun))
grid <- with(grid,
             cbind(expand.grid(theta = theta, gamma = gamma),
                   data.frame(sum = c(sum))))

# levelplot(sum ~ theta + gamma, data = grid,
#           col.regions = heat.colors) +
#     layer(panel.abline(h = 0))

levelplot(sum ~ theta + gamma,
          data = subset(grid, round(sum, 3) == 1),
          col.regions = heat.colors) +
    layer(panel.abline(h = 0))

subset(grid, round(sum, 3) != 1 & theta > 6 & gamma < 0)
##      theta gamma sum
## 481    8.1  -0.9   0
## 2968   6.8  -0.4   0

Modelo de Regressão com a Distribuição Poisson Generalizada

# Função densidade na parametrização de modelo de regressão.
dpg1 <- function(y, lambda, alpha) {
    k <- exp(lfactorial(y))
    w <- 1 + alpha * y
    z <- 1 + alpha * lambda
    fy <- (lambda/z)^(y) * w^(y - 1) * exp(-lambda * (w/z))/k
    return(fy)
}

react <- function(panel){
    with(panel,
    {
        m <- LAMBDA
        s <- sqrt(LAMBDA) * (1 + ALPHA * LAMBDA)
        from <- floor(max(c(0, m - 5 * s)))
        to <- ceiling(max(c(YMAX, m + 5 * s)))
        y <- from:to
        py <- dpg1(y = y, lambda = LAMBDA, alpha = ALPHA)
        if (POIS) {
            pz <- dpois(y, lambda = m)
        } else {
            pz <- 0
        }
        if (any(!is.finite(py))) {
            plot(x = NULL, y = NULL,
                 xlim = c(0, 1), ylim = c(0, 1),
                 axes = FALSE, ann = FALSE)
            text(x = 0.5, y = 0.5, labels = "Valores não finitos")
        } else {
            plot(py ~ y, type = "h",
                 ylim = c(0, max(c(py, pz))),
                 xlab = expression(y),
                 ylab = expression(f(y)))
            mtext(side = 3,
                  text = substitute(sum(f(y)) == s,
                                    list(s = round(sum(py), 5))))
            if (EX) {
                abline(v = m, col = 2)
            }
            if (POIS) {
                lines(y + 0.3, pz, type = "h", col = 3)
            }
        }
    })
    panel
}

panel <- rp.control(title = "Poisson Generalizada",
                    size = c(300, 100), YMAX = 30)
rp.slider(panel = panel, action = react,
          variable = LAMBDA, title = "lambda",
          from = 0.1, to = 30,
          initval = 5, resolution = 0.1,
          showvalue = TRUE)
rp.slider(panel = panel, action = react,
          variable = ALPHA, title = "alpha",
          from = -0.2, to = 0.4,
          initval = 0, resolution = 0.01,
          showvalue = TRUE)
rp.checkbox(panel = panel,
            variable = EX, action = react, title = "E(Y)",
            labels = "Mostrar o valor esperado?")
rp.checkbox(panel = panel,
            variable = POIS, action = react, title = "Poisson",
            labels = "Adicionar a distribução Poisson?")
rp.do(panel = panel, action = react)

Verossimilhança e Estimação

library(bbmle)
## Loading required package: stats4
library(corrplot)

# Função de log-Verossimilhança da Poisson Generalizada na
# parametrização de modelo de regressão.
PG_ll <- function(theta, y, X, offset = NULL) {
    # theta: vetor de parâmetros;
    #   theta[1]: parâmetro de dispersão (alpha);
    #   theta[-1]: parâmetro de locação (lambda);
    # y: variável resposta (contagem);
    # X: matriz do modelo linear;
    # offset: tamanho do domínio onde y foi medido;
    #----------------------------------------
    if (is.null(offset)) {
        offset <- 1L
    }
    lambda <- offset * exp(X %*% theta[-1])
    alpha <- theta[1]
    w <- 1 + alpha * y
    z <- 1 + alpha * lambda
    fy <- y * (log(lambda) - log(z)) +
        (y - 1) * log(w) -
        lambda * (w/z) -
        lfactorial(y)
    # Negativo da log-likelihood.
    -sum(fy)
}

#-----------------------------------------------------------------------
# Gerando uma amostra aleatória da Poisson.

# Offset = 2, lambda = 3.
y <- rpois(100, lambda = 2 * 3)

L <- list(y = y,
          offset = rep(2, length(y)),
          X = cbind(rep(1, length(y))))

start <- c(alpha = 0, lambda = 1)
parnames(PG_ll) <- names(start)

# Como \alpha foi fixado em 1, essa ll corresponde à Poisson.
n0 <- mle2(minuslogl = PG_ll,
           start = start, data = L,
           fixed = list(alpha = 0), vecpar = TRUE)

# Para conferir.
c(coef(n0)["lambda"],
  coef(glm(y ~ offset(log(L$offset)), family = poisson)))
##      lambda (Intercept) 
##    1.006131    1.006131
# Estimando o \alpha.
n1 <- mle2(PG_ll, start = start, data = L, vecpar = TRUE)
## Warning in log(z): NaNs produced
coef(n1)
##       alpha      lambda 
## 0.004356015 1.006131101
# Perfil de verossimilhança dos parâmetros.
plot(profile(n1))
## Warning in log(w): NaNs produced

# Covariância.
V <- cov2cor(vcov(n1))
corrplot.mixed(V, upper = "ellipse", col = "gray50")

dev.off()
## null device 
##           1

Número de Vagens Produzidas em Soja

Experimento fatorial (3 x 5), em delineamento de blocos casualizados, que estudou a influência de níveis de potássio na adubação em combinação com irrigação na produção de soja. As variáveis de contagem registradas nesse experimento foram o número de vagens viáveis (e não viáveis) e o número total de sementes por parcela.

library(lattice)

data(soja, package = "MRDCr")
str(soja)
## 'data.frame':    75 obs. of  5 variables:
##  $ K   : int  0 30 60 120 180 0 30 60 120 180 ...
##  $ umid: Factor w/ 3 levels "37,5","50","62,5": 1 1 1 1 1 2 2 2 2 2 ...
##  $ bloc: Factor w/ 5 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ngra: int  136 159 156 171 190 140 193 200 208 237 ...
##  $ nvag: int  56 62 66 68 82 63 86 94 86 97 ...
# A observação 74 é um outlier.
soja <- soja[-74, ]

xyplot(nvag ~ K | umid, data = soja, layout = c(NA, 1),
       type = c("p", "smooth"),
       strip = strip.custom(strip.names = TRUE, var.name = "Umidade"))

soja <- transform(soja, K = factor(K))

#-----------------------------------------------------------------------
# Modelo Poisson.

m0 <- glm(nvag ~ bloc + umid * K, data = soja, family = poisson)

# Diagnóstico.
par(mfrow = c(2, 2))
plot(m0); layout(1)

# Medidas de decisão.
anova(m0, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: nvag
## 
## Terms added sequentially (first to last)
## 
## 
##        Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                      73     322.68              
## bloc    4   14.284        69     308.40  0.006442 ** 
## umid    2   92.911        67     215.49 < 2.2e-16 ***
## K       4  136.060        63      79.43 < 2.2e-16 ***
## umid:K  8   14.150        55      65.28  0.077926 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m0)
## 
## Call:
## glm(formula = nvag ~ bloc + umid * K, family = poisson, data = soja)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9681  -0.6393  -0.0195   0.4854   2.3306  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.95369    0.06892  57.363  < 2e-16 ***
## blocII        -0.02928    0.04091  -0.716  0.47414    
## blocIII       -0.07265    0.04136  -1.756  0.07902 .  
## blocIV        -0.12544    0.04194  -2.991  0.00278 ** 
## blocV         -0.10795    0.04297  -2.512  0.01199 *  
## umid50         0.13404    0.08765   1.529  0.12619    
## umid62,5       0.21656    0.08602   2.518  0.01181 *  
## K30            0.27427    0.08493   3.229  0.00124 ** 
## K60            0.30797    0.08432   3.652  0.00026 ***
## K120           0.32883    0.08395   3.917 8.97e-05 ***
## K180           0.25540    0.08528   2.995  0.00275 ** 
## umid50:K30     0.06322    0.11557   0.547  0.58433    
## umid62,5:K30  -0.10747    0.11536  -0.932  0.35152    
## umid50:K60     0.16561    0.11370   1.457  0.14521    
## umid62,5:K60   0.10735    0.11220   0.957  0.33869    
## umid50:K120    0.14920    0.11338   1.316  0.18818    
## umid62,5:K120  0.11839    0.11404   1.038  0.29922    
## umid50:K180    0.30370    0.11361   2.673  0.00751 ** 
## umid62,5:K180  0.19838    0.11256   1.762  0.07800 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 322.684  on 73  degrees of freedom
## Residual deviance:  65.278  on 55  degrees of freedom
## AIC: 557.23
## 
## Number of Fisher Scoring iterations: 4
#-----------------------------------------------------------------------
# Modelo Poisson Generalizado.

L <- with(soja, list(y = nvag, offset = 1, X = model.matrix(m0)))

# Usa as estimativas do Poisson como valore iniciais.
start <- c(alpha = 0, coef(m0))
parnames(PG_ll) <- names(start)

# Com alpha fixo em 0 corresponde à Poisson.
m2 <- mle2(PG_ll, start = start, data = L,
           fixed = list(alpha = 0), vecpar = TRUE)

# Mesma medida de ajuste e estimativas.
c(logLik(m2), logLik(m0))
## [1] -259.6165 -259.6165
cbind(coef(m2)[-1], coef(m0))
##                      [,1]        [,2]
## (Intercept)    3.95368723  3.95368724
## blocII        -0.02927855 -0.02927854
## blocIII       -0.07265048 -0.07265048
## blocIV        -0.12543798 -0.12543798
## blocV         -0.10794857 -0.10794857
## umid50         0.13404356  0.13404356
## umid62,5       0.21656458  0.21656458
## K30            0.27427290  0.27427290
## K60            0.30796674  0.30796674
## K120           0.32883188  0.32883188
## K180           0.25540441  0.25540441
## umid50:K30     0.06322288  0.06322288
## umid62,5:K30  -0.10747272 -0.10747272
## umid50:K60     0.16561471  0.16561471
## umid62,5:K60   0.10735066  0.10735066
## umid50:K120    0.14920392  0.14920392
## umid62,5:K120  0.11838578  0.11838578
## umid50:K180    0.30369921  0.30369921
## umid62,5:K180  0.19837927  0.19837927
# Poisson Generalizada.
m3 <- mle2(PG_ll, start = start, data = L, vecpar = TRUE)
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(w): NaNs produced
## Warning in log(z): NaNs produced
# Teste para nulinidade do parâmetro de dispersão (H_0: alpha == 0).
anova(m3, m2)
## Likelihood Ratio Tests
## Model 1: m3, [PG_ll]: alpha+(Intercept)+blocII+blocIII+blocIV+
##           blocV+umid50+umid62,5+K30+K60+K120+K180+
##           umid50:K30+umid62,5:K30+umid50:K60+umid62,5:K60+
##           umid50:K120+umid62,5:K120+umid50:K180+umid62,5:K180
## Model 2: m2, [PG_ll]: (Intercept)+blocII+blocIII+blocIV+blocV+
##           umid50+umid62,5+K30+K60+K120+K180+umid50:K30+
##           umid62,5:K30+umid50:K60+umid62,5:K60+umid50:K120+
##           umid62,5:K120+umid50:K180+umid62,5:K180
##   Tot Df Deviance  Chisq Df Pr(>Chisq)
## 1     20   518.19                     
## 2     19   519.23 1.0402  1     0.3078
# est_stderr <- function(tb) {
#     sprintf("%s (%0.4f)",
#             formatC(tb[, 1], flag = " ", digits = 4, format = "f"),
#             tb[, 2])
# }
#
# L <- list("PoissonGLM" = rbind(NA, summary(m0)$coef),
#           "PoissonML*" = rbind(NA, summary(m2)@coef),
#           "PGeneraliz" = summary(m3)@coef)
#
# as.data.frame(sapply(L, est_stderr))

cbind("PoissonGLM" = c(NA, coef(m0)),
      "PoissonML" = coef(m2),
      "PGeneraliz" = coef(m3))
##                PoissonGLM   PoissonML   PGeneraliz
##                        NA  0.00000000 -0.001042601
## (Intercept)    3.95368724  3.95368723  3.953181451
## blocII        -0.02927854 -0.02927855 -0.027923784
## blocIII       -0.07265048 -0.07265048 -0.072056920
## blocIV        -0.12543798 -0.12543798 -0.127982907
## blocV         -0.10794857 -0.10794857 -0.105466353
## umid50         0.13404356  0.13404356  0.134139707
## umid62,5       0.21656458  0.21656458  0.216194659
## K30            0.27427290  0.27427290  0.273827867
## K60            0.30796674  0.30796674  0.307818797
## K120           0.32883188  0.32883188  0.328229190
## K180           0.25540441  0.25540441  0.255998709
## umid50:K30     0.06322288  0.06322288  0.064744844
## umid62,5:K30  -0.10747272 -0.10747272 -0.106284339
## umid50:K60     0.16561471  0.16561471  0.166265663
## umid62,5:K60   0.10735066  0.10735066  0.108202657
## umid50:K120    0.14920392  0.14920392  0.149064097
## umid62,5:K120  0.11838578  0.11838578  0.120194391
## umid50:K180    0.30369921  0.30369921  0.302905099
## umid62,5:K180  0.19837927  0.19837927  0.198624027
# Perfil para o parâmetro de dispersão.
plot(profile(m3, which = "alpha"))
## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced

## Warning in log(z): NaNs produced
abline(v = 0, lty = 2)

V <- cov2cor(vcov(m3))
corrplot.mixed(V, upper = "ellipse", col = "gray50")