%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Marginally Interpretable Linear Transformation Models} %\VignetteDepends{variables, basefun, mlt, tram, survival, lme4, gridExtra, lattice, latticeExtra, colorspace, HSAUR3, mvtnorm, ordinalCont, tramME, glmmTMB, geepack} \documentclass[article,nojss,shortnames]{jss} %% packages \usepackage{thumbpdf} \usepackage{amsfonts,amstext,amsmath,amssymb,amsthm} \usepackage{accents} \usepackage{color} \usepackage{rotating} \usepackage{verbatim} \usepackage[utf8]{inputenc} %% need no \usepackage{Sweave.sty} %%\usepackage[nolists]{endfloat} \newcommand{\cmd}[1]{\texttt{#1()}} \usepackage{tikz} \usetikzlibrary{shapes,arrows,chains} \usepackage{verbatim} <>= set.seed(290875) pkgs <- c("colorspace", "survival", "lme4", "tram", "gridExtra", "lattice", "latticeExtra", "mvtnorm", "ordinalCont", "tramME") pkgs <- sapply(pkgs, require, character.only = TRUE) @ \newcommand{\TODO}[1]{{\color{red} #1}} \newcommand\Torsten[1]{{\color{blue}Torsten: ``#1''}} \newcommand{\THcite}[2]{\citeauthor{#2} (\citeyear{#2})} \newcommand\norm[1]{\left\lVert#1\right\rVert} \newcommand{\CTM}{CTM Likelihood Boosting} \newcommand{\STM}{STM Likelihood Boosting} \newcommand{\etc}{\textit{etc.}} \usepackage{booktabs} \newcommand{\expit}{\text{expit}} %%% mlt %% rv \newcommand{\rZ}{Z} \newcommand{\rY}{Y} \newcommand{\rX}{\mX} \newcommand{\rz}{z} \newcommand{\ry}{y} \newcommand{\rx}{\xvec} \newcommand{\ru}{\uvec} \newcommand{\erx}{x} %% sigma algebra \newcommand{\sA}{\mathfrak{A}} \newcommand{\sAZ}{\mathfrak{B}} \newcommand{\sAY}{\mathfrak{C}} \newcommand{\esA}{A} \newcommand{\esAZ}{B} \newcommand{\esAY}{C} %% sample spaces \newcommand{\sam}{\Omega} \newcommand{\samZ}{\RR} \newcommand{\samY}{\Xi} \newcommand{\samX}{\chi} %% measureable spaces \newcommand{\ms}{(\sam, \sA)} \newcommand{\msZ}{(\samZ, \sAZ)} \newcommand{\msY}{(\samY, \sAY)} %% probability spaces \newcommand{\ps}{(\sam, \sA, \Prob)} \newcommand{\psZ}{(\samZ, \sAZ, \Prob_\rZ)} 
\newcommand{\psY}{(\samY, \sAY, \Prob_\rY)} %% distributions \newcommand{\pZ}{F} \newcommand{\pY}{F_\rY} \newcommand{\oY}{O_\rY} \newcommand{\oYx}{O_{\rY \mid \rX = \rx}} \newcommand{\hatpY}{\hat{F}_{\rY,N}} \newcommand{\hatpYx}{\hat{F}_{\rY \mid \rX = \rx, N}} \newcommand{\PYx}{\Prob_{\rY \mid \rX = \rx}} \newcommand{\PYX}{\Prob_{\rY, \rX}} \newcommand{\PY}{\Prob_{\rY}} \newcommand{\pN}{\Phi} \newcommand{\pSL}{F_{\SL}} \newcommand{\pMEV}{F_{\MEV}} \newcommand{\pExp}{F_{\ExpD}} \newcommand{\pSW}{F_{\SW}} \newcommand{\pYx}{F_{\rY \mid \rX = \rx}} \newcommand{\pYA}{F_{\rY \mid \rX = A}} \newcommand{\pYB}{F_{\rY \mid \rX = B}} \newcommand{\qZ}{F^{-1}_\rZ} \newcommand{\qY}{F^{-1}_\rY} \newcommand{\dZ}{f} \newcommand{\dY}{f_\rY} \newcommand{\hatdY}{\hat{f}_{\rY, N}} \newcommand{\dYx}{f_{\rY \mid \rX = \rx}} \newcommand{\hazY}{\lambda_\rY} \newcommand{\HazY}{\Lambda_\rY} \newcommand{\HazYx}{\Lambda_{\rY \mid \rX = \rx}} \newcommand{\hathazY}{\hat{\lambda}_{\rY, N}} \newcommand{\hatHazY}{\hat{\Lambda}_{\rY, N}} %% measures \newcommand{\measureY}{\mu} \newcommand{\lebesgue}{\mu_L} \newcommand{\counting}{\mu_C} %% trafo \newcommand{\g}{g} \newcommand{\h}{h} \newcommand{\s}{\svec} \newcommand{\hY}{h_\rY} \newcommand{\hx}{h_\rx} \newcommand{\hs}{\mathcal{H}} \newcommand{\basisy}{\avec} \newcommand{\bern}[1]{\avec_{\text{Bs},#1}} \newcommand{\bernx}[1]{\bvec_{\text{Bs},#1}} \newcommand{\basisx}{\bvec} \newcommand{\basisyx}{\cvec} \newcommand{\m}{m} \newcommand{\lik}{\mathcal{L}} \newcommand{\parm}{\varthetavec} \newcommand{\eparm}{\vartheta} \newcommand{\dimparm}{P} \newcommand{\dimparmx}{Q} \newcommand{\dimparmvar}{M} \newcommand{\dimparmrand}{R} \newcommand{\shiftparm}{\betavec} \newcommand{\scaleparm}{\xivec} \newcommand{\varparm}{\gammavec} \newcommand{\eshiftparm}{\beta} \newcommand{\escaleparm}{\xi} \newcommand{\ie}{\textit{i.e.}~} \newcommand{\eg}{\textit{e.g.}~} \renewcommand{\Prob}{\mathbb{P}} \newcommand{\Ex}{\mathbb{E}} \newcommand{\RR}{\mathbb{R}} 
\newcommand{\NN}{\mathbb{N}} \newcommand{\eps}{\varepsilon} \newcommand{\prodname}{tensor } \newcommand{\Null}{\mathbf{0}} \newcommand{\FI}{\mF} \usepackage{dsfont} \newcommand{\I}{\mathds{1}} \def \dsP {\text{$\mathds{P}$}} \def \dsE {\text{$\mathds{E}$}} \def \dsR {\text{$\mathds{R}$}} \def \dsN {\text{$\mathds{N}$}} % Math Operators \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\LRT}{LRT} \DeclareMathOperator{\RLRT}{RLRT} \DeclareMathOperator{\Cov}{Cov} \DeclareMathOperator{\Cor}{Cor} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\EW}{\dsE} \DeclareMathOperator{\D}{D} \DeclareMathOperator{\Bias}{Bias} \DeclareMathOperator{\MSE}{MSE} \DeclareMathOperator{\PLS}{PLS} \DeclareMathOperator{\rank}{rank} \DeclareMathOperator{\ncol}{ncol} \DeclareMathOperator{\pen}{pen} \DeclareMathOperator{\const}{const} \DeclareMathOperator{\diag}{diag} \DeclareMathOperator{\blockdiag}{blockdiag} \DeclareMathOperator{\df}{df} \DeclareMathOperator{\trace}{tr} \DeclareMathOperator{\iid}{i.i.d.} \DeclareMathOperator{\ind}{ind.} \DeclareMathOperator{\obs}{obs} \DeclareMathOperator{\acos}{acos} \DeclareMathOperator{\spat}{spat} \DeclareMathOperator{\fix}{{fix}} \DeclareMathOperator{\ran}{{ran}} \DeclareMathOperator*{\argmin}{{arg\,min}} \DeclareMathOperator*{\argmax}{{arg\,max}} \DeclareMathOperator{\BIC}{{BIC}} \DeclareMathOperator{\DIC}{{DIC}} \DeclareMathOperator{\AIC}{{AIC}} \DeclareMathOperator{\mAIC}{{mAIC}} \DeclareMathOperator{\cAIC}{{cAIC}} % Distributions \DeclareMathOperator{\ND}{N} \DeclareMathOperator{\TND}{TN} \DeclareMathOperator{\UD}{U} \DeclareMathOperator{\GaD}{Ga} \DeclareMathOperator{\tD}{t} \DeclareMathOperator{\IGD}{IG} \DeclareMathOperator{\IWD}{IW} \DeclareMathOperator{\PoD}{Po} \DeclareMathOperator{\ExpD}{Exp} \DeclareMathOperator{\LapD}{Lap} \DeclareMathOperator{\MuD}{Mu} \DeclareMathOperator{\DirD}{Dir} \DeclareMathOperator{\PDD}{PD} \DeclareMathOperator{\BeD}{Be} \DeclareMathOperator{\BD}{B} \DeclareMathOperator{\DPD}{DP} 
\DeclareMathOperator{\KSD}{KS} \DeclareMathOperator{\SL}{SL} \DeclareMathOperator{\MEV}{MEV} \DeclareMathOperator{\SW}{SW} \DeclareMathOperator{\Chi1}{\chi^2_1} \DeclareMathOperator{\WD}{W} % Boldface vectors and matrices \def \avec {\text{\boldmath$a$}} \def \mA {\text{\boldmath$A$}} \def \bvec {\text{\boldmath$b$}} \def \mB {\text{\boldmath$B$}} \def \cvec {\text{\boldmath$c$}} \def \mC {\text{\boldmath$C$}} \def \dvec {\text{\boldmath$d$}} \def \mD {\text{\boldmath$D$}} \def \evec {\text{\boldmath$e$}} \def \mE {\text{\boldmath$E$}} \def \fvec {\text{\boldmath$f$}} \def \mF {\text{\boldmath$F$}} \def \gvec {\text{\boldmath$g$}} \def \mG {\text{\boldmath$G$}} \def \hvec {\text{\boldmath$h$}} \def \mH {\text{\boldmath$H$}} \def \ivec {\text{\boldmath$i$}} \def \mI {\text{\boldmath$I$}} \def \jvec {\text{\boldmath$j$}} \def \mJ {\text{\boldmath$J$}} \def \kvec {\text{\boldmath$k$}} \def \mK {\text{\boldmath$K$}} \def \lvec {\text{\boldmath$l$}} \def \mL {\text{\boldmath$L$}} \def \mvec {\text{\boldmath$m$}} \def \mM {\text{\boldmath$M$}} \def \nvec {\text{\boldmath$n$}} \def \mN {\text{\boldmath$N$}} \def \ovec {\text{\boldmath$o$}} \def \mO {\text{\boldmath$O$}} \def \pvec {\text{\boldmath$p$}} \def \mP {\text{\boldmath$P$}} \def \qvec {\text{\boldmath$q$}} \def \mQ {\text{\boldmath$Q$}} \def \rvec {\text{\boldmath$r$}} \def \mR {\text{\boldmath$R$}} \def \svec {\text{\boldmath$s$}} \def \mS {\text{\boldmath$S$}} \def \tvec {\text{\boldmath$t$}} \def \mT {\text{\boldmath$T$}} \def \uvec {\text{\boldmath$u$}} \def \mU {\text{\boldmath$U$}} \def \vvec {\text{\boldmath$v$}} \def \mV {\text{\boldmath$V$}} \def \wvec {\text{\boldmath$w$}} \def \mW {\text{\boldmath$W$}} \def \xvec {\text{\boldmath$x$}} \def \mX {\text{\boldmath$X$}} \def \yvec {\text{\boldmath$y$}} \def \mY {\text{\boldmath$Y$}} \def \zvec {\text{\boldmath$z$}} \def \mZ {\text{\boldmath$Z$}} \def \calA {\mathcal A} \def \calB {\mathcal B} \def \calC {\mathcal C} \def \calD {\mathcal D} \def \calE 
{\mathcal E} \def \calF {\mathcal F} \def \calG {\mathcal G} \def \calH {\mathcal H} \def \calI {\mathcal I} \def \calJ {\mathcal J} \def \calK {\mathcal K} \def \calL {\mathcal L} \def \calM {\mathcal M} \def \calN {\mathcal N} \def \calO {\mathcal O} \def \calP {\mathcal P} \def \calQ {\mathcal Q} \def \calR {\mathcal R} \def \calS {\mathcal S} \def \calT {\mathcal T} \def \calU {\mathcal U} \def \calV {\mathcal V} \def \calW {\mathcal W} \def \calX {\mathcal X} \def \calY {\mathcal Y} \def \calZ {\mathcal Z} \def \ahatvec {\text{\boldmath$\hat a$}} \def \mhatA {\text{\boldmath$\hat A$}} \def \bhatvec {\text{\boldmath$\hat b$}} \def \mhatB {\text{\boldmath$\hat B$}} \def \chatvec {\text{\boldmath$\hat c$}} \def \mhatC {\text{\boldmath$\hat C$}} \def \dhatvec {\text{\boldmath$\hat d$}} \def \mhatD {\text{\boldmath$\hat D$}} \def \ehatvec {\text{\boldmath$\hat e$}} \def \mhatE {\text{\boldmath$\hat E$}} \def \fhatvec {\text{\boldmath$\hat f$}} \def \mhatF {\text{\boldmath$\hat F$}} \def \ghatvec {\text{\boldmath$\hat g$}} \def \mhatG {\text{\boldmath$\hat G$}} \def \hhatvec {\text{\boldmath$\hat h$}} \def \mhatH {\text{\boldmath$\hat H$}} \def \ihatvec {\text{\boldmath$\hat i$}} \def \mhatI {\text{\boldmath$\hat I$}} \def \jhatvec {\text{\boldmath$\hat j$}} \def \mhatJ {\text{\boldmath$\hat J$}} \def \khatvec {\text{\boldmath$\hat k$}} \def \mhatK {\text{\boldmath$\hat K$}} \def \lhatvec {\text{\boldmath$\hat l$}} \def \mhatL {\text{\boldmath$\hat L$}} \def \mhatvec {\text{\boldmath$\hat m$}} \def \mhatM {\text{\boldmath$\hat M$}} \def \nhatvec {\text{\boldmath$\hat n$}} \def \mhatN {\text{\boldmath$\hat N$}} \def \ohatvec {\text{\boldmath$\hat o$}} \def \mhatO {\text{\boldmath$\hat O$}} \def \phatvec {\text{\boldmath$\hat p$}} \def \mhatP {\text{\boldmath$\hat P$}} \def \qhatvec {\text{\boldmath$\hat q$}} \def \mhatQ {\text{\boldmath$\hat Q$}} \def \rhatvec {\text{\boldmath$\hat r$}} \def \mhatR {\text{\boldmath$\hat R$}} \def \shatvec {\text{\boldmath$\hat s$}} 
\def \mhatS {\text{\boldmath$\hat S$}} \def \thatvec {\text{\boldmath$\hat t$}} \def \mhatT {\text{\boldmath$\hat T$}} \def \uhatvec {\text{\boldmath$\hat u$}} \def \mhatU {\text{\boldmath$\hat U$}} \def \vhatvec {\text{\boldmath$\hat v$}} \def \mhatV {\text{\boldmath$\hat V$}} \def \whatvec {\text{\boldmath$\hat w$}} \def \mhatW {\text{\boldmath$\hat W$}} \def \xhatvec {\text{\boldmath$\hat x$}} \def \mhatX {\text{\boldmath$\hat X$}} \def \yhatvec {\text{\boldmath$\hat y$}} \def \mhatY {\text{\boldmath$\hat Y$}} \def \zhatvec {\text{\boldmath$\hat z$}} \def \mhatZ {\text{\boldmath$\hat Z$}} \def \atildevec {\text{\boldmath$\tilde a$}} \def \mtildeA {\text{\boldmath$\tilde A$}} \def \btildevec {\text{\boldmath$\tilde b$}} \def \mtildeB {\text{\boldmath$\tilde B$}} \def \ctildevec {\text{\boldmath$\tilde c$}} \def \mtildeC {\text{\boldmath$\tilde C$}} \def \dtildevec {\text{\boldmath$\tilde d$}} \def \mtildeD {\text{\boldmath$\tilde D$}} \def \etildevec {\text{\boldmath$\tilde e$}} \def \mtildeE {\text{\boldmath$\tilde E$}} \def \ftildevec {\text{\boldmath$\tilde f$}} \def \mtildeF {\text{\boldmath$\tilde F$}} \def \gtildevec {\text{\boldmath$\tilde g$}} \def \mtildeG {\text{\boldmath$\tilde G$}} \def \htildevec {\text{\boldmath$\tilde h$}} \def \mtildeH {\text{\boldmath$\tilde H$}} \def \itildevec {\text{\boldmath$\tilde i$}} \def \mtildeI {\text{\boldmath$\tilde I$}} \def \jtildevec {\text{\boldmath$\tilde j$}} \def \mtildeJ {\text{\boldmath$\tilde J$}} \def \ktildevec {\text{\boldmath$\tilde k$}} \def \mtildeK {\text{\boldmath$\tilde K$}} \def \ltildevec {\text{\boldmath$\tilde l$}} \def \mtildeL {\text{\boldmath$\tilde L$}} \def \mtildevec {\text{\boldmath$\tilde m$}} \def \mtildeM {\text{\boldmath$\tilde M$}} \def \ntildevec {\text{\boldmath$\tilde n$}} \def \mtildeN {\text{\boldmath$\tilde N$}} \def \otildevec {\text{\boldmath$\tilde o$}} \def \mtildeO {\text{\boldmath$\tilde O$}} \def \ptildevec {\text{\boldmath$\tilde p$}} \def \mtildeP 
{\text{\boldmath$\tilde P$}} \def \qtildevec {\text{\boldmath$\tilde q$}} \def \mtildeQ {\text{\boldmath$\tilde Q$}} \def \rtildevec {\text{\boldmath$\tilde r$}} \def \mtildeR {\text{\boldmath$\tilde R$}} \def \stildevec {\text{\boldmath$\tilde s$}} \def \mtildeS {\text{\boldmath$\tilde S$}} \def \ttildevec {\text{\boldmath$\tilde t$}} \def \mtildeT {\text{\boldmath$\tilde T$}} \def \utildevec {\text{\boldmath$\tilde u$}} \def \mtildeU {\text{\boldmath$\tilde U$}} \def \vtildevec {\text{\boldmath$\tilde v$}} \def \mtildeV {\text{\boldmath$\tilde V$}} \def \wtildevec {\text{\boldmath$\tilde w$}} \def \mtildeW {\text{\boldmath$\tilde W$}} \def \xtildevec {\text{\boldmath$\tilde x$}} \def \mtildeX {\text{\boldmath$\tilde X$}} \def \ytildevec {\text{\boldmath$\tilde y$}} \def \mtildeY {\text{\boldmath$\tilde Y$}} \def \ztildevec {\text{\boldmath$\tilde z$}} \def \mtildeZ {\text{\boldmath$\tilde Z$}} \def \alphavec {\text{\boldmath$\alpha$}} \def \betavec {\text{\boldmath$\beta$}} \def \gammavec {\text{\boldmath$\gamma$}} \def \deltavec {\text{\boldmath$\delta$}} \def \epsilonvec {\text{\boldmath$\epsilon$}} \def \varepsilonvec {\text{\boldmath$\varepsilon$}} \def \zetavec {\text{\boldmath$\zeta$}} \def \etavec {\text{\boldmath$\eta$}} \def \thetavec {\text{\boldmath$\theta$}} \def \varthetavec {\text{\boldmath$\vartheta$}} \def \iotavec {\text{\boldmath$\iota$}} \def \kappavec {\text{\boldmath$\kappa$}} \def \lambdavec {\text{\boldmath$\lambda$}} \def \muvec {\text{\boldmath$\mu$}} \def \nuvec {\text{\boldmath$\nu$}} \def \xivec {\text{\boldmath$\xi$}} \def \pivec {\text{\boldmath$\pi$}} \def \varpivec {\text{\boldmath$\varpi$}} \def \rhovec {\text{\boldmath$\rho$}} \def \varrhovec {\text{\boldmath$\varrho$}} \def \sigmavec {\text{\boldmath$\sigma$}} \def \varsigmavec {\text{\boldmath$\varsigma$}} \def \tauvec {\text{\boldmath$\tau$}} \def \upsilonvec {\text{\boldmath$\upsilon$}} \def \phivec {\text{\boldmath$\phi$}} \def \varphivec {\text{\boldmath$\varphi$}} \def 
\psivec {\text{\boldmath$\psi$}} \def \chivec {\text{\boldmath$\chi$}} \def \omegavec {\text{\boldmath$\omega$}} \def \alphahatvec {\text{\boldmath$\hat \alpha$}} \def \betahatvec {\text{\boldmath$\hat \beta$}} \def \gammahatvec {\text{\boldmath$\hat \gamma$}} \def \deltahatvec {\text{\boldmath$\hat \delta$}} \def \epsilonhatvec {\text{\boldmath$\hat \epsilon$}} \def \varepsilonhatvec {\text{\boldmath$\hat \varepsilon$}} \def \zetahatvec {\text{\boldmath$\hat \zeta$}} \def \etahatvec {\text{\boldmath$\hat \eta$}} \def \thetahatvec {\text{\boldmath$\hat \theta$}} \def \varthetahatvec {\text{\boldmath$\hat \vartheta$}} \def \iotahatvec {\text{\boldmath$\hat \iota$}} \def \kappahatvec {\text{\boldmath$\hat \kappa$}} \def \lambdahatvec {\text{\boldmath$\hat \lambda$}} \def \muhatvec {\text{\boldmath$\hat \mu$}} \def \nuhatvec {\text{\boldmath$\hat \nu$}} \def \xihatvec {\text{\boldmath$\hat \xi$}} \def \pihatvec {\text{\boldmath$\hat \pi$}} \def \varpihatvec {\text{\boldmath$\hat \varpi$}} \def \rhohatvec {\text{\boldmath$\hat \rho$}} \def \varrhohatvec {\text{\boldmath$\hat \varrho$}} \def \sigmahatvec {\text{\boldmath$\hat \sigma$}} \def \varsigmahatvec {\text{\boldmath$\hat \varsigma$}} \def \tauhatvec {\text{\boldmath$\hat \tau$}} \def \upsilonhatvec {\text{\boldmath$\hat \upsilon$}} \def \phihatvec {\text{\boldmath$\hat \phi$}} \def \varphihatvec {\text{\boldmath$\hat \varphi$}} \def \psihatvec {\text{\boldmath$\hat \psi$}} \def \chihatvec {\text{\boldmath$\hat \chi$}} \def \omegahatvec {\text{\boldmath$\hat \omega$}} \def \alphatildevec {\text{\boldmath$\tilde \alpha$}} \def \betatildevec {\text{\boldmath$\tilde \beta$}} \def \gammatildevec {\text{\boldmath$\tilde \gamma$}} \def \deltatildevec {\text{\boldmath$\tilde \delta$}} \def \epsilontildevec {\text{\boldmath$\tilde \epsilon$}} \def \varepsilontildevec {\text{\boldmath$\tilde \varepsilon$}} \def \zetatildevec {\text{\boldmath$\tilde \zeta$}} \def \etatildevec {\text{\boldmath$\tilde \eta$}} \def 
\thetatildevec {\text{\boldmath$\tilde \theta$}} \def \varthetatildevec {\text{\boldmath$\tilde \vartheta$}} \def \iotatildevec {\text{\boldmath$\tilde \iota$}} \def \kappatildevec {\text{\boldmath$\tilde \kappa$}} \def \lambdatildevec {\text{\boldmath$\tilde \lambda$}} \def \mutildevec {\text{\boldmath$\tilde \mu$}} \def \nutildevec {\text{\boldmath$\tilde \nu$}} \def \xitildevec {\text{\boldmath$\tilde \xi$}} \def \pitildevec {\text{\boldmath$\tilde \pi$}} \def \varpitildevec {\text{\boldmath$\tilde \varpi$}} \def \rhotildevec {\text{\boldmath$\tilde \rho$}} \def \varrhotildevec {\text{\boldmath$\tilde \varrho$}} \def \sigmatildevec {\text{\boldmath$\tilde \sigma$}} \def \varsigmatildevec {\text{\boldmath$\tilde \varsigma$}} \def \tautildevec {\text{\boldmath$\tilde \tau$}} \def \upsilontildevec {\text{\boldmath$\tilde \upsilon$}} \def \phitildevec {\text{\boldmath$\tilde \phi$}} \def \varphitildevec {\text{\boldmath$\tilde \varphi$}} \def \psitildevec {\text{\boldmath$\tilde \psi$}} \def \chitildevec {\text{\boldmath$\tilde \chi$}} \def \omegatildevec {\text{\boldmath$\tilde \omega$}} \def \mGamma {\mathbf{\Gamma}} \def \mDelta {\mathbf{\Delta}} \def \mTheta {\mathbf{\Theta}} \def \mLambda {\mathbf{\Lambda}} \def \mXi {\mathbf{\Xi}} \def \mPi {\mathbf{\Pi}} \def \mSigma {\mathbf{\Sigma}} \def \mUpsilon {\mathbf{\Upsilon}} \def \mPhi {\mathbf{\Phi}} \def \mPsi {\mathbf{\Psi}} \def \mOmega {\mathbf{\Omega}} \def \mhatGamma {\mathbf{\hat \Gamma}} \def \mhatDelta {\mathbf{\hat \Delta}} \def \mhatTheta {\mathbf{\hat \Theta}} \def \mhatLambda {\mathbf{\hat \Lambda}} \def \mhatXi {\mathbf{\hat \Xi}} \def \mhatPi {\mathbf{\hat \Pi}} \def \mhatSigma {\mathbf{\hat \Sigma}} \def \mhatUpsilon {\mathbf{\hat \Upsilon}} \def \mhatPhi {\mathbf{\hat \Phi}} \def \mhatPsi {\mathbf{\hat \Psi}} \def \mhatOmega {\mathbf{\hat \Omega}} \def \nullvec {\mathbf{0}} \def \onevec {\mathbf{1}} %%% theorems \newtheorem{lem}{Lemma} \newtheorem{thm}{Theorem} \newtheorem{coro}{Corollary} 
\newtheorem{defn}{Definition} \newtheorem{remark}{Remark} \newcommand{\ubar}[1]{\underaccent{\bar}{#1}} \renewcommand{\thefootnote}{} %% code commands \newcommand{\Rclass}[1]{`\code{#1}'} %% JSS \author{Luisa Barbanti \\ Universit\"at Z\"urich \And Torsten Hothorn \\ Universit\"at Z\"urich} \Plainauthor{Barbanti and Hothorn} \title{Some Applications of Marginally Interpretable Linear Transformation Models for Clustered Observations} \Plaintitle{Marginally Interpretable Transformation Models} \Shorttitle{Marginally Interpretable Transformation Models} \Abstract{ Owing to their generality, transformation models can be used to set-up and compute many interesting regression models for discrete and continuous responses. This document focuses on the analysis of clustered observations. Marginal predictive distributions are defined by transformation models and their joint normal distribution depends on a structured covariance matrix. Applications with skewed, bounded, and survival continuous outcomes as well as binary and ordered categorical responses are presented. Data is analysed by a proof-of-concept implementation of parametric linear transformation models for clustered observations available in the \pkg{tram} add-on package to the \proglang{R} system for statistical computing. 
} \Keywords{conditional mixed models, marginal models, marginal predictive distributions, survival analysis, categorical data analysis} \Plainkeywords{conditional mixed models, marginal models, marginal predictive distributions, survival analysis, categorical data analysis} \Address{ Luisa Barbanti, Torsten Hothorn\\ Institut f\"ur Epidemiologie, Biostatistik und Pr\"avention \\ Universit\"at Z\"urich \\ Hirschengraben 84, CH-8001 Z\"urich, Switzerland \\ \texttt{Torsten.Hothorn@R-project.org} } \begin{document} <>= year <- substr(packageDescription("tram")$Date, 1, 4) version <- packageDescription("tram")$Version @ \footnote{Please cite this document as: Luisa Barbanti and Torsten Hothorn (\Sexpr{year}) Some Applications of Marginally Interpretable Linear Transformation Models for Clustered Observations. \textsf{R} package vignette version \Sexpr{version}, URL \url{https://CRAN.R-project.org/package=tram}.} % \input{todo} <>= if (any(!pkgs)) { cat(paste("Package(s)", paste(names(pkgs)[!pkgs], collapse = ", "), "not available, stop processing.", "\\end{document}\n")) knitr::knit_exit() } if (!interactive() && .Platform$OS.type != "unix") { cat("Vignette only compiled under Unix alikes.") knitr::knit_exit() } @ <>= trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7), box.rectangle = list(col=1), box.umbrella = list(lty=1, col=1), strip.background = list(col = "white"))) ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme ltheme$strip.background$col <- "transparent" ## change strip bg lattice.options(default.theme = ltheme) knitr::opts_chunk$set(echo = TRUE, results = 'markup', error = FALSE, warning = FALSE, message = FALSE, tidy = FALSE, cache = FALSE, size = "small", fig.width = 6, fig.height = 4, fig.align = "center", out.width = NULL, ###'.6\\linewidth', out.height = NULL, fig.scap = NA) knitr::render_sweave() # use Sweave environments knitr::set_header(highlight = '') # do not \usepackage{Sweave} ## R settings options(prompt = "R> ", 
continue = "+ ", useFancyQuotes = FALSE) # JSS style options(width = 75) ### ecdf plots myprepanel <- function (x, y, f.value = NULL, ...) { ans <- prepanel.default.qqmath(x, f.value = f.value, distribution = qunif) with(ans, list(xlim = ylim, ylim = c(0, 1), dx = dy, dy = dx)) } mypanel <- function (x, y, f.value = NULL, type = "s", groups = NULL, qtype = 7, ref = TRUE, ...) { if (ref) { reference.line <- trellis.par.get("reference.line") do.call(panel.abline, c(list(h = c(0, 1)), reference.line)) } x <- as.numeric(x) distribution <- qunif nobs <- sum(!is.na(x)) if (!is.null(groups)) { panel.superpose(x, y = NULL, f.value = f.value, type = type, distribution = distribution, qtype = qtype, groups = groups, panel.groups = panel.ecdfplot, ...) } else if (nobs) { if (is.null(f.value)) { panel.xyplot(x = sort(x), y = cumsum(y[order(x)]) / sum(y), type = type, ...) } else { p <- if (is.numeric(f.value)) f.value else f.value(nobs) panel.xyplot(x = quantile(x, p, names = FALSE, type = qtype, na.rm = TRUE), y = distribution(p), type = type, ...) } } } col <- diverge_hcl(2, h = c(246, 40), c = 120, l = c(65, 90), alpha = .75) @ \section{Introduction} The purpose of this document is to compare marginally interpretable linear transformation models for clustered observations \citep{Hothorn_2019_mtram} to conventional conditional formulations of mixed-effects models where such an overlap exists. In addition, novel transformation models going beyond the capabilities of convential mixed-effects models are estimated and interpreted. A proof-of-concept implementation available in package \pkg{tram} \citep{pkg:tram} is applied. % use mtram in package tram The results presented in this document can be reproduced from the \code{mtram} demo <>= install.packages("tram") demo("mtram", package = "tram") @ \section{Normal and Non-normal Mixed-effects Models} First we consider mixed-effects models for reaction times in the sleep deprivation study \citep{Belenky_Wesensten_Thorne_2003}. 
The average reaction times to a specific task over several days of sleep deprivation are given for $i = 1, \dots, N = 18$ subjects in Figure~\ref{fig:sleepstudy}. The data are often used to illustrate conditional normal linear mixed-effects models with correlated random intercepts and slopes. %of the form (\ref{fm:normal}) \begin{figure}[t] <>= library("lme4") xyplot(Reaction ~ Days | Subject, data = sleepstudy, xlab = "Days of sleep deprivation", ylab = "Average reaction time (in ms)") @ \caption{Sleep deprivation: Average reaction times to a specific task over several days of sleep deprivation for $18$ subjects from \cite{Belenky_Wesensten_Thorne_2003}. \label{fig:sleepstudy}} \end{figure} The classical normal linear random-intercept/random-slope model, treating the study participants as independent observations, is fitted by maximum likelihood to the data using the \cmd{lmer} function from the \pkg{lme4} add-on package \citep{pkg:lme4}: % <>= sleep_lmer <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy, REML = FALSE) @ % The corresponding conditional model for subject $i$ reads % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\frac{\ry - \alpha - \beta \text{day} - \alpha_i - \beta_i \text{day}}{\sigma}\right), \quad (\alpha_i, \beta_i) \sim \ND_2(\nullvec, \mG(\varparm)) \end{eqnarray*} % with $\sigma^{-2}\mG = \mLambda(\varparm) \mLambda(\varparm)^\top$ and % \begin{eqnarray*} \mLambda(\varparm) = \left( \begin{array}{cc} \gamma_1 & 0 \\ \gamma_2 & \gamma_3 \end{array} \right), \quad \varparm = (\gamma_1, \gamma_2, \gamma_3)^\top. 
\end{eqnarray*} The same model, however using the alternative parameterisation and an independent (of \pkg{lme4}, only the \cmd{update} method for Cholesky factors is reused) gradient-based maximisation of the log-likelihood, is estimated in a two-step approach as <>= library("tram") @ <>= sleep_LM <- Lm(Reaction ~ Days, data = sleepstudy) sleep_LMmer <- mtram(sleep_LM, ~ (Days | Subject), data = sleepstudy) @ % The first call to \cmd{Lm} computes the equivalent of a normal linear regression model parameterised as a linear transformation model \emph{ignoring} the longitudinal nature of the observations. The purpose if to set-up the necessary model infrastructure (model matrices, inverse link functions, etc.) and to compute reasonable starting values for the fixed effects. The second call to \cmd{mtram} specifies the random effects structure (here a correlated pair of random intercept for subject and random slope for days) and optimises the likelihood for all model parameters $\eparm_1, \tilde{\alpha}, \tilde{\beta}$, and $\varparm$ in the model (here also looking at the conditional model for subject $i$) % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\eparm_1 \ry + \tilde{\alpha} - \tilde{\beta} \text{day} - \tilde{\alpha}_i - \tilde{\beta}_i \text{day}\right), \quad (\tilde{\alpha}_i, \tilde{\beta}_i) \sim \ND_2(\nullvec, \mLambda(\varparm) \mLambda(\varparm)) \end{eqnarray*} % that is, all fixed and random effect parameters are divided by the residual standard deviation $\sigma$ (this is the reparameterisation applied by \cmd{Lm}). Of course, the parameter $\eparm_1$, the inverse residual standard deviation, is ensured to be positive via an additional constraint in the optimiser maximising the log-likelihood. 
% The log-likelihoods of the two models fitted by \cmd{lmer} and \cmd{mtram} are very close <>= logLik(sleep_lmer) logLik(sleep_LMmer) @ Looking at the model coefficients, the two procedures lead to almost identical inverse residual standard deviations <>= (sdinv <- 1 / summary(sleep_lmer)$sigma) coef(sleep_LMmer)["Reaction"] @ and fixed effects (the slope can be interpreted as inverse coefficient of variation) <>= fixef(sleep_lmer) * c(-1, 1) * sdinv coef(sleep_LMmer)[c("(Intercept)", "Days")] @ The random-effect parameters $\varparm$ are also reasonably close <>= sleep_lmer@theta coef(sleep_LMmer)[-(1:3)] @ Consequently, the variance-covariance and correlation matrices <>= sleep_LMmer$G * (1 / sdinv)^2 cov2cor(sleep_LMmer$G * (1 / sdinv)^2) unclass(VarCorr(sleep_lmer))$Subject @ are practically equivalent. This result indicates the correctness of the alternative implementation of normal linear mixed-effects models in the transformation model framework: \cmd{mtram} reuses some infrastructure from \pkg{lme4} and \pkg{Matrix}, most importantly fast update methods for Cholesky factors, but the likelihood and corresponding optimisation relies on an independent implementation. So why are we doing this? Because \cmd{mtram} is able to deal with models or likelihoods not available in \pkg{lme4}, for example the likelihood for interval-censored observations. Let's assume that the timing of the reaction times was less accurate than suggested by the numerical representation of the results. The following code <>= library("survival") sleepstudy$Reaction_I <- with(sleepstudy, Surv(Reaction - 20, Reaction + 20, type = "interval2")) sleepstudy$Reaction_I[1:5] @ converts the outcome to interval-censored values, where each interval has length $40$. 
The above mixed model can now be estimated by maximising the likelihood corresponding to interval-censored observations: <>= sleep_LM_I <- Lm(Reaction_I ~ Days, data = sleepstudy) sleep_LMmer_I <- mtram(sleep_LM_I, ~ (Days | Subject), data = sleepstudy) @ Of course, the log-likelihood changes (because this is a log-probability and not a log-density of a continuous distribution) but the parameter estimates are reasonably close <>= logLik(sleep_LMmer_I) coef(sleep_LMmer_I) coef(sleep_LMmer) @ The next question is if the normal assumption for reaction times is appropriate. In the transformation world, this assumption is simple to assess because we can easily (theoretically and in-silico) switch to the non-normal linear mixed-effects transformation model % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\h(\ry) - \tilde{\beta} \text{day} - \tilde{\alpha}_i - \tilde{\beta}_i \text{day}\right), \quad (\tilde{\alpha}_i, \tilde{\beta}_i) \sim \ND_2(\nullvec, \mLambda(\varparm) \mLambda(\varparm)) \end{eqnarray*} % where $\h(\ry) = \basisy(\ry)^\top \parm$ represents a monotone non-decreasing transformation function. The function implementing such a more flexible model in named in honor of the first paper on the analysis of transformed responses by \cite{BoxCox_1964} but it \emph{does not} simply apply what is known as a Box-Cox transformation. Bernstein polynomials $\h(\ry) = \basisy(\ry)^\top \parm$ under suitable constraints \citep{Hothorn_Moest_Buehlmann_2017} are applied instead by <>= sleep_BC <- BoxCox(Reaction ~ Days, data = sleepstudy) sleep_BCmer <- mtram(sleep_BC, ~ (Days | Subject), data = sleepstudy) logLik(sleep_BCmer) @ % The increase in the log-likelihood compared to the normal model is not a big surprise. Plotting the transformation function $\h(\ry) = \basisy(\ry)^\top \parm$ as a function of reaction time can help to assess deviations from normality because the latter assumption implies a linear transformation function. 
Figure~\ref{fig:sleepstudy_trafo} clearly indicates that models allowing a certain skewness of reaction times will provide a better fit to the data. This might also not come as a big surprise to experienced data analysts. \begin{figure}[t] <>= tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] plot(tmp, newdata = data.frame(Days = 0), type = "trafo", col = "black", xlab = "Average reaction time (in ms)", ylab = expression(h(y))) rug(sleepstudy$Reaction, col = rgb(.1, .1, .1, .1)) @ \caption{Sleep deprivation: Data-driven transformation $\hat{\h}$ of average reaction times to sleep deprivation. The non-linearity induces a non-normal marginal distribution function of reaction times. \label{fig:sleepstudy_trafo}} \end{figure} Such probit-type mixed-effects models have been studied before, mostly by merging a Box-Cox power transformation $\h$ with a grid-search over REML estimates \citep{Gurka_Edwards_2006}, a conditional likelihood \citep{Hutmacher_French_2011}, or a grid-search maximising the profile likelihood \citep{Maruo_Yamaguchi_2017}. Recently, \THcite{Tang, Wu, and Chen}{Tang_Wu_Chen_2018} and \THcite{Wu and Wang}{Wu_Wang_2019} proposed a monotone spline parameterisation of $\h$ in a Bayesian context. The model presented here was estimated by simultaneously maximising the log-likelihood \citep{Hothorn_2019_mtram} %(\ref{fm:cll}) with respect to the parameters $\parm$, $\eshiftparm$, and $\varparm$. For a linear Bernstein polynomial of order one, the models obtained with this approach and classical maximum likelihood estimation in normal linear mixed-effects models are equivalent (up to reparameterisation of $\eshiftparm$).% \citep{vign:tram}. However, what does this finding mean in terms of a direct comparison of the model and the data? 
Looking at the marginal cumulative distribution functions of average reaction time conditional on days of sleep deprivation in Figure~\ref{fig:sleepstudy_ecdf} one finds that the non-normal marginal transformation models provided a better fit to the marginal empirical cumulative distribution functions than the normal marginal models. Especially for short reaction times in the first week of sleep deprivation, the orange marginal cumulative distribution is much closer to the empirical cumulative distribution function representing the marginal distribution of reaction times at each single day of study participation. \begin{figure}[t] <>= days <- 0:9 q <- seq(from = min(sleepstudy$Reaction), to = max(sleepstudy$Reaction), length.out = 100) meth <- c("Normal linear mixed-effects model", "Non-normal linear transformation model") ex <- expand.grid(Reaction = q, Days = days, Method = factor(meth, levels = meth, labels = meth)) U <- cbind(1, days) ### Linear: marginal covariance from the normal model's G (sleep_LMmer) tmp <- as.mlt(sleep_LM) cf <- coef(tmp) coef(tmp) <- coef(sleep_LMmer)[names(cf)] SLM <- tcrossprod(U %*% as(sleep_LMmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SLM)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_LM <- pnorm(t(t(h) / sd )) ### BoxCox tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_BCmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_BC <- pnorm(t(t(h) / sd )) ex$prob <- c(prob_LM, prob_BC) plotfun <- function(prob, data, ...) { fm <- as.formula(paste(prob, "~ Reaction | Days")) xyplot(fm, data = data, type = "l", panel = function(x, y, subscripts, ...) { tmp <- subset(sleepstudy, Days == unique(nd[subscripts, "Days"])) mypanel(tmp$Reaction, rep(1, nrow(tmp)), lwd = 3, col = grey) panel.xyplot(x, y, subscripts = subscripts, ...)
}, col = col, xlab = "Average reaction time (in ms)", ylab = "Marginal distribution function", lwd = 2, groups = Method, ...) } grey <- rgb(.75, .75, .75) nd <- ex plotfun("prob", ex, layout = c(5, 2), par.settings = simpleTheme(col=c(grey, col), lwd = 3), auto.key = list(text = c("Empirical cumulative distribution function", levels(nd$Method)), points = FALSE, lines = TRUE, space = "top")) @ \caption{Sleep deprivation: Marginal distribution of reaction times, separately for each day of study participation. The grey step-function corresponds to the empirical cumulative distribution function, the blue line to the marginal cumulative distribution of a normal linear mixed-effects model, and the orange line to a non-normal linear mixed-effects transformation model. \label{fig:sleepstudy_ecdf}} \end{figure} It should be noted that the small positive correlation between random intercept and random slope observed in the normal linear mixed-effects model turned into a negative correlation in this non-normal model <>= cov2cor(sleep_BCmer$G) @ What is the uncertainty associated with this parameter? The correlation is a non-linear function of $\varparm$ and therefore the direct computation of confidence intervals is questionable.
However, we can extract an estimate of the covariance of the estimated model parameters from the model and, relying on the asymptotic normality of the maximum likelihood estimators, we can sample from the asymptotic distribution of the variance of the random intercept $\tilde{\alpha}$, the random slope $\tilde{\beta}$, and their correlation <>= library("mvtnorm") VC <- vcov(sleep_BCmer) idx <- (nrow(VC) - 2):nrow(VC) Rcoef <- rmvnorm(1000, mean = coef(sleep_BCmer), sigma = VC)[,idx] ret <- apply(Rcoef, 1, function(gamma) { L <- matrix(c(gamma[1:2], 0, gamma[3]), nrow = 2) V <- tcrossprod(L) c(diag(V), cov2cor(V)[1,2]) }) @ The $95\%$ confidence intervals <>= ### variance random intercept quantile(ret[1,], c(.025, .5, .975)) ### variance random slope quantile(ret[2,], c(.025, .5, .975)) ### correlation random intercept / random slope quantile(ret[3,], c(.025, .5, .975)) @ indicate rather strong unobserved heterogeneity affecting the intercept and less pronounced variability in the slope. There is only weak information about the correlation of the two random effects contained in the data. The downside of this approach is that, although the model is nicely interpretable on the scale of marginal or conditional distribution functions, the direct interpretation of the fixed effect $\tilde{\beta}$ is not very straightforward because it corresponds to the conditional mean \emph{after} transforming the outcome. This interpretability issue can be addressed by exchanging the probit link for a logit link <>= sleep_C <- Colr(Reaction ~ Days, data = sleepstudy) sleep_Cmer <- mtram(sleep_C, ~ (Days | Subject), data = sleepstudy) logLik(sleep_Cmer) @ Here, the in-sample log-likelihood increases compared to the probit model and the marginal distributions obtained from this model are shown in Figure~\ref{fig:sleepstudy_ecdf-2}. How to interpret models of this type is discussed in Section~\ref{sec:logit}.
\begin{figure}[t] <>= days <- 0:9 q <- seq(from = min(sleepstudy$Reaction), to = max(sleepstudy$Reaction), length.out = 100) meth <- c("Normal linear mixed-effects model", "Probit transformation model", "Marginal logit transformation model") ex <- expand.grid(Reaction = q, Days = days, Method = factor(meth, levels = meth, labels = meth)) U <- cbind(1, days) ### Linear: marginal covariance from the normal model's G (sleep_LMmer) tmp <- as.mlt(sleep_LM) cf <- coef(tmp) coef(tmp) <- coef(sleep_LMmer)[names(cf)] SLM <- tcrossprod(U %*% as(sleep_LMmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SLM)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_LM <- pnorm(t(t(h) / sd )) ### BoxCox tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_BCmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_BC <- pnorm(t(t(h) / sd )) ### Colr tmp <- as.mlt(sleep_C) cf <- coef(tmp) coef(tmp) <- coef(sleep_Cmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_Cmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_C <- plogis(t(t(h) / sd )) ex$prob <- c(prob_LM, prob_BC, prob_C) plotfun <- function(prob, data, ...) { fm <- as.formula(paste(prob, "~ Reaction | Days")) xyplot(fm, data = data, type = "l", panel = function(x, y, subscripts, ...) { tmp <- subset(sleepstudy, Days == unique(nd[subscripts, "Days"])) mypanel(tmp$Reaction, rep(1, nrow(tmp)), lwd = 3, col = grey) panel.xyplot(x, y, subscripts = subscripts, ...) }, col = c(col, col[2]), xlab = "Average reaction time (in ms)", ylab = "Marginal distribution function", lwd = 2, groups = Method, lty = c(1, 1, 3), ...)
} grey <- rgb(.75, .75, .75) nd <- ex plotfun("prob", ex, layout = c(5, 2), par.settings = simpleTheme(col=c(grey, col, col[2]), lwd = 3, lty = c(1, 1, 1, 3)), auto.key = list(text = c("Empirical cumulative distribution function", levels(nd$Method)), points = FALSE, lines = TRUE, space = "top")) @ \caption{Sleep deprivation: Marginal distribution of reaction times, separately for each day of study participation. The grey step-function corresponds to the empirical cumulative distribution function, the blue line to the marginal cumulative distribution of a normal linear mixed-effects model, and the orange lines to a non-normal probit (solid) and marginal logit (dotted) transformation model. \label{fig:sleepstudy_ecdf-2}} \end{figure} \section{Models for Binary Outcomes} Here we compare different implementations of binary marginal and mixed models for the notoriously difficult toe nail data \citep{backer_vroey_1998}. The outcome was categorised to two levels \citep[this being probably the root of all troubles, as quasi-separation issues have been reported by][]{Sauter_Held_2016}. A conditional density plot (Figure~\ref{fig:toenail}) suggests an improvement in both treatment groups over time, however with a more rapid advance in patients treated with terbinafine. \begin{figure}[t] <>= data("toenail", package = "HSAUR3") layout(matrix(1:2, ncol = 2)) trt <- levels(toenail$treatment) cdplot(outcome ~ time, data = subset(toenail, treatment == trt[1]), main = trt[1], xlab = "Time", ylab = "Toe nail infection") cdplot(outcome ~ time, data = subset(toenail, treatment == trt[2]), main = trt[2], xlab = "Time", ylab = "") @ \caption{Toe nail data: Conditional density plot of two outcome classes (none or mild vs.~moderate or severe) under two treatments. 
\label{fig:toenail}} \end{figure} \subsection{Random Intercept Probit Models} We are first interested in binary probit models featuring fixed main and interaction effects $\eshiftparm_1$, $\eshiftparm_2$, and $\eshiftparm_3$ of treatment (itraconazole vs.~terbinafine) and time. Subject-specific random intercept models were estimated by the \code{glmer} function from package \pkg{lme4} \citep{pkg:lme4}, by the \code{glmmTMB} function from package \pkg{glmmTMB} \citep{pkg:glmmTMB}, and by direct maximisation of the exact discrete log-likelihood given in Appendix B of \cite{Hothorn_2019_mtram}. % discrete log-likelihood (\ref{fm:dll}) given in Appendix~\ref{app:cens}. The random intercept probit model fitted by Laplace and Adaptive Gauss-Hermite Quadrature (AGQ) approximations to the likelihood give quite different results: <>= ### Laplace toenail_glmer_RI_1 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 1) summary(toenail_glmer_RI_1) toenail_glmer_RI_1@theta ### Adaptive Gaussian Quadrature toenail_glmer_RI_2 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 20) summary(toenail_glmer_RI_2) toenail_glmer_RI_2@theta @ Package \pkg{glmmTMB} optimises the Laplace approximation utilising the Template Model Builder \pkg{TMB} package: <>= library("glmmTMB") toenail_glmmTMB_RI_3 <- glmmTMB(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit")) summary(toenail_glmmTMB_RI_3) @ Surprisingly, this model is very close to the one obtained by AGQ and quite off from the Laplace implementation in \pkg{lme4} (\code{nAGQ = 1} means Laplace). 
Because of the probit link, this binary generalised linear model is equivalent to a linear transformation model and we can thus use the exact likelihood implemented for the latter model in \cmd{mtram} for parameter estimation (it is still a bit nasty to set-up a constant transformation function $\h(\ry) = \alpha$, we plan to add a more convenient interface later) <>= m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_RI <- mtram(toenail_probit, ~ (1 | patientID), data = toenail) coef(toenail_mtram_RI) @ For this random intercept model, the exact likelihood is defined as a one-dimensional integral over the unit interval. We use sparse grids \citep{Heiss_Winschel_2008, pkg:SparseGrid} to approximate this integral. The integrand is defined by products of normal probabilities, which are approximated as described by \cite{Matic_Radoicic_2018}. It is important to note that this likelihood can be computed as accurately as necessary whereas alternative implementations rely on approximations of limited accuracy (at least for non-probit links). The results (model parameters and likelihoods) are very close to those obtained by AGQ (\pkg{lme4}) or \pkg{glmmTMB}, indicating a very good quality of the various approximations used. We can also compare the corresponding covariances <>= vcov(toenail_glmer_RI_2) vcov(toenail_mtram_RI)[1:4, 1:4] @ which are also in good agreement. The marginal effects, that is, a marginal binary probit model, are given by the scaled conditional coefficients <>= cf <- coef(toenail_mtram_RI) cf[2:4] / sqrt(1 + cf["gamma1"]^2) @ Such marginal effects can be estimated directly by generalised estimation equations (GEE).
For the probit model, three models corresponding to different working correlations can be estimated for example by package \pkg{geepack}: <>= library("geepack") gin <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, corstr = "independence", family = binomial(link = "probit")) gex <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "exchangeable", family = binomial(link = "probit")) gun <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "unstructured", family = binomial(link = "probit")) @ The effects are not very close to what we obtained earlier, and it seems the choice of the working correlations matters here: <>= cbind(mtram = cf[2:4] / sqrt(1 + cf["gamma1"]^2), indep = coef(gin)[-1], excha = coef(gex)[-1], unstr = coef(gun)[-1]) @ At least in biostatistics, the probit model is less popular than the logit model owing to the better interpretability of the fixed effects as conditional log-odds ratios in the latter. 
Thus, we replicate the SAS analysis reported in Chapter~10 of \cite{Molenberghs_Verbeke_2005} <>= gin <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, corstr = "independence", family = binomial()) gex <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "exchangeable", family = binomial()) gun <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "unstructured", family = binomial()) @ Again, results are dependent on hyperparameters and also not in very good agreement with SAS output reported by \cite{Molenberghs_Verbeke_2005} <>= coef(gin) coef(gex) coef(gun) @ Alternatively, we can use the transformation approach to compute marginally interpretable time-dependent log-odds ratios from random intercept transformation logit models:% \citep[\code{standardise = TRUE} computes model (M2) instead of the %default (M1), see][]{Hothorn_2019_mtram}: <>= m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Logistic", negative = TRUE) toenail_logit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_logit <- mtram(toenail_logit, ~ (1 | patientID), data = toenail) @ It is important to note that this model is \emph{not} a logistic mixed-effects model and thus we can't expect to obtain identical results from \cmd{glmer} as it was (partially) the case for the probit model. 
The marginal log-odds ratios are <>= cf <- coef(toenail_mtram_logit) cf[2:4] / sqrt(1 + cf["gamma1"]^2) @ and an asymptotic confidence interval for the temporal treatment effect can be obtained from a small simulation <>= S <- rmvnorm(10000, mean = coef(toenail_mtram_logit), sigma = vcov(toenail_mtram_logit)) (ci <- quantile(S[,"treatmentterbinafine:time"] / sqrt(1 + S[, "gamma1"]^2), prob = c(.025, .975))) @ The interval indicates a marginally significant treatment effect, that is, an odds ratio for none or mild symptoms of $\Sexpr{round(exp(cf["treatmentterbinafine:time"] / sqrt(1 + cf["gamma1"]^2)), 2)}$ per month, with $95\%$ confidence interval $(\Sexpr{round(exp(ci), 2)})$. A direct comparison of the marginal log-odds ratios with GEE results highlight the discrepancies <>= cbind(mtram = cf[2:4] / sqrt(1 + cf["gamma1"]^2), indep = coef(gin)[-1], excha = coef(gex)[-1], unstr = coef(gun)[-1]) @ Following \cite{Molenberghs_Verbeke_2005}, we use the GEE with unstructured working correlation to compute a confidence interval for the temporal treatment effect on the odds ratio scale <>= exp(coef(gun)["treatmentterbinafine:time"] + c(-1, 1) * qnorm(.975) * sqrt(diag(vcov(gun)))["treatmentterbinafine:time"]) @ In respect of this temporal treatment effect, GEE and marginal transformation models provide similar results, but the ``significance'' of the temporal treatment effect seems to be affected by numerical issues arising when fitting such models to this data. 
From the marginal transformation model, we can compute and plot marginally interpretable probabilities and odds ratios over time <>= tmp <- toenail_logit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_logit))[1] cf <- coef(toenail_mtram_logit)[names(cf)] / sqrt(sdrf^2 + 1) cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf time <- 0:180/10 treatment <- sort(unique(toenail$treatment)) nd <- expand.grid(time = time, treatment = treatment) nd$prob_logit <- predict(tmp, newdata = nd, type = "distribution")[1,] nd$odds <- exp(predict(tmp, newdata = nd, type = "trafo")[1,]) @ We can also sample from the distribution of the maximum likelihood estimators to obtain an idea about the uncertainty (Figure~\ref{fig:toenailOR}). \begin{figure}[t] <>= X <- model.matrix(~ treatment * time, data = nd) rbeta <- rmvnorm(10000, mean = coef(toenail_mtram_logit), sigma = vcov(toenail_mtram_logit)) s <- rbeta[,ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) odds <- exp(-X %*% t(rbeta)) OR <- odds[1:length(time),] / odds[-(1:length(time)),] plot(time, rep(0, length(time)), ylim = range(OR), type = "n", xlab = "Time", ylab = "Odds ratio") colgrey <- rgb(.1, .1, .1, .01) out <- apply(OR, 2, function(x) lines(time, x, col = colgrey)) ORest <- nd$odds[1:length(time)] / nd$odds[-(1:length(time))] lines(time, ORest, col = col[1], lwd = 2) @ \caption{Toe nail data: Marginal odds ratio over time (from a logistic random intercept model). The blue line represents the maximum likelihood estimator, the grey lines are samples from the corresponding distribution. 
\label{fig:toenailOR}} \end{figure} From the logit and probit models, we can also obtain marginally interpretable probabilities as (logit) <>= tmp <- toenail_logit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_logit))[1] cf <- coef(toenail_mtram_logit)[names(cf)] cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf pr <- predict(tmp, newdata = nd, type = "distribution")[1,] nd$prob_logit <- pnorm(qnorm(pr) / sdrf) @ and (probit) <>= tmp <- toenail_probit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_RI))[1] cf <- coef(toenail_mtram_RI)[names(cf)] / sqrt(sdrf^2 + 1) cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf nd$prob_probit <- predict(tmp, newdata = nd, type = "distribution")[1,] @ The marginal time-dependent probabilities obtained from all three models are very similar as shown in Figure~\ref{fig:toenailprob}. \begin{figure}[t] <>= nd2 <- nd[rep(1:nrow(nd), 2),] nd2$prob <- c(nd$prob_probit, nd$prob_logit) lev <- c("Probit", "Logit") nd2$model <- rep(factor(lev, labels = lev, levels = lev), each = nrow(nd)) xyplot(prob ~ time | model, data = nd2, group = treatment, ylim = c(0, 1), xlab = "Time", par.settings = simpleTheme(col = col), auto.key = list(text = levels(nd2$treatment), points = FALSE, lines = TRUE, space = "top"), col = col, type = "l", ylab = "Probability (none or mild)") @ \caption{Toe nail data: Comparison of marginal probabilities obtained from a probit linear mixed-effects model and a logistic transformation model with marginal log-odds ratio treatment effect. % two logistic transformation % models (M2: with or M1: without marginal log-odds ratio treatment % effect). \label{fig:toenailprob}} \end{figure} \subsection{Random Intercept / Random Slope Models} Things get a bit less straightforward when a random slope is added to the model.
We switch back to the probit link allowing comparison of our implementation with other packages. Some implementations do not allow clusters consisting of a single observation, so we remove patients without follow-up <>= (rlev <- levels(toenail$patientID)[xtabs(~ patientID, data = toenail) == 1]) toenail_gr1 <- subset(toenail, !patientID %in% rlev) toenail_gr1$patientID <- toenail_gr1$patientID[, drop = TRUE] @ The two implementations of the Laplace approximation in packages \pkg{lme4} <>= toenail_glmer_RS <- glmer(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit")) summary(toenail_glmer_RS) toenail_glmer_RS@theta @ and \pkg{glmmTMB} <>= toenail_glmmTMB_RS_1 <- glmmTMB(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit")) summary(toenail_glmmTMB_RS_1) @ are in good agreement. The optimisation of the exact discrete likelihood in the transformation framework gives <>= m <- ctm(as.basis(~ outcome, data = toenail_gr1), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail_gr1, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_RS <- mtram(toenail_probit, ~ (1 + time | patientID), data = toenail_gr1) logLik(toenail_mtram_RS) coef(toenail_mtram_RS) @ Here, substantial differences for all parameters can be observed. Because the parameters have the same meaning in all three implementations, we can compare the three models in light of the exact discrete log-likelihood \citep[Equation~6 in][]{Hothorn_2019_mtram} evaluated at these parameters. The results are given in Table~\ref{tab:toenail}. For the random intercept models, AGQ, Laplace, and the discrete log-likelihood give the same results, the Laplace approximation seemed to fail. It was not possible to apply the AGQ approach to the random intercept / random slope model. 
The two implementations of the Laplace approximation in packages \pkg{lme4} and \pkg{glmmTMB} differed for the random intercept model but agreed for the random intercept / random slope model. The log-likelihood obtained by direct maximisation of (7) resulted in the best fitting model with the least extreme parameter estimates. Computing times for all procedures were comparable. \begin{table} \begin{center} %%%% coefs, logLiks, and timings in table <>= t1 <- system.time(toenail_glmer_RI_1 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 1)) t2 <- system.time(toenail_glmer_RI_2 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 20)) t3 <- system.time(toenail_glmmTMB_RI_3 <- glmmTMB(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"))) m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) t4 <- system.time(toenail_mtram_RI <- mtram(toenail_probit, ~ (1 | patientID), data = toenail)) t5 <- system.time(toenail_glmer_RS <- glmer(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit"))) t6 <- system.time(toenail_glmmTMB_RS_1 <- glmmTMB(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit"))) m <- ctm(as.basis(~ outcome, data = toenail_gr1), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail_gr1, fixed = c("outcomemoderate or severe" = 0)) t7 <- system.time(toenail_mtram_RS <- mtram(toenail_probit, ~ (1 + time | patientID), data = toenail_gr1)) ## ----output, echo = FALSE------------------------------------------------ tn_RI_glmer_L <- c(fixef(toenail_glmer_RI_1), 
toenail_glmer_RI_1@theta, 0, 0) tn_RI_glmer_A <- c(fixef(toenail_glmer_RI_2), toenail_glmer_RI_2@theta, 0, 0) tn_RI_glmmTMB <- c(fixef(toenail_glmmTMB_RI_3)$cond, sqrt(VarCorr(toenail_glmmTMB_RI_3)$cond$patientID), 0, 0) tn_RI_mlt <- c(coef(toenail_mtram_RI), 0, 0) tn_RS_glmer <- c(fixef(toenail_glmer_RS), toenail_glmer_RS@theta) tn_RS_glmmTMB <- c(fixef(toenail_glmmTMB_RS_1)$cond, chol(VarCorr(toenail_glmmTMB_RS_1)$cond$patientID)[c(1,3, 4)]) tn_RS_mlt <- coef(toenail_mtram_RS) tn <- cbind(tn_RI_glmer_L, tn_RI_glmer_A , tn_RI_glmmTMB, tn_RI_mlt , tn_RS_glmer, tn_RS_glmmTMB, tn_RS_mlt) logLik(toenail_glmer_RI_1) logLik(toenail_glmer_RI_2) logLik(toenail_glmmTMB_RI_3) logLik(toenail_mtram_RI) logLik(toenail_glmer_RS) logLik(toenail_glmmTMB_RS_1) logLik(toenail_mtram_RS) ll <- c( ### logLik of transformation model for glmer (Laplace) parameters logLik(toenail_mtram_RI, tn_RI_glmer_L[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model for glmer (AGQ) parameters logLik(toenail_mtram_RI, tn_RI_glmer_A[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model for glmmTMB (Laplace) parameters logLik(toenail_mtram_RI, tn_RI_glmmTMB[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model logLik(toenail_mtram_RI), ### logLik of transformation model for glmer (Laplace) parameters logLik(toenail_mtram_RS, tn_RS_glmer * c(-1, rep(1, 6))), ### logLik of transformation model for glmmTMB (Laplace) parameters logLik(toenail_mtram_RS, tn_RS_glmmTMB * c(-1, rep(1, 6))), ### logLik of transformation model logLik(toenail_mtram_RS)) tm <- c(t1["user.self"], t2["user.self"], t3["user.self"], t4["user.self"], t5["user.self"], t6["user.self"], t7["user.self"]) tm <- formatC(tm, format = "f", digits = 2, width = 5) tn <- formatC(tn, format = "f", digits = 2, width = 5) ll <- formatC(ll, format = "f", digits = 2, width = 6) tn <- cbind(c("$\\alpha$", "$\\eshiftparm_1$", "$\\eshiftparm_2$", "$\\eshiftparm_3$", "$\\gamma_1$", "$\\gamma_2$", "$\\gamma_3$"), tn) ret <- c("
\\begin{tabular}{lrrrr|rrr} \\\\ \\hline & \\multicolumn{4}{c|}{RI} & \\multicolumn{3}{c}{RI + RS} \\\\ & \\texttt{glmer} & \\texttt{glmer} & \\texttt{glmmTMB} & & \\texttt{glmer} & \\texttt{glmmTMB} & \\\\ & L & AGQ & L & (7) & L & L & (7) \\\\ \\hline") ret <- c(ret, apply(tn, 1, function(x) c(paste(x, collapse = " & "), "\\\\"))) ret <- c(ret, "\\hline") ret <- c(ret, paste("LogLik &", paste(ll, collapse = "&"), "\\\\ "), paste("Time (sec) &", paste(tm, collapse = "&"), "\\\\ \\hline"), "\\end{tabular}") @ <>= cat(ret, sep = "\n") @ \caption{Toe nail data. Binary probit models featuring fixed intercepts $\alpha$, treatment effects $\eshiftparm_1$, time effects $\eshiftparm_2$, and time-treatment interactions $\eshiftparm_3$ are compared. Random intercept (RI) and random intercept/random slope (RI + RS) models were estimated by the Laplace (L) and Adaptive Gauss-Hermite Quadrature (AGQ) approximations to the likelihood (implemented in packages \pkg{lme4} and \pkg{glmmTMB}). In addition, the exact discrete log-likelihood (7) was used for model fitting and evaluation (the in-sample log-likelihood (7) for all models and timings of all procedures are given in the last two lines). \label{tab:toenail} } \end{center} \end{table} \section{Proportional Odds Models for Bounded Responses} \label{sec:logit} \cite{manuguerra_heller_2010} proposed a mixed-effects model for bounded responses whose fixed effects can be interpreted as log-odds ratios. We fit a transformation model to data from a randomised controlled trial on chronic neck pain treatment \citep{chow_heller_2006}. The data are visualised in Figure~\ref{fig:neck_pain}. Subjective neck pain levels were assessed on a visual analog scale, that is, on a bounded interval. 
\begin{figure}[t] <>= data("neck_pain", package = "ordinalCont") pain_df <- neck_pain idt <- xtabs(~ id, data = pain_df) miss <- names(idt)[idt < 3] pain_df <- subset(pain_df, !id %in% miss) pain_df$id <- factor(pain_df$id) levels(pain_df$laser) <- c("Placebo", "Active") levels(pain_df$time) <- c("Baseline", "7 weeks", "12 weeks") pain <- rbind(subset(pain_df, laser == levels(pain_df$laser)[1]), subset(pain_df, laser == levels(pain_df$laser)[2])) p1 <- xyplot(vas ~ time | laser, data = pain, groups = id, type = "l", col = rgb(.1, .1, .1, .1), lwd = 2, layout = c(2, 1), ylab = "Neck pain (on visual analog scale)", xlab = "Examinations") plot(p1) @ \caption{Neck pain: Trajectories of neck pain assessed on a visual analog scale with and without low-level laser therapy. \label{fig:neck_pain}} \end{figure} \cite{manuguerra_heller_2010} suggested the conditional model \begin{eqnarray*} & & \logit(\Prob(\text{pain} \le \ry \mid \text{treatment}, \text{time}, i)) = \\ & & \quad \h(\ry) + \eshiftparm_\text{Active} + \eshiftparm_\text{7 weeks} + \eshiftparm_\text{12 weeks} + \eshiftparm_\text{7 weeks, Active} + \eshiftparm_\text{12 weeks, Active} + \alpha_i \end{eqnarray*} with random intercepts $\tilde{\alpha}_i$ such that the odds at baseline, for example, are given by \begin{eqnarray*} \frac{\Prob(\text{pain} \le \ry \mid \text{Active}, \text{baseline}, i)} {\Prob(\text{pain} > \ry \mid \text{Active}, \text{baseline}, i)} = \exp(\eshiftparm_\text{Active}) \frac{\Prob(\text{pain} \le \ry \mid \text{Placebo}, \text{baseline}, i)} {\Prob(\text{pain} > \ry \mid \text{Placebo}, \text{baseline}, i)} \end{eqnarray*} <>= library("ordinalCont") @ <>= neck_ocm <- ocm(vas ~ laser * time + (1 | id), data = pain_df, scale = c(0, 1)) @ The results <>= summary(neck_ocm) @ suggest that there is a difference at baseline; the pain distribution of subjects in the placebo group on the odds scale is only $\Sexpr{round(exp(coef(neck_ocm)[2]) * 100, 1)}\%$ of the odds in the active group for 
any cut-off $\ry$: <>= exp(cbind(coef(neck_ocm)[2:6], confint(neck_ocm)[2:6,])) @ In contrast, there seems to be a very large treatment effect (at week 7, the odds in the placebo group is $\Sexpr{round(exp(coef(neck_ocm)[3]), 2)}$ times larger than in the active group). This levels off after 12 weeks, but the effect is still significant at the $5\%$ level. For comparison, we can fit a conditional mixed-effects transformation model with a different parametrisation of the transformation function $\h$ using a Laplace approximation of the likelihood \citep{Tamasi_Crowther_Puhan_2022}: <>= library("tramME") neck_ColrME <- ColrME(vas ~ laser * time + (1 | id), data = pain_df, bounds = c(0, 1), support = c(0, 1)) @ and coefficients <>= exp(coef(neck_ColrME)) @ The model is the same as \code{neck\_ocm}, but the parameter estimates for log-odds ratios differ quite substantially due to an alternative parameterisation of $\h$ and due to different estimation procedures being applied. Our marginally interpretable transformation model with the same transformation function as the model \code{neck\_ColrME} but with a completely different model formulation and optimisation procedure for maximising the log-likelihood, can be estimated by <>= neck_Colr <- Colr(vas ~ laser * time, data = pain_df, bounds = c(0, 1), support = c(0, 1), extrapolate = TRUE) neck_Colrmer <- mtram(neck_Colr, ~ (1 | id), data = pain_df) @ Based on this model, it is possible to derive the marginal distribution functions in the two groups, see Figure~\ref{fig:distr_pain}.
\begin{figure}[t] <>= nd <- expand.grid(laser = unique(pain_df$laser), time = unique(pain_df$time)) tmp <- as.mlt(neck_Colr) coef(tmp)[] <- coef(neck_Colrmer)[1:12] q <- 1:99/100 nd2 <- expand.grid(vas = q, laser = unique(pain_df$laser), time = unique(pain_df$time)) # tmp <- as.mlt(neck_Colr) sd <- sqrt(coef(neck_Colrmer)[13]^2 + 1) prb <- predict(tmp, newdata = nd, type = "distribution", q = q) nd2$prob <- c(pnorm(qnorm(prb) / sd)) p2 <- xyplot(prob ~ vas | time, data = nd2, groups = laser, type = "l", col = col, layout = c(3, 1), xlab = "Neck pain (on visual analog scale)", ylab = "Marginal distribution", par.settings = simpleTheme(col=col), auto.key = list(text = levels(nd2$laser), points = FALSE, lines = TRUE, space = "top")) plot(p2) ## M1 # neck_Colrmer <- mtram(neck_Colr, ~ (1 | id), data = pain_df, # Hessian = TRUE, standardise = FALSE) # logLik(neck_Colrmer) # # nd <- expand.grid(laser = unique(pain_df$laser), # time = unique(pain_df$time)) # q <- 1:99/100 # nd2 <- expand.grid(vas = q, laser = unique(pain_df$laser), # time = unique(pain_df$time)) # tmp <- as.mlt(neck_Colr) # coef(tmp)[] <- coef(neck_Colrmer)[1:12] # sd <- sqrt(coef(neck_Colrmer)[13]^2 + 1) # prb <- predict(tmp, newdata = nd, type = "distribution", q = q) # nd2$prob <- c(pnorm(qnorm(prb) / sd)) # p2 <- xyplot(prob ~ vas | time, data = nd2, groups = laser, type = "l", # col = col, ylim = c(-0.05, 1.05), # layout = c(3, 1), # xlab = "Neck pain (on visual analog scale)", # ylab = "Marginal distribution", # par.settings = simpleTheme(col=col), # auto.key = list(text = levels(nd2$laser), # points = FALSE, lines = TRUE, space = "top")) # plot(p2) @ \caption{Neck pain: Marginal distribution functions of chronic neck pain evaluated at three different time points under placebo or active low-level laser therapy. 
\label{fig:distr_pain}} \end{figure} We sample from the joint normal distribution of the maximum likelihood estimators $\hat{\eparm}_1, \dots, \hat{\eparm}_7$, $\hat{\eshiftparm}_\text{Active}, \hat{\eshiftparm}_\text{7 weeks}, \hat{\eshiftparm}_\text{12 weeks}, \hat{\eshiftparm}_\text{7 weeks, Active}, \hat{\eshiftparm}_\text{12 weeks, Active}, \hat{\alpha}_i$ and compute confidence intervals for the marginal treatment effect after 7 and 12 weeks <>= S <- vcov(neck_Colrmer) rbeta <- rmvnorm(10000, mean = coef(neck_Colrmer), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) t(apply(rbeta[, 8:12], 2, function(x) { quantile(exp(x),prob = c(.025, .5, .975))})) @ Because the model \code{neck\_Colrmer} has a marginal interpretation, we can derive the marginal probabilistic index and corresponding confidence intervals for the three time points as follows. In this case, the marginal probabilistic index obtained from model \code{neck\_Colrmer} is the probability that, for a randomly selected patient in the treatment group, the neck pain score at time $t$ is higher than the score for a subject in the placebo group randomly selected at the same time point. There are two possible ways to compute the marginal probabilistic index.
First, we consider the standardised version of the marginal treatment effects, that is: <>= beta <- coef(neck_Colrmer)[8:12] alpha <- coef(neck_Colrmer)[13] (std_beta <- cbind(beta / sqrt(1 + alpha^2))) @ Then we compute the marginal treatment effect for weeks $0, 7, 12$ by multiplying the shift vector with the following contrast matrix <>= ctr_mat <- matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE) ctr_mat %*% std_beta @ We simulate from the asymptotic distribution of the parameters to obtain an empirical 95\% confidence interval and pass it to the \code{PI} function by specifying the correct link function <>= (ci_emp <- t(apply(ctr_mat %*% t(rbeta[, 8:12]), 1, function(x) { quantile(x, prob = c(.025, .5, .975))}))) PI(-ci_emp, link = "logistic") @ Alternatively, we can compute the probabilistic index by passing a \code{Colr} model to the \code{PI} function. However, we have to make sure that the marginal model has the correct coefficients as obtained by standardising the coefficients from the \code{mtram} model: <>= nd <- expand.grid(time = unique(pain_df$time), laser = unique(pain_df$laser)) neck_Colr_marg <- neck_Colr neck_Colr_marg$coef <- coef(neck_Colrmer)[1:12] / sqrt(coef(neck_Colrmer)[13]^2 + 1) (neck_Colr_PI <- PI(neck_Colr_marg, newdata = nd[1:3, ], reference = nd[4:6, ], one2one = TRUE, conf.level = .95))[1:3, 1:3] @ At baseline, we obtain a probabilistic index of $\Sexpr{round(neck_Colr_PI[1, 1], 2)}$. After 7 weeks, its value is $\Sexpr{round(neck_Colr_PI[2, 1], 2)}$ and after 12 weeks $\Sexpr{round(neck_Colr_PI[3, 1], 2)}$. These values reflect the effect of the low-level laser therapy for patients in the treatment group. Of course, the confidence intervals for the estimates of the probabilistic index differ slightly across the two methods, but the point estimates coincide. 
\section{Marginally Interpretable Weibull and Cox Models} The CAO/ARO/AIO-04 randomised clinical trial \citep{Roedel_Graeven_Fietkau_2015} compared Oxaliplatin added to fluorouracil-based preoperative chemoradiotherapy and postoperative chemotherapy to the same therapy using fluorouracil only for rectal cancer patients. Patients were randomised in the two treatment arms by block randomisation taking the study center, the lymph node involvement (negative, positive), and tumour grading (T1-3 vs.~T4) into account. The primary endpoint was disease-free survival, defined as the time between randomisation and non-radical surgery of the primary tumour (R2 resection), locoregional recurrence after R0/1 resection, metastatic disease or progression, or death from any cause, whichever occurred first. The observed outcomes are a mix of exact dates (time to death or incomplete removal of the primary tumour), right-censoring (end of follow-up or drop-out), and interval-censoring (local or distant metastases). We are interested in a clustered Cox or Weibull model for interval-censored survival times. The survivor functions, estimated separately for each of the four strata defined by lymph node involvement and tumour grading, are given in Figure~\ref{fig:CAO}. 
<>= dir <- system.file("rda", package = "TH.data") load(file.path(dir, "Primary_endpoint_data.rda")) @ \begin{figure}[t] <>= ra <- sort(unique(CAOsurv$randarm)) st <- sort(unique(CAOsurv$strat_t)) sn <- sort(unique(CAOsurv$strat_n)) su <- c(1, 1700) add <- c(0, max(CAOsurv$iDFS[, "time2"]) - su[2]) ylim <- c(-.05, 1.05) tmp <- as.mlt(Coxph(iDFS | 0 + strat_n:strat_t:randarm ~ 1, data = CAOsurv, support = su, add = add, log_first = TRUE)) nd <- expand.grid(strat_n = sn, strat_t = st, randarm = ra) q <- mkgrid(tmp, 100)[[1]] surv <- predict(tmp, newdata = nd, type = "survivor", q = q) nd <- nd[rep(1:nrow(nd), each = nrow(surv)),] nd$time <- q nd$surv <- c(surv) xyplot(surv ~ time | strat_t + strat_n, data = nd, groups = randarm, type = "l", ylim = c(0, 1), col = col, ylab = "Probability", xlab = "Time (in days)", par.settings = simpleTheme(col=col), auto.key = list(text = levels(nd$randarm), points = FALSE, lines = TRUE, space = "top")) @ \caption{Rectal cancer: Distribution of disease-free survival times for two treatments in the four strata defined by lymph node involvement (negative or positive) and tumor grading (T1-3 or T4). \label{fig:CAO}} \end{figure} The implementation of marginally interpretable linear transformation models is currently not able to deal with mixed exact and censored outcomes in the same cluster. We therefore recode exact event times as being interval-censored by adding a 4-day window to each exact event time (variable \code{iDFS2}). 
<>= ### convert "exact" event dates to interval-censoring (+/- two days) tmp <- CAOsurv$iDFS exact <- tmp[,3] == 1 tmp[exact,2] <- tmp[exact,1] + 2 tmp[exact,1] <- pmax(tmp[exact,1] - 2, 0) tmp[exact,3] <- 3 CAOsurv$iDFS2 <- tmp @ We start with the random intercept model \begin{eqnarray*} \Prob(\rY > \ry \mid \text{treatment}) = \exp\left(-\exp\left(\frac{\eparm_1 + \eparm_2 \log(\ry) - \eshiftparm_\text{5-FU + Ox}}{\sqrt{\gamma_1^2 + 1}}\right)\right) \end{eqnarray*} assuming a marginal Weibull model whose effects are scaled depending on the variance $\gamma_1^2$ of a block-specific (interaction of lymph node involvement, tumor grading, and study center) random intercept: <>= CAO_SR <- Survreg(iDFS2 ~ randarm, data = CAOsurv) CAO_SR_mtram <- mtram(CAO_SR, ~ (1 | Block), data = CAOsurv) logLik(CAO_SR_mtram) (cf <- coef(CAO_SR_mtram)) (OR <- exp(-cf["randarm5-FU + Oxaliplatin"] / sqrt(cf["gamma1"]^2 + 1))) @ We are, of course, interested in the marginal treatment effect, that is, the hazards ratio % \begin{eqnarray*} \exp\left(-\eshiftparm_\text{5-FU + Ox} / \sqrt{\gamma_1^2 + 1}\right).
\end{eqnarray*} % We simply sample from the joint normal distribution of the maximum likelihood estimators $\hat{\eparm}_1, \hat{\eparm}_2, \hat{\eshiftparm}_\text{5-FU + Ox}, \hat{\gamma}_1$ and compute confidence intervals for the marginal treatment effect $\Sexpr{round(OR, 2)}$ as <>= S <- vcov(CAO_SR_mtram) # sqrt(diag(S)) rbeta <- rmvnorm(10000, mean = coef(CAO_SR_mtram), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[, -ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(-rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ In a next step, we stratify with respect to lymph node involvement and tumor grading: For each of the four strata, the parameters $\eparm_1$ and $\eparm_2$ are estimated separately: <>= CAO_SR_2 <- Survreg(iDFS2 | 0 + strat_n:strat_t ~ randarm, data = CAOsurv) CAO_SR_2_mtram <- mtram(CAO_SR_2, ~ (1 | Block), data = CAOsurv) logLik(CAO_SR_2_mtram) (cf <- coef(CAO_SR_2_mtram)) (OR_2 <- exp(-cf["randarm5-FU + Oxaliplatin"] / sqrt(cf["gamma1"]^2 + 1))) @ The corresponding confidence interval for the marginal treatment effect is then <>= S <- vcov(CAO_SR_2_mtram) rbeta <- rmvnorm(10000, mean = coef(CAO_SR_2_mtram), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[, -ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(-rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ We now relax the Weibull assumption in the Cox model \begin{eqnarray*} \Prob(\rY > \ry \mid \text{treatment}) = \exp\left(-\exp\left(\frac{\basisy(\log(\ry))^\top \parm + \eshiftparm_\text{5-FU + Ox}}{\sqrt{\gamma_1^2 + 1}}\right)\right) \end{eqnarray*} (note the positive sign of the treatment effect). 
<>= CAO_Cox_2 <- Coxph(iDFS2 | 0 + strat_n:strat_t ~ randarm, data = CAOsurv, support = c(1, 1700), log_first = TRUE, order = 4) logLik(CAO_Cox_2) CAO_Cox_2_mtram <- mtram(CAO_Cox_2, ~ (1 | Block), data = CAOsurv) logLik(CAO_Cox_2_mtram) coef(CAO_Cox_2_mtram) @ with confidence interval <>= S <- vcov(CAO_Cox_2_mtram) rbeta <- rmvnorm(10000, mean = coef(CAO_Cox_2_mtram), sigma = S) s <- rbeta[,ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ For the marginally interpretable models that can be derived from model \code{CAO\_Cox\_2\_mtram} we can compute the probabilistic index. This value means that, over all study centers, a randomly selected patient receiving Oxaliplatin has a $56\%$ probability of staying disease-free longer than a randomly selected patient receiving the standard treatment only, given that they both have the same lymph node state and tumor grading. <>= nd <- CAOsurv[1:2, ] tmp <- CAO_Cox_2 tmp$coef <- coef(CAO_Cox_2_mtram)[-22] / sqrt(coef(CAO_Cox_2_mtram)[22]^2 + 1) (CAO_Cox_PI <- PI(tmp, newdata = nd[2, ], reference = nd[1, ], one2one = TRUE, conf.level = .95))[1, ] @ but we can compute the same manually as follows: <>= ci_man <- quantile(-rbeta[, ncol(rbeta)], prob = c(.025, .5, .975)) (CAO_Cox_PIm <- PI(ci_man, link = "minimum extreme value")) @ We can fit mixed-effects transformation models \citep{tamasi2021tramme,Tamasi_Crowther_Puhan_2022} as follows: <>= CAO_Cox_2_tramME <- CoxphME(iDFS2 | 0 + strat_n:strat_t ~ randarm + (1 | Block), data = CAOsurv, log_first = TRUE) @ From this conditional model, we can obtain the conditional hazard ratio with confidence interval: <>= exp(coef(CAO_Cox_2_tramME)) exp(confint(CAO_Cox_2_tramME, parm = "randarm5-FU + Oxaliplatin", estimate = TRUE)) @ which is similar to the one of the marginally interpretable model.
% For both the mixed-effects transformation model and the marginally interpretable % transformation model, the estimated variance parameter $\gamma_1$ is not very large, % meaning that <>= sqrt(VarCorr(CAO_Cox_2_tramME)$Block$var) coef(CAO_Cox_2_mtram)["gamma1"] @ % Because the estimated variance parameter $\gamma_1$ is not very large, we % would expect to see similar results in a conditional Cox model with normal % frailty term <>= library("coxme") m <- coxme(DFS ~ randarm + (1 | Block), data = CAOsurv) summary(m) sd <- sqrt(diag(vcov(m))) exp(coef(m) + c(-1, 0, 1) * qnorm(.975) * sd) @ \bibliography{mlt,packages} \newpage \appendix \section{Simulations} Empirical results presented in Section 4 of \cite{Hothorn_2019_mtram} can be reproduced using <>= source(system.file("simulations", "mtram_sim.R", package = "tram"), echo = TRUE) @ (this takes quite some time). \section{Change Log} \begin{description} \item[1.2-0] \pkg{tram} utilises novel log-likelihood (\code{mvtnorm::lpRR()}) and score functions (\code{mvtnorm::slpRR()}) for reduced rank problems introduced in \pkg{mvtnorm} 1.3-2. The default optimiser is now \code{auglag::auglag()} with numerically approximated Hessians. Score functions are now available for both continuous and discrete response (but there is little hope that score functions for a mix of continuous and discrete responses will become available). Model fit is now faster and more accurate for models with discrete responses. Some parameter estimates and standard errors changed slightly, however, model interpretation was not affected. 
\end{description}
\newpage
<>=
sessionInfo()
@
<>=
## Build the BibTeX database (packages.bib) of package citations used by
## this vignette; start from a clean file on every build.
if (file.exists("packages.bib")) file.remove("packages.bib")
## Return the installed version string of package 'pkg'; as a side
## effect, pkgbib() appends the package's citation to packages.bib.
pkgversion <- function(pkg) {
    pkgbib(pkg)
    packageDescription(pkg)$Version
}
## Append a BibTeX entry for package 'pkg' to packages.bib, based on the
## package's auto-generated citation.
pkgbib <- function(pkg) {
    ## auto-generated citation object for the package
    x <- citation(package = pkg, auto = TRUE)[[1]]
    b <- toBibtex(x)
    ## LaTeX-escape the umlaut in "Buehlmann"
    b <- gsub("Buehlmann", "B{\\\\\"u}hlmann", b)
    ## deterministic citation key of the form pkg:<name>
    b[1] <- paste("@Manual{pkg:", pkg, ",", sep = "")
    if (is.na(b["url"])) {
        ## no URL field in the citation: put a CRAN URL field in place of
        ## the final line and re-append a closing brace
        b[length(b)] <- paste(" URL = {http://CRAN.R-project.org/package=", pkg, "}", sep = "")
        b <- c(b, "}")
    }
    cat(b, sep = "\n", file = "packages.bib", append = TRUE)
}
## Produce a LaTeX snippet "\pkg{<name>} \citep[version~<v>,][]{pkg:<name>}"
## for package 'pkg' (registering its citation in packages.bib on the way);
## returns NA when the version/citation lookup fails.
pkg <- function(pkg) {
    vrs <- try(pkgversion(pkg))
    if (inherits(vrs, "try-error")) return(NA)
    paste("\\\\pkg{", pkg, "} \\\\citep[version~", vrs, ",][]{pkg:", pkg, "}", sep = "")
}
pkg("mlt")
pkg("tram")
pkg("SparseGrid")
## manually composed BibTeX entry for the mlt.docreg package vignette
cat(c("@Manual{vign:mlt.docreg,",
      " title = {Most Likely Transformations: The mlt Package},",
      " author = {Torsten Hothorn},",
      paste(" year = ", substr(packageDescription("mlt.docreg")$Date, 1, 4), ",", sep = ""),
      paste(" note = {R package vignette version ", packageDescription("mlt.docreg")$Version, "},", sep = ""),
      " url = {https://CRAN.R-project.org/package=mlt.docreg},",
      "}"), file = "packages.bib", append = TRUE, sep = "\n")
@
\end{document}