%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Marginally Interpretable Linear Transformation Models} %\VignetteDepends{variables, basefun, mlt, tram, survival, lme4, gridExtra, lattice, latticeExtra, colorspace, HSAUR3, mvtnorm, ordinalCont, tramME, glmmTMB, geepack} \documentclass[article,nojss,shortnames]{jss} %% packages \usepackage{thumbpdf} \usepackage{amsfonts,amstext,amsmath,amssymb,amsthm} \usepackage{accents} \usepackage{color} \usepackage{rotating} \usepackage{verbatim} \usepackage[utf8]{inputenc} %% need no \usepackage{Sweave.sty} %%\usepackage[nolists]{endfloat} \newcommand{\cmd}[1]{\texttt{#1()}} \usepackage{tikz} \usetikzlibrary{shapes,arrows,chains} \usepackage{verbatim} <>= set.seed(290875) pkgs <- c("colorspace", "survival", "lme4", "tram", "gridExtra", "lattice", "latticeExtra", "mvtnorm", "ordinalCont", "tramME") pkgs <- sapply(pkgs, require, character.only = TRUE) @ \newcommand{\TODO}[1]{{\color{red} #1}} \newcommand\Torsten[1]{{\color{blue}Torsten: ``#1''}} \newcommand{\THcite}[2]{\citeauthor{#2} (\citeyear{#2})} \newcommand\norm[1]{\left\lVert#1\right\rVert} \newcommand{\CTM}{CTM Likelihood Boosting} \newcommand{\STM}{STM Likelihood Boosting} \newcommand{\etc}{\textit{etc.}} \usepackage{booktabs} \newcommand{\expit}{\text{expit}} %%% mlt %% rv \newcommand{\rZ}{Z} \newcommand{\rY}{Y} \newcommand{\rX}{\mX} \newcommand{\rz}{z} \newcommand{\ry}{y} \newcommand{\rx}{\xvec} \newcommand{\ru}{\uvec} \newcommand{\erx}{x} %% sigma algebra \newcommand{\sA}{\mathfrak{A}} \newcommand{\sAZ}{\mathfrak{B}} \newcommand{\sAY}{\mathfrak{C}} \newcommand{\esA}{A} \newcommand{\esAZ}{B} \newcommand{\esAY}{C} %% sample spaces \newcommand{\sam}{\Omega} \newcommand{\samZ}{\RR} \newcommand{\samY}{\Xi} \newcommand{\samX}{\chi} %% measureable spaces \newcommand{\ms}{(\sam, \sA)} \newcommand{\msZ}{(\samZ, \sAZ)} \newcommand{\msY}{(\samY, \sAY)} %% probability spaces \newcommand{\ps}{(\sam, \sA, \Prob)} \newcommand{\psZ}{(\samZ, \sAZ, \Prob_\rZ)} 
\newcommand{\psY}{(\samY, \sAY, \Prob_\rY)} %% distributions \newcommand{\pZ}{F} \newcommand{\pY}{F_\rY} \newcommand{\oY}{O_\rY} \newcommand{\oYx}{O_{\rY \mid \rX = \rx}} \newcommand{\hatpY}{\hat{F}_{\rY,N}} \newcommand{\hatpYx}{\hat{F}_{\rY \mid \rX = \rx, N}} \newcommand{\PYx}{\Prob_{\rY \mid \rX = \rx}} \newcommand{\PYX}{\Prob_{\rY, \rX}} \newcommand{\PY}{\Prob_{\rY}} \newcommand{\pN}{\Phi} \newcommand{\pSL}{F_{\SL}} \newcommand{\pMEV}{F_{\MEV}} \newcommand{\pExp}{F_{\ExpD}} \newcommand{\pSW}{F_{\SW}} \newcommand{\pYx}{F_{\rY \mid \rX = \rx}} \newcommand{\pYA}{F_{\rY \mid \rX = A}} \newcommand{\pYB}{F_{\rY \mid \rX = B}} \newcommand{\qZ}{F^{-1}_\rZ} \newcommand{\qY}{F^{-1}_\rY} \newcommand{\dZ}{f} \newcommand{\dY}{f_\rY} \newcommand{\hatdY}{\hat{f}_{\rY, N}} \newcommand{\dYx}{f_{\rY \mid \rX = \rx}} \newcommand{\hazY}{\lambda_\rY} \newcommand{\HazY}{\Lambda_\rY} \newcommand{\HazYx}{\Lambda_{\rY \mid \rX = \rx}} \newcommand{\hathazY}{\hat{\lambda}_{\rY, N}} \newcommand{\hatHazY}{\hat{\Lambda}_{\rY, N}} %% measures \newcommand{\measureY}{\mu} \newcommand{\lebesgue}{\mu_L} \newcommand{\counting}{\mu_C} %% trafo \newcommand{\g}{g} \newcommand{\h}{h} \newcommand{\s}{\svec} \newcommand{\hY}{h_\rY} \newcommand{\hx}{h_\rx} \newcommand{\hs}{\mathcal{H}} \newcommand{\basisy}{\avec} \newcommand{\bern}[1]{\avec_{\text{Bs},#1}} \newcommand{\bernx}[1]{\bvec_{\text{Bs},#1}} \newcommand{\basisx}{\bvec} \newcommand{\basisyx}{\cvec} \newcommand{\m}{m} \newcommand{\lik}{\mathcal{L}} \newcommand{\parm}{\varthetavec} \newcommand{\eparm}{\vartheta} \newcommand{\dimparm}{P} \newcommand{\dimparmx}{Q} \newcommand{\dimparmvar}{M} \newcommand{\dimparmrand}{R} \newcommand{\shiftparm}{\betavec} \newcommand{\scaleparm}{\xivec} \newcommand{\varparm}{\gammavec} \newcommand{\eshiftparm}{\beta} \newcommand{\escaleparm}{\xi} \newcommand{\ie}{\textit{i.e.}~} \newcommand{\eg}{\textit{e.g.}~} \renewcommand{\Prob}{\mathbb{P}} \newcommand{\Ex}{\mathbb{E}} \newcommand{\RR}{\mathbb{R}} 
\newcommand{\NN}{\mathbb{N}} \newcommand{\eps}{\varepsilon} \newcommand{\prodname}{tensor } \newcommand{\Null}{\mathbf{0}} \newcommand{\FI}{\mF} \usepackage{dsfont} \newcommand{\I}{\mathds{1}} \def \dsP {\text{$\mathds{P}$}} \def \dsE {\text{$\mathds{E}$}} \def \dsR {\text{$\mathds{R}$}} \def \dsN {\text{$\mathds{N}$}} % Math Operators \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\LRT}{LRT} \DeclareMathOperator{\RLRT}{RLRT} \DeclareMathOperator{\Cov}{Cov} \DeclareMathOperator{\Cor}{Cor} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\EW}{\dsE} \DeclareMathOperator{\D}{D} \DeclareMathOperator{\Bias}{Bias} \DeclareMathOperator{\MSE}{MSE} \DeclareMathOperator{\PLS}{PLS} \DeclareMathOperator{\rank}{rank} \DeclareMathOperator{\ncol}{ncol} \DeclareMathOperator{\pen}{pen} \DeclareMathOperator{\const}{const} \DeclareMathOperator{\diag}{diag} \DeclareMathOperator{\blockdiag}{blockdiag} \DeclareMathOperator{\df}{df} \DeclareMathOperator{\trace}{tr} \DeclareMathOperator{\iid}{i.i.d.} \DeclareMathOperator{\ind}{ind.} \DeclareMathOperator{\obs}{obs} \DeclareMathOperator{\acos}{acos} \DeclareMathOperator{\spat}{spat} \DeclareMathOperator{\fix}{{fix}} \DeclareMathOperator{\ran}{{ran}} \DeclareMathOperator*{\argmin}{{arg\,min}} \DeclareMathOperator*{\argmax}{{arg\,max}} \DeclareMathOperator{\BIC}{{BIC}} \DeclareMathOperator{\DIC}{{DIC}} \DeclareMathOperator{\AIC}{{AIC}} \DeclareMathOperator{\mAIC}{{mAIC}} \DeclareMathOperator{\cAIC}{{cAIC}} % Distributions \DeclareMathOperator{\ND}{N} \DeclareMathOperator{\TND}{TN} \DeclareMathOperator{\UD}{U} \DeclareMathOperator{\GaD}{Ga} \DeclareMathOperator{\tD}{t} \DeclareMathOperator{\IGD}{IG} \DeclareMathOperator{\IWD}{IW} \DeclareMathOperator{\PoD}{Po} \DeclareMathOperator{\ExpD}{Exp} \DeclareMathOperator{\LapD}{Lap} \DeclareMathOperator{\MuD}{Mu} \DeclareMathOperator{\DirD}{Dir} \DeclareMathOperator{\PDD}{PD} \DeclareMathOperator{\BeD}{Be} \DeclareMathOperator{\BD}{B} \DeclareMathOperator{\DPD}{DP} 
\DeclareMathOperator{\KSD}{KS} \DeclareMathOperator{\SL}{SL} \DeclareMathOperator{\MEV}{MEV} \DeclareMathOperator{\SW}{SW} \DeclareMathOperator{\Chi1}{\chi^2_1} \DeclareMathOperator{\WD}{W} % Boldface vectors and matrices \def \avec {\text{\boldmath$a$}} \def \mA {\text{\boldmath$A$}} \def \bvec {\text{\boldmath$b$}} \def \mB {\text{\boldmath$B$}} \def \cvec {\text{\boldmath$c$}} \def \mC {\text{\boldmath$C$}} \def \dvec {\text{\boldmath$d$}} \def \mD {\text{\boldmath$D$}} \def \evec {\text{\boldmath$e$}} \def \mE {\text{\boldmath$E$}} \def \fvec {\text{\boldmath$f$}} \def \mF {\text{\boldmath$F$}} \def \gvec {\text{\boldmath$g$}} \def \mG {\text{\boldmath$G$}} \def \hvec {\text{\boldmath$h$}} \def \mH {\text{\boldmath$H$}} \def \ivec {\text{\boldmath$i$}} \def \mI {\text{\boldmath$I$}} \def \jvec {\text{\boldmath$j$}} \def \mJ {\text{\boldmath$J$}} \def \kvec {\text{\boldmath$k$}} \def \mK {\text{\boldmath$K$}} \def \lvec {\text{\boldmath$l$}} \def \mL {\text{\boldmath$L$}} \def \mvec {\text{\boldmath$m$}} \def \mM {\text{\boldmath$M$}} \def \nvec {\text{\boldmath$n$}} \def \mN {\text{\boldmath$N$}} \def \ovec {\text{\boldmath$o$}} \def \mO {\text{\boldmath$O$}} \def \pvec {\text{\boldmath$p$}} \def \mP {\text{\boldmath$P$}} \def \qvec {\text{\boldmath$q$}} \def \mQ {\text{\boldmath$Q$}} \def \rvec {\text{\boldmath$r$}} \def \mR {\text{\boldmath$R$}} \def \svec {\text{\boldmath$s$}} \def \mS {\text{\boldmath$S$}} \def \tvec {\text{\boldmath$t$}} \def \mT {\text{\boldmath$T$}} \def \uvec {\text{\boldmath$u$}} \def \mU {\text{\boldmath$U$}} \def \vvec {\text{\boldmath$v$}} \def \mV {\text{\boldmath$V$}} \def \wvec {\text{\boldmath$w$}} \def \mW {\text{\boldmath$W$}} \def \xvec {\text{\boldmath$x$}} \def \mX {\text{\boldmath$X$}} \def \yvec {\text{\boldmath$y$}} \def \mY {\text{\boldmath$Y$}} \def \zvec {\text{\boldmath$z$}} \def \mZ {\text{\boldmath$Z$}} \def \calA {\mathcal A} \def \calB {\mathcal B} \def \calC {\mathcal C} \def \calD {\mathcal D} \def \calE 
{\mathcal E} \def \calF {\mathcal F} \def \calG {\mathcal G} \def \calH {\mathcal H} \def \calI {\mathcal I} \def \calJ {\mathcal J} \def \calK {\mathcal K} \def \calL {\mathcal L} \def \calM {\mathcal M} \def \calN {\mathcal N} \def \calO {\mathcal O} \def \calP {\mathcal P} \def \calQ {\mathcal Q} \def \calR {\mathcal R} \def \calS {\mathcal S} \def \calT {\mathcal T} \def \calU {\mathcal U} \def \calV {\mathcal V} \def \calW {\mathcal W} \def \calX {\mathcal X} \def \calY {\mathcal Y} \def \calZ {\mathcal Z} \def \ahatvec {\text{\boldmath$\hat a$}} \def \mhatA {\text{\boldmath$\hat A$}} \def \bhatvec {\text{\boldmath$\hat b$}} \def \mhatB {\text{\boldmath$\hat B$}} \def \chatvec {\text{\boldmath$\hat c$}} \def \mhatC {\text{\boldmath$\hat C$}} \def \dhatvec {\text{\boldmath$\hat d$}} \def \mhatD {\text{\boldmath$\hat D$}} \def \ehatvec {\text{\boldmath$\hat e$}} \def \mhatE {\text{\boldmath$\hat E$}} \def \fhatvec {\text{\boldmath$\hat f$}} \def \mhatF {\text{\boldmath$\hat F$}} \def \ghatvec {\text{\boldmath$\hat g$}} \def \mhatG {\text{\boldmath$\hat G$}} \def \hhatvec {\text{\boldmath$\hat h$}} \def \mhatH {\text{\boldmath$\hat H$}} \def \ihatvec {\text{\boldmath$\hat i$}} \def \mhatI {\text{\boldmath$\hat I$}} \def \jhatvec {\text{\boldmath$\hat j$}} \def \mhatJ {\text{\boldmath$\hat J$}} \def \khatvec {\text{\boldmath$\hat k$}} \def \mhatK {\text{\boldmath$\hat K$}} \def \lhatvec {\text{\boldmath$\hat l$}} \def \mhatL {\text{\boldmath$\hat L$}} \def \mhatvec {\text{\boldmath$\hat m$}} \def \mhatM {\text{\boldmath$\hat M$}} \def \nhatvec {\text{\boldmath$\hat n$}} \def \mhatN {\text{\boldmath$\hat N$}} \def \ohatvec {\text{\boldmath$\hat o$}} \def \mhatO {\text{\boldmath$\hat O$}} \def \phatvec {\text{\boldmath$\hat p$}} \def \mhatP {\text{\boldmath$\hat P$}} \def \qhatvec {\text{\boldmath$\hat q$}} \def \mhatQ {\text{\boldmath$\hat Q$}} \def \rhatvec {\text{\boldmath$\hat r$}} \def \mhatR {\text{\boldmath$\hat R$}} \def \shatvec {\text{\boldmath$\hat s$}} 
\def \mhatS {\text{\boldmath$\hat S$}} \def \thatvec {\text{\boldmath$\hat t$}} \def \mhatT {\text{\boldmath$\hat T$}} \def \uhatvec {\text{\boldmath$\hat u$}} \def \mhatU {\text{\boldmath$\hat U$}} \def \vhatvec {\text{\boldmath$\hat v$}} \def \mhatV {\text{\boldmath$\hat V$}} \def \whatvec {\text{\boldmath$\hat w$}} \def \mhatW {\text{\boldmath$\hat W$}} \def \xhatvec {\text{\boldmath$\hat x$}} \def \mhatX {\text{\boldmath$\hat X$}} \def \yhatvec {\text{\boldmath$\hat y$}} \def \mhatY {\text{\boldmath$\hat Y$}} \def \zhatvec {\text{\boldmath$\hat z$}} \def \mhatZ {\text{\boldmath$\hat Z$}} \def \atildevec {\text{\boldmath$\tilde a$}} \def \mtildeA {\text{\boldmath$\tilde A$}} \def \btildevec {\text{\boldmath$\tilde b$}} \def \mtildeB {\text{\boldmath$\tilde B$}} \def \ctildevec {\text{\boldmath$\tilde c$}} \def \mtildeC {\text{\boldmath$\tilde C$}} \def \dtildevec {\text{\boldmath$\tilde d$}} \def \mtildeD {\text{\boldmath$\tilde D$}} \def \etildevec {\text{\boldmath$\tilde e$}} \def \mtildeE {\text{\boldmath$\tilde E$}} \def \ftildevec {\text{\boldmath$\tilde f$}} \def \mtildeF {\text{\boldmath$\tilde F$}} \def \gtildevec {\text{\boldmath$\tilde g$}} \def \mtildeG {\text{\boldmath$\tilde G$}} \def \htildevec {\text{\boldmath$\tilde h$}} \def \mtildeH {\text{\boldmath$\tilde H$}} \def \itildevec {\text{\boldmath$\tilde i$}} \def \mtildeI {\text{\boldmath$\tilde I$}} \def \jtildevec {\text{\boldmath$\tilde j$}} \def \mtildeJ {\text{\boldmath$\tilde J$}} \def \ktildevec {\text{\boldmath$\tilde k$}} \def \mtildeK {\text{\boldmath$\tilde K$}} \def \ltildevec {\text{\boldmath$\tilde l$}} \def \mtildeL {\text{\boldmath$\tilde L$}} \def \mtildevec {\text{\boldmath$\tilde m$}} \def \mtildeM {\text{\boldmath$\tilde M$}} \def \ntildevec {\text{\boldmath$\tilde n$}} \def \mtildeN {\text{\boldmath$\tilde N$}} \def \otildevec {\text{\boldmath$\tilde o$}} \def \mtildeO {\text{\boldmath$\tilde O$}} \def \ptildevec {\text{\boldmath$\tilde p$}} \def \mtildeP 
{\text{\boldmath$\tilde P$}} \def \qtildevec {\text{\boldmath$\tilde q$}} \def \mtildeQ {\text{\boldmath$\tilde Q$}} \def \rtildevec {\text{\boldmath$\tilde r$}} \def \mtildeR {\text{\boldmath$\tilde R$}} \def \stildevec {\text{\boldmath$\tilde s$}} \def \mtildeS {\text{\boldmath$\tilde S$}} \def \ttildevec {\text{\boldmath$\tilde t$}} \def \mtildeT {\text{\boldmath$\tilde T$}} \def \utildevec {\text{\boldmath$\tilde u$}} \def \mtildeU {\text{\boldmath$\tilde U$}} \def \vtildevec {\text{\boldmath$\tilde v$}} \def \mtildeV {\text{\boldmath$\tilde V$}} \def \wtildevec {\text{\boldmath$\tilde w$}} \def \mtildeW {\text{\boldmath$\tilde W$}} \def \xtildevec {\text{\boldmath$\tilde x$}} \def \mtildeX {\text{\boldmath$\tilde X$}} \def \ytildevec {\text{\boldmath$\tilde y$}} \def \mtildeY {\text{\boldmath$\tilde Y$}} \def \ztildevec {\text{\boldmath$\tilde z$}} \def \mtildeZ {\text{\boldmath$\tilde Z$}} \def \alphavec {\text{\boldmath$\alpha$}} \def \betavec {\text{\boldmath$\beta$}} \def \gammavec {\text{\boldmath$\gamma$}} \def \deltavec {\text{\boldmath$\delta$}} \def \epsilonvec {\text{\boldmath$\epsilon$}} \def \varepsilonvec {\text{\boldmath$\varepsilon$}} \def \zetavec {\text{\boldmath$\zeta$}} \def \etavec {\text{\boldmath$\eta$}} \def \thetavec {\text{\boldmath$\theta$}} \def \varthetavec {\text{\boldmath$\vartheta$}} \def \iotavec {\text{\boldmath$\iota$}} \def \kappavec {\text{\boldmath$\kappa$}} \def \lambdavec {\text{\boldmath$\lambda$}} \def \muvec {\text{\boldmath$\mu$}} \def \nuvec {\text{\boldmath$\nu$}} \def \xivec {\text{\boldmath$\xi$}} \def \pivec {\text{\boldmath$\pi$}} \def \varpivec {\text{\boldmath$\varpi$}} \def \rhovec {\text{\boldmath$\rho$}} \def \varrhovec {\text{\boldmath$\varrho$}} \def \sigmavec {\text{\boldmath$\sigma$}} \def \varsigmavec {\text{\boldmath$\varsigma$}} \def \tauvec {\text{\boldmath$\tau$}} \def \upsilonvec {\text{\boldmath$\upsilon$}} \def \phivec {\text{\boldmath$\phi$}} \def \varphivec {\text{\boldmath$\varphi$}} \def 
\psivec {\text{\boldmath$\psi$}} \def \chivec {\text{\boldmath$\chi$}} \def \omegavec {\text{\boldmath$\omega$}} \def \alphahatvec {\text{\boldmath$\hat \alpha$}} \def \betahatvec {\text{\boldmath$\hat \beta$}} \def \gammahatvec {\text{\boldmath$\hat \gamma$}} \def \deltahatvec {\text{\boldmath$\hat \delta$}} \def \epsilonhatvec {\text{\boldmath$\hat \epsilon$}} \def \varepsilonhatvec {\text{\boldmath$\hat \varepsilon$}} \def \zetahatvec {\text{\boldmath$\hat \zeta$}} \def \etahatvec {\text{\boldmath$\hat \eta$}} \def \thetahatvec {\text{\boldmath$\hat \theta$}} \def \varthetahatvec {\text{\boldmath$\hat \vartheta$}} \def \iotahatvec {\text{\boldmath$\hat \iota$}} \def \kappahatvec {\text{\boldmath$\hat \kappa$}} \def \lambdahatvec {\text{\boldmath$\hat \lambda$}} \def \muhatvec {\text{\boldmath$\hat \mu$}} \def \nuhatvec {\text{\boldmath$\hat \nu$}} \def \xihatvec {\text{\boldmath$\hat \xi$}} \def \pihatvec {\text{\boldmath$\hat \pi$}} \def \varpihatvec {\text{\boldmath$\hat \varpi$}} \def \rhohatvec {\text{\boldmath$\hat \rho$}} \def \varrhohatvec {\text{\boldmath$\hat \varrho$}} \def \sigmahatvec {\text{\boldmath$\hat \sigma$}} \def \varsigmahatvec {\text{\boldmath$\hat \varsigma$}} \def \tauhatvec {\text{\boldmath$\hat \tau$}} \def \upsilonhatvec {\text{\boldmath$\hat \upsilon$}} \def \phihatvec {\text{\boldmath$\hat \phi$}} \def \varphihatvec {\text{\boldmath$\hat \varphi$}} \def \psihatvec {\text{\boldmath$\hat \psi$}} \def \chihatvec {\text{\boldmath$\hat \chi$}} \def \omegahatvec {\text{\boldmath$\hat \omega$}} \def \alphatildevec {\text{\boldmath$\tilde \alpha$}} \def \betatildevec {\text{\boldmath$\tilde \beta$}} \def \gammatildevec {\text{\boldmath$\tilde \gamma$}} \def \deltatildevec {\text{\boldmath$\tilde \delta$}} \def \epsilontildevec {\text{\boldmath$\tilde \epsilon$}} \def \varepsilontildevec {\text{\boldmath$\tilde \varepsilon$}} \def \zetatildevec {\text{\boldmath$\tilde \zeta$}} \def \etatildevec {\text{\boldmath$\tilde \eta$}} \def 
\thetatildevec {\text{\boldmath$\tilde \theta$}} \def \varthetatildevec {\text{\boldmath$\tilde \vartheta$}} \def \iotatildevec {\text{\boldmath$\tilde \iota$}} \def \kappatildevec {\text{\boldmath$\tilde \kappa$}} \def \lambdatildevec {\text{\boldmath$\tilde \lambda$}} \def \mutildevec {\text{\boldmath$\tilde \mu$}} \def \nutildevec {\text{\boldmath$\tilde \nu$}} \def \xitildevec {\text{\boldmath$\tilde \xi$}} \def \pitildevec {\text{\boldmath$\tilde \pi$}} \def \varpitildevec {\text{\boldmath$\tilde \varpi$}} \def \rhotildevec {\text{\boldmath$\tilde \rho$}} \def \varrhotildevec {\text{\boldmath$\tilde \varrho$}} \def \sigmatildevec {\text{\boldmath$\tilde \sigma$}} \def \varsigmatildevec {\text{\boldmath$\tilde \varsigma$}} \def \tautildevec {\text{\boldmath$\tilde \tau$}} \def \upsilontildevec {\text{\boldmath$\tilde \upsilon$}} \def \phitildevec {\text{\boldmath$\tilde \phi$}} \def \varphitildevec {\text{\boldmath$\tilde \varphi$}} \def \psitildevec {\text{\boldmath$\tilde \psi$}} \def \chitildevec {\text{\boldmath$\tilde \chi$}} \def \omegatildevec {\text{\boldmath$\tilde \omega$}} \def \mGamma {\mathbf{\Gamma}} \def \mDelta {\mathbf{\Delta}} \def \mTheta {\mathbf{\Theta}} \def \mLambda {\mathbf{\Lambda}} \def \mXi {\mathbf{\Xi}} \def \mPi {\mathbf{\Pi}} \def \mSigma {\mathbf{\Sigma}} \def \mUpsilon {\mathbf{\Upsilon}} \def \mPhi {\mathbf{\Phi}} \def \mPsi {\mathbf{\Psi}} \def \mOmega {\mathbf{\Omega}} \def \mhatGamma {\mathbf{\hat \Gamma}} \def \mhatDelta {\mathbf{\hat \Delta}} \def \mhatTheta {\mathbf{\hat \Theta}} \def \mhatLambda {\mathbf{\hat \Lambda}} \def \mhatXi {\mathbf{\hat \Xi}} \def \mhatPi {\mathbf{\hat \Pi}} \def \mhatSigma {\mathbf{\hat \Sigma}} \def \mhatUpsilon {\mathbf{\hat \Upsilon}} \def \mhatPhi {\mathbf{\hat \Phi}} \def \mhatPsi {\mathbf{\hat \Psi}} \def \mhatOmega {\mathbf{\hat \Omega}} \def \nullvec {\mathbf{0}} \def \onevec {\mathbf{1}} %%% theorems \newtheorem{lem}{Lemma} \newtheorem{thm}{Theorem} \newtheorem{coro}{Corollary} 
\newtheorem{defn}{Definition} \newtheorem{remark}{Remark} \newcommand{\ubar}[1]{\underaccent{\bar}{#1}} \renewcommand{\thefootnote}{} %% code commands \newcommand{\Rclass}[1]{`\code{#1}'} %% JSS \author{Luisa Barbanti \\ Universit\"at Z\"urich \And Torsten Hothorn \\ Universit\"at Z\"urich} \Plainauthor{Barbanti and Hothorn} \title{Some Applications of Marginally Interpretable Linear Transformation Models for Clustered Observations} \Plaintitle{Marginally Interpretable Transformation Models} \Shorttitle{Marginally Interpretable Transformation Models} \Abstract{ Owing to their generality, transformation models can be used to set-up and compute many interesting regression models for discrete and continuous responses. This document focuses on the analysis of clustered observations. Marginal predictive distributions are defined by transformation models and their joint normal distribution depends on a structured covariance matrix. Applications with skewed, bounded, and survival continuous outcomes as well as binary and ordered categorical responses are presented. Data is analysed by a proof-of-concept implementation of parametric linear transformation models for clustered observations available in the \pkg{tram} add-on package to the \proglang{R} system for statistical computing. 
} \Keywords{conditional mixed models, marginal models, marginal predictive distributions, survival analysis, categorical data analysis} \Plainkeywords{conditional mixed models, marginal models, marginal predictive distributions, survival analysis, categorical data analysis} \Address{ Luisa Barbanti, Torsten Hothorn\\ Institut f\"ur Epidemiologie, Biostatistik und Pr\"avention \\ Universit\"at Z\"urich \\ Hirschengraben 84, CH-8001 Z\"urich, Switzerland \\ \texttt{Torsten.Hothorn@R-project.org} } \begin{document} <>= year <- substr(packageDescription("tram")$Date, 1, 4) version <- packageDescription("tram")$Version @ \footnote{Please cite this document as: Luisa Barbanti and Torsten Hothorn (\Sexpr{year}) Some Applications of Marginally Interpretable Linear Transformation Models for Clustered Observations. \textsf{R} package vignette version \Sexpr{version}, URL \url{https://CRAN.R-project.org/package=tram}.} % \input{todo} <>= if (any(!pkgs)) { cat(paste("Package(s)", paste(names(pkgs)[!pkgs], collapse = ", "), "not available, stop processing.", "\\end{document}\n")) knitr::knit_exit() } if (!interactive() && .Platform$OS.type != "unix") { cat("Vignette only compiled under Unix alikes.") knitr::knit_exit() } @ <>= trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7), box.rectangle = list(col=1), box.umbrella = list(lty=1, col=1), strip.background = list(col = "white"))) ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme ltheme$strip.background$col <- "transparent" ## change strip bg lattice.options(default.theme = ltheme) knitr::opts_chunk$set(echo = TRUE, results = 'markup', error = FALSE, warning = FALSE, message = FALSE, tidy = FALSE, cache = FALSE, size = "small", fig.width = 6, fig.height = 4, fig.align = "center", out.width = NULL, ###'.6\\linewidth', out.height = NULL, fig.scap = NA) knitr::render_sweave() # use Sweave environments knitr::set_header(highlight = '') # do not \usepackage{Sweave} ## R settings options(prompt = "R> ", 
continue = "+ ", useFancyQuotes = FALSE) # JSS style options(width = 75) ### ecdf plots myprepanel <- function (x, y, f.value = NULL, ...) { ans <- prepanel.default.qqmath(x, f.value = f.value, distribution = qunif) with(ans, list(xlim = ylim, ylim = c(0, 1), dx = dy, dy = dx)) } mypanel <- function (x, y, f.value = NULL, type = "s", groups = NULL, qtype = 7, ref = TRUE, ...) { if (ref) { reference.line <- trellis.par.get("reference.line") do.call(panel.abline, c(list(h = c(0, 1)), reference.line)) } x <- as.numeric(x) distribution <- qunif nobs <- sum(!is.na(x)) if (!is.null(groups)) { panel.superpose(x, y = NULL, f.value = f.value, type = type, distribution = distribution, qtype = qtype, groups = groups, panel.groups = panel.ecdfplot, ...) } else if (nobs) { if (is.null(f.value)) { panel.xyplot(x = sort(x), y = cumsum(y[order(x)]) / sum(y), type = type, ...) } else { p <- if (is.numeric(f.value)) f.value else f.value(nobs) panel.xyplot(x = quantile(x, p, names = FALSE, type = qtype, na.rm = TRUE), y = distribution(p), type = type, ...) } } } col <- diverge_hcl(2, h = c(246, 40), c = 120, l = c(65, 90), alpha = .75) @ \section{Introduction} The purpose of this document is to compare marginally interpretable linear transformation models for clustered observations \citep{Hothorn_2019_mtram} to conventional conditional formulations of mixed-effects models where such an overlap exists. In addition, novel transformation models going beyond the capabilities of convential mixed-effects models are estimated and interpreted. A proof-of-concept implementation available in package \pkg{tram} \citep{pkg:tram} is applied. % use mtram in package tram The results presented in this document can be reproduced from the \code{mtram} demo <>= install.packages("tram") demo("mtram", package = "tram") @ \section{Normal and Non-normal Mixed-effects Models} First we consider mixed-effects models for reaction times in the sleep deprivation study \citep{Belenky_Wesensten_Thorne_2003}. 
The average reaction times to a specific task over several days of sleep deprivation are given for $i = 1, \dots, N = 18$ subjects in Figure~\ref{fig:sleepstudy}. The data are often used to illustrate conditional normal linear mixed-effects models with correlated random intercepts and slopes. %of the form (\ref{fm:normal}) \begin{figure}[t] <>= library("lme4") xyplot(Reaction ~ Days | Subject, data = sleepstudy, xlab = "Days of sleep deprivation", ylab = "Average reaction time (in ms)") @ \caption{Sleep deprivation: Average reaction times to a specific task over several days of sleep deprivation for $18$ subjects from \cite{Belenky_Wesensten_Thorne_2003}. \label{fig:sleepstudy}} \end{figure} The classical normal linear random-intercept/random-slope model, treating the study participants as independent observations, is fitted by maximum likelihood to the data using the \cmd{lmer} function from the \pkg{lme4} add-on package \citep{pkg:lme4}: % <>= sleep_lmer <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy, REML = FALSE) @ % The corresponding conditional model for subject $i$ reads % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\frac{\ry - \alpha - \beta \text{day} - \alpha_i - \beta_i \text{day}}{\sigma}\right), \quad (\alpha_i, \beta_i) \sim \ND_2(\nullvec, \mG(\varparm)) \end{eqnarray*} % with $\sigma^{-2}\mG = \mLambda(\varparm) \mLambda(\varparm)^\top$ and % \begin{eqnarray*} \mLambda(\varparm) = \left( \begin{array}{cc} \gamma_1 & 0 \\ \gamma_2 & \gamma_3 \end{array} \right), \quad \varparm = (\gamma_1, \gamma_2, \gamma_3)^\top. 
\end{eqnarray*} The same model, however using the alternative parameterisation and an independent (of \pkg{lme4}, only the \cmd{update} method for Cholesky factors is reused) gradient-based maximisation of the log-likelihood, is estimated in a two-step approach as <>= library("tram") @ <>= sleep_LM <- Lm(Reaction ~ Days, data = sleepstudy) sleep_LMmer <- mtram(sleep_LM, ~ (Days | Subject), data = sleepstudy) @ % The first call to \cmd{Lm} computes the equivalent of a normal linear regression model parameterised as a linear transformation model \emph{ignoring} the longitudinal nature of the observations. The purpose if to set-up the necessary model infrastructure (model matrices, inverse link functions, etc.) and to compute reasonable starting values for the fixed effects. The second call to \cmd{mtram} specifies the random effects structure (here a correlated pair of random intercept for subject and random slope for days) and optimises the likelihood for all model parameters $\eparm_1, \tilde{\alpha}, \tilde{\beta}$, and $\varparm$ in the model (here also looking at the conditional model for subject $i$) % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\eparm_1 \ry + \tilde{\alpha} - \tilde{\beta} \text{day} - \tilde{\alpha}_i - \tilde{\beta}_i \text{day}\right), \quad (\tilde{\alpha}_i, \tilde{\beta}_i) \sim \ND_2(\nullvec, \mLambda(\varparm) \mLambda(\varparm)) \end{eqnarray*} % that is, all fixed and random effect parameters are divided by the residual standard deviation $\sigma$ (this is the reparameterisation applied by \cmd{Lm}). Of course, the parameter $\eparm_1$, the inverse residual standard deviation, is ensured to be positive via an additional constraint in the optimiser maximising the log-likelihood. 
% The log-likelihoods of the two models fitted by \cmd{lmer} and \cmd{mtram} are very close <>= logLik(sleep_lmer) logLik(sleep_LMmer) @ Looking at the model coefficients, the two procedures lead to almost identical inverse residual standard deviations <>= (sdinv <- 1 / summary(sleep_lmer)$sigma) coef(sleep_LMmer)["Reaction"] @ and fixed effects (the slope can be interpreted as inverse coefficient of variation) <>= fixef(sleep_lmer) * c(-1, 1) * sdinv coef(sleep_LMmer)[c("(Intercept)", "Days")] @ The random-effect parameters $\varparm$ are also reasonably close <>= sleep_lmer@theta coef(sleep_LMmer)[-(1:3)] @ Consequently, the variance-covariance and correlation matrices <>= sleep_LMmer$G * (1 / sdinv)^2 cov2cor(sleep_LMmer$G * (1 / sdinv)^2) unclass(VarCorr(sleep_lmer))$Subject @ are practically equivalent. This result indicates the correctness of the alternative implementation of normal linear mixed-effects models in the transformation model framework: \cmd{mtram} reuses some infrastructure from \pkg{lme4} and \pkg{Matrix}, most importantly fast update methods for Cholesky factors, but the likelihood and corresponding optimisation relies on an independent implementation. So why are we doing this? Because \cmd{mtram} is able to deal with models or likelihoods not available in \pkg{lme4}, for example the likelihood for interval-censored observations. Let's assume that the timing of the reaction times was less accurate than suggested by the numerical representation of the results. The following code <>= library("survival") sleepstudy$Reaction_I <- with(sleepstudy, Surv(Reaction - 20, Reaction + 20, type = "interval2")) sleepstudy$Reaction_I[1:5] @ converts the outcome to interval-censored values, where each interval has length $40$. 
The above mixed model can now be estimated by maximising the likelihood corresponding to interval-censored observations: <>= sleep_LM_I <- Lm(Reaction_I ~ Days, data = sleepstudy) sleep_LMmer_I <- mtram(sleep_LM_I, ~ (Days | Subject), data = sleepstudy) @ Of course, the log-likelihood changes (because this is a log-probability and not a log-density of a continuous distribution) but the parameter estimates are reasonably close <>= logLik(sleep_LMmer_I) coef(sleep_LMmer_I) coef(sleep_LMmer) @ The next question is if the normal assumption for reaction times is appropriate. In the transformation world, this assumption is simple to assess because we can easily (theoretically and in-silico) switch to the non-normal linear mixed-effects transformation model % \begin{eqnarray*} \Prob(\text{Reaction} \le \ry \mid \text{day}, i) = \Phi\left(\h(\ry) - \tilde{\beta} \text{day} - \tilde{\alpha}_i - \tilde{\beta}_i \text{day}\right), \quad (\tilde{\alpha}_i, \tilde{\beta}_i) \sim \ND_2(\nullvec, \mLambda(\varparm) \mLambda(\varparm)) \end{eqnarray*} % where $\h(\ry) = \basisy(\ry)^\top \parm$ represents a monotone non-decreasing transformation function. The function implementing such a more flexible model in named in honor of the first paper on the analysis of transformed responses by \cite{BoxCox_1964} but it \emph{does not} simply apply what is known as a Box-Cox transformation. Bernstein polynomials $\h(\ry) = \basisy(\ry)^\top \parm$ under suitable constraints \citep{Hothorn_Moest_Buehlmann_2017} are applied instead by <>= sleep_BC <- BoxCox(Reaction ~ Days, data = sleepstudy) sleep_BCmer <- mtram(sleep_BC, ~ (Days | Subject), data = sleepstudy) logLik(sleep_BCmer) @ % The increase in the log-likelihood compared to the normal model is not a big surprise. Plotting the transformation function $\h(\ry) = \basisy(\ry)^\top \parm$ as a function of reaction time can help to assess deviations from normality because the latter assumption implies a linear transformation function. 
Figure~\ref{fig:sleepstudy_trafo} clearly indicates that models allowing a certain skewness of reaction times will provide a better fit to the data. This might also not come as a big surprise to experienced data analysts. \begin{figure}[t] <>= tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] plot(tmp, newdata = data.frame(Days = 0), type = "trafo", col = "black", xlab = "Average reaction time (in ms)", ylab = expression(h(y))) rug(sleepstudy$Reaction, col = rgb(.1, .1, .1, .1)) @ \caption{Sleep deprivation: Data-driven transformation $\hat{\h}$ of average reaction times to sleep deprivation. The non-linearity induces a non-normal marginal distribution function of reaction times. \label{fig:sleepstudy_trafo}} \end{figure} Such probit-type mixed-effects models have been studied before, mostly by merging a Box-Cox power transformation $\h$ with a grid-search over REML estimates \citep{Gurka_Edwards_2006}, a conditional likelihood \citep{Hutmacher_French_2011}, or a grid-search maximising the profile likelihood \citep{Maruo_Yamaguchi_2017}. Recently, \THcite{Tang, Wu, and Chen}{Tang_Wu_Chen_2018} and \THcite{Wu and Wang}{Wu_Wang_2019} proposed a monotone spline parameterisation of $\h$ in a Bayesian context. The model presented here was estimated by simultaneously maximising the log-likelihood \citep{Hothorn_2019_mtram} %(\ref{fm:cll}) with respect to the parameters $\parm$, $\eshiftparm$, and $\varparm$. For a linear Bernstein polynomial of order one, the models obtained with this approach and classical maximum likelihood estimation in normal linear mixed-effects models are equivalent (up to reparameterisation of $\eshiftparm$).% \citep{vign:tram}. However, what does this finding mean in terms of a direct comparison of the model and the data? 
Looking at the marginal cumulative distribution functions of average reaction time conditional on days of sleep deprivation in Figure~\ref{fig:sleepstudy_ecdf} one finds that the non-normal marginal transformation models provided a better fit to the marginal empirical cumulative distribution functions than the normal marginal models. Especially for short reaction times in the first week of sleep deprivation, the orange marginal cumulative distribution is much closer to the empirical cumulative distribution function representing the marginal distribution of reaction times at each single day of study participation. \begin{figure}[t] <>= days <- 0:9 q <- seq(from = min(sleepstudy$Reaction), to = max(sleepstudy$Reaction), length.out = 100) meth <- c("Normal linear mixed-effects model", "Non-normal linear transformation model") ex <- expand.grid(Reaction = q, Days = days, Method = factor(meth, levels = meth, labels = meth)) U <- cbind(1, days) ### Linear: marginal covariance from the normal model's G (sleep_LMmer) tmp <- as.mlt(sleep_LM) cf <- coef(tmp) coef(tmp) <- coef(sleep_LMmer)[names(cf)] SLM <- tcrossprod(U %*% as(sleep_LMmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SLM)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_LM <- pnorm(t(t(h) / sd )) ### BoxCox tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_BCmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_BC <- pnorm(t(t(h) / sd )) ex$prob <- c(prob_LM, prob_BC) plotfun <- function(prob, data, ...) { fm <- as.formula(paste(prob, "~ Reaction | Days")) xyplot(fm, data = data, type = "l", panel = function(x, y, subscripts, ...) { tmp <- subset(sleepstudy, Days == unique(nd[subscripts, "Days"])) mypanel(tmp$Reaction, rep(1, nrow(tmp)), lwd = 3, col = grey) panel.xyplot(x, y, subscripts = subscripts, ...)
}, col = col, xlab = "Average reaction time (in ms)", ylab = "Marginal distribution function", lwd = 2, groups = Method, ...) } grey <- rgb(.75, .75, .75) nd <- ex plotfun("prob", ex, layout = c(5, 2), par.settings = simpleTheme(col=c(grey, col), lwd = 3), auto.key = list(text = c("Empirical cumulative distribution function", levels(nd$Method)), points = FALSE, lines = TRUE, space = "top")) @ \caption{Sleep deprivation: Marginal distribution of reaction times, separately for each day of study participation. The grey step-function corresponds to the empirical cumulative distribution function, the blue line to the marginal cumulative distribution of a normal linear mixed-effects model, and the orange line to a non-normal linear mixed-effects transformation model. \label{fig:sleepstudy_ecdf}} \end{figure} It should be noted that the small positive correlation between random intercept and random slope observed in the normal linear mixed-effects model turned into a negative correlation in this non-normal model <>= cov2cor(sleep_BCmer$G) @ What is the uncertainty associated with this parameter? The correlation is a non-linear function of $\varparm$ and therefore the direct computation of confidence intervals is questionable.
However, we can extract an estimate of the covariance of the estimated model parameters from the model and, relying on the asymptotic normality of the maximum likelihood estimators, we can sample from the asymptotic distribution of the variance of the random intercept $\tilde{\alpha}$, the random slope $\tilde{\beta}$, and their correlation <>= library("mvtnorm") VC <- vcov(sleep_BCmer) idx <- (nrow(VC) - 2):nrow(VC) Rcoef <- rmvnorm(1000, mean = coef(sleep_BCmer), sigma = VC)[,idx] ret <- apply(Rcoef, 1, function(gamma) { L <- matrix(c(gamma[1:2], 0, gamma[3]), nrow = 2) V <- tcrossprod(L) c(diag(V), cov2cor(V)[1,2]) }) @ The $95\%$ confidence intervals <>= ### variance random intercept quantile(ret[1,], c(.025, .5, .975)) ### variance random slope quantile(ret[2,], c(.025, .5, .975)) ### correlation random intercept / random slope quantile(ret[3,], c(.025, .5, .975)) @ indicate rather strong unobserved heterogeneity affecting the intercept and less pronounced variability in the slope. There is only weak information about the correlation of the two random effects contained in the data. The downside of this approach is that, although the model is nicely interpretable on the scale of marginal or conditional distribution functions, the direct interpretation of the fixed effect $\tilde{\beta}$ is not very straightforward because it corresponds to the conditional mean \emph{after} transforming the outcome. This interpretability issue can be addressed by exchanging the probit link for a logit link <>= sleep_C <- Colr(Reaction ~ Days, data = sleepstudy) sleep_Cmer <- mtram(sleep_C, ~ (Days | Subject), data = sleepstudy) logLik(sleep_Cmer) @ Here, the in-sample log-likelihood increases compared to the probit model and the marginal distributions obtained from this model are shown in Figure~\ref{fig:sleepstudy_ecdf-2}. How to interpret models of this type is discussed in Section~\ref{sec:logit}.
\begin{figure}[t] <>= days <- 0:9 q <- seq(from = min(sleepstudy$Reaction), to = max(sleepstudy$Reaction), length.out = 100) meth <- c("Normal linear mixed-effects model", "Probit transformation model", "Marginal logit transformation model") ex <- expand.grid(Reaction = q, Days = days, Method = factor(meth, levels = meth, labels = meth)) U <- cbind(1, days) ### Linear: marginal covariance from the normal model's G (sleep_LMmer) tmp <- as.mlt(sleep_LM) cf <- coef(tmp) coef(tmp) <- coef(sleep_LMmer)[names(cf)] SLM <- tcrossprod(U %*% as(sleep_LMmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SLM)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_LM <- pnorm(t(t(h) / sd )) ### BoxCox tmp <- as.mlt(sleep_BC) cf <- coef(tmp) coef(tmp) <- coef(sleep_BCmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_BCmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_BC <- pnorm(t(t(h) / sd )) ### Colr tmp <- as.mlt(sleep_C) cf <- coef(tmp) coef(tmp) <- coef(sleep_Cmer)[names(cf)] SBC <- tcrossprod(U %*% as(sleep_Cmer$G, "matrix"), U) + diag(nrow(U)) sd <- sqrt(diag(SBC)) h <- predict(tmp, newdata = data.frame(Days = days), q = q, type = "trafo") prob_C <- plogis(t(t(h) / sd )) ex$prob <- c(prob_LM, prob_BC, prob_C) plotfun <- function(prob, data, ...) { fm <- as.formula(paste(prob, "~ Reaction | Days")) xyplot(fm, data = data, type = "l", panel = function(x, y, subscripts, ...) { tmp <- subset(sleepstudy, Days == unique(nd[subscripts, "Days"])) mypanel(tmp$Reaction, rep(1, nrow(tmp)), lwd = 3, col = grey) panel.xyplot(x, y, subscripts = subscripts, ...) }, col = c(col, col[2]), xlab = "Average reaction time (in ms)", ylab = "Marginal distribution function", lwd = 2, groups = Method, lty = c(1, 1, 3), ...)
} grey <- rgb(.75, .75, .75) nd <- ex plotfun("prob", ex, layout = c(5, 2), par.settings = simpleTheme(col=c(grey, col, col[2]), lwd = 3, lty = c(1, 1, 1, 3)), auto.key = list(text = c("Empirical cumulative distribution function", levels(nd$Method)), points = FALSE, lines = TRUE, space = "top")) @ \caption{Sleep deprivation: Marginal distribution of reaction times, separately for each day of study participation. The grey step-function corresponds to the empirical cumulative distribution function, the blue line to the marginal cumulative distribution of a normal linear mixed-effects model, and the orange lines to a non-normal probit (solid) and marginal logit (dotted) transformation model. \label{fig:sleepstudy_ecdf-2}} \end{figure} \section{Models for Binary Outcomes} Here we compare different implementations of binary marginal and mixed models for the notoriously difficult toe nail data \citep{backer_vroey_1998}. The outcome was categorised to two levels \citep[this being probably the root of all troubles, as quasi-separation issues have been reported by][]{Sauter_Held_2016}. A conditional density plot (Figure~\ref{fig:toenail}) suggests an improvement in both treatment groups over time, however with a more rapid advance in patients treated with terbinafine. \begin{figure}[t] <>= data("toenail", package = "HSAUR3") layout(matrix(1:2, ncol = 2)) trt <- levels(toenail$treatment) cdplot(outcome ~ time, data = subset(toenail, treatment == trt[1]), main = trt[1], xlab = "Time", ylab = "Toe nail infection") cdplot(outcome ~ time, data = subset(toenail, treatment == trt[2]), main = trt[2], xlab = "Time", ylab = "") @ \caption{Toe nail data: Conditional density plot of two outcome classes (none or mild vs.~moderate or severe) under two treatments. 
\label{fig:toenail}} \end{figure} \subsection{Random Intercept Probit Models} We are first interested in binary probit models featuring fixed main and interaction effects $\eshiftparm_1$, $\eshiftparm_2$, and $\eshiftparm_3$ of treatment (itraconazole vs.~terbinafine) and time. Subject-specific random intercept models were estimated by the \code{glmer} function from package \pkg{lme4} \citep{pkg:lme4}, by the \code{glmmTMB} function from package \pkg{glmmTMB} \citep{pkg:glmmTMB}, and by direct maximisation of the exact discrete log-likelihood given in Appendix B of \cite{Hothorn_2019_mtram}. % discrete log-likelihood (\ref{fm:dll}) given in Appendix~\ref{app:cens}. The random intercept probit model fitted by Laplace and Adaptive Gauss-Hermite Quadrature (AGQ) approximations to the likelihood give quite different results: <>= ### Laplace toenail_glmer_RI_1 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 1) summary(toenail_glmer_RI_1) toenail_glmer_RI_1@theta ### Adaptive Gaussian Quadrature toenail_glmer_RI_2 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 20) summary(toenail_glmer_RI_2) toenail_glmer_RI_2@theta @ Package \pkg{glmmTMB} optimises the Laplace approximation utilising the Template Model Builder \pkg{TMB} package: <>= library("glmmTMB") toenail_glmmTMB_RI_3 <- glmmTMB(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit")) summary(toenail_glmmTMB_RI_3) @ Surprisingly, this model is very close to the one obtained by AGQ and quite off from the Laplace implementation in \pkg{lme4} (\code{nAGQ = 1} means Laplace). 
Because of the probit link, this binary generalised linear model is equivalent to a linear transformation model and we can thus use the exact likelihood implemented for the latter model in \cmd{mtram} for parameter estimation (it is still a bit nasty to set-up a constant transformation function $\h(\ry) = \alpha$, we plan to add a more convenient interface later) <>= m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_RI <- mtram(toenail_probit, ~ (1 | patientID), data = toenail) coef(toenail_mtram_RI) @ For this random intercept model, the exact likelihood is defined as a one-dimensional integral over the unit interval. We use sparse grids \citep{Heiss_Winschel_2008, pkg:SparseGrid} to approximate this integral. The integrand is defined by products of normal probabilities, which are approximated as described by \cite{Matic_Radoicic_2018}. It is important to note that this likelihood can be computed as accurately as necessary whereas alternative implementations rely on approximations of limited accuracy (at least for non-probit links). The results (model parameters and likelihoods) are very close to those obtained by AGQ (\pkg{lme4}) or \pkg{glmmTMB}, indicating a very good quality of the various approximations used. We can also compare the corresponding covariances <>= vcov(toenail_glmer_RI_2) vcov(toenail_mtram_RI)[1:4, 1:4] @ which are also in good agreement. The marginal effects, that is, a marginal binary probit model, are given by the scaled conditional coefficients <>= cf <- coef(toenail_mtram_RI) cf[2:4] / sqrt(1 + cf["gamma1"]^2) @ Such marginal effects can be estimated directly by generalised estimation equations (GEE).
For the probit model, three models corresponding to different working correlations can be estimated for example by package \pkg{geepack}: <>= library("geepack") gin <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, corstr = "independence", family = binomial(link = "probit")) gex <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "exchangeable", family = binomial(link = "probit")) gun <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "unstructured", family = binomial(link = "probit")) @ The effects are not very close to what we obtained earlier, and it seems the choice of the working correlations matters here: <>= cbind(mtram = cf[2:4] / sqrt(1 + cf["gamma1"]^2), indep = coef(gin)[-1], excha = coef(gex)[-1], unstr = coef(gun)[-1]) @ At least in biostatistics, the probit model is less popular than the logit model owing to the better interpretability of the fixed effects as conditional log-odds ratios in the latter. 
Thus, we replicate the SAS analysis reported in Chapter~10 of \cite{Molenberghs_Verbeke_2005} <>= gin <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, corstr = "independence", family = binomial()) gex <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "exchangeable", family = binomial()) gun <- geeglm(I((0:1)[outcome]) ~ treatment * time, id = patientID, data = toenail, cor = "unstructured", family = binomial()) @ Again, results are dependent on hyperparameters and also not in very good agreement with SAS output reported by \cite{Molenberghs_Verbeke_2005} <>= coef(gin) coef(gex) coef(gun) @ Alternatively, we can use the transformation approach to compute marginally interpretable time-dependent log-odds ratios from random intercept transformation logit models:% \citep[\code{standardise = TRUE} computes model (M2) instead of the %default (M1), see][]{Hothorn_2019_mtram}: <>= m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Logistic", negative = TRUE) toenail_logit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_logit <- mtram(toenail_logit, ~ (1 | patientID), data = toenail) @ It is important to note that this model is \emph{not} a logistic mixed-effects model and thus we can't expect to obtain identical results from \cmd{glmer} as it was (partially) the case for the probit model. 
The marginal log-odds ratios are <>= cf <- coef(toenail_mtram_logit) cf[2:4] / sqrt(1 + cf["gamma1"]^2) @ and an asymptotic confidence interval for the temporal treatment effect can be obtained from a small simulation <>= S <- rmvnorm(10000, mean = coef(toenail_mtram_logit), sigma = vcov(toenail_mtram_logit)) (ci <- quantile(S[,"treatmentterbinafine:time"] / sqrt(1 + S[, "gamma1"]^2), prob = c(.025, .975))) @ The interval indicates a marginally significant treatment effect, that is, an odds ratio for none or mild symptoms of $\Sexpr{round(exp(cf["treatmentterbinafine:time"] / sqrt(1 + cf["gamma1"]^2)), 2)}$ per month, with $95\%$ confidence interval $(\Sexpr{round(exp(ci), 2)})$. A direct comparison of the marginal log-odds ratios with GEE results highlight the discrepancies <>= cbind(mtram = cf[2:4] / sqrt(1 + cf["gamma1"]^2), indep = coef(gin)[-1], excha = coef(gex)[-1], unstr = coef(gun)[-1]) @ Following \cite{Molenberghs_Verbeke_2005}, we use the GEE with unstructured working correlation to compute a confidence interval for the temporal treatment effect on the odds ratio scale <>= exp(coef(gun)["treatmentterbinafine:time"] + c(-1, 1) * qnorm(.975) * sqrt(diag(vcov(gun)))["treatmentterbinafine:time"]) @ In respect of this temporal treatment effect, GEE and marginal transformation models provide similar results, but the ``significance'' of the temporal treatment effect seems to be affected by numerical issues arising when fitting such models to this data. 
From the marginal transformation model, we can compute and plot marginally interpretable probabilities and odds ratios over time <>= tmp <- toenail_logit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_logit))[1] cf <- coef(toenail_mtram_logit)[names(cf)] / sqrt(sdrf^2 + 1) cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf time <- 0:180/10 treatment <- sort(unique(toenail$treatment)) nd <- expand.grid(time = time, treatment = treatment) nd$prob_logit <- predict(tmp, newdata = nd, type = "distribution")[1,] nd$odds <- exp(predict(tmp, newdata = nd, type = "trafo")[1,]) @ We can also sample from the distribution of the maximum likelihood estimators to obtain an idea about the uncertainty (Figure~\ref{fig:toenailOR}). \begin{figure}[t] <>= X <- model.matrix(~ treatment * time, data = nd) rbeta <- rmvnorm(10000, mean = coef(toenail_mtram_logit), sigma = vcov(toenail_mtram_logit)) s <- rbeta[,ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) odds <- exp(-X %*% t(rbeta)) OR <- odds[1:length(time),] / odds[-(1:length(time)),] plot(time, rep(0, length(time)), ylim = range(OR), type = "n", xlab = "Time", ylab = "Odds ratio") colgrey <- rgb(.1, .1, .1, .01) out <- apply(OR, 2, function(x) lines(time, x, col = colgrey)) ORest <- nd$odds[1:length(time)] / nd$odds[-(1:length(time))] lines(time, ORest, col = col[1], lwd = 2) @ \caption{Toe nail data: Marginal odds ratio over time (from a logistic random intercept model). The blue line represents the maximum likelihood estimator, the grey lines are samples from the corresponding distribution. 
\label{fig:toenailOR}} \end{figure} From the logit and probit models, we can also obtain marginally interpretable probabilities as (logit) <>= tmp <- toenail_logit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_logit))[1] cf <- coef(toenail_mtram_logit)[names(cf)] cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf pr <- predict(tmp, newdata = nd, type = "distribution")[1,] nd$prob_logit <- pnorm(qnorm(pr) / sdrf) @ and (probit) <>= tmp <- toenail_probit cf <- coef(tmp) cf <- cf[names(cf) != "outcomemoderate or severe"] sdrf <- rev(coef(toenail_mtram_RI))[1] cf <- coef(toenail_mtram_RI)[names(cf)] / sqrt(sdrf^2 + 1) cf <- c(cf[1], "outcomemoderate or severe" = 0, cf[-1]) coef(tmp) <- cf nd$prob_probit <- predict(tmp, newdata = nd, type = "distribution")[1,] @ The marginal time-dependent probabilities obtained from all three models are very similar as shown in Figure~\ref{fig:toenailprob}. \begin{figure}[t] <>= nd2 <- nd[rep(1:nrow(nd), 2),] nd2$prob <- c(nd$prob_probit, nd$prob_logit) lev <- c("Probit", "Logit") nd2$model <- rep(factor(lev, labels = lev, levels = lev), each = nrow(nd)) xyplot(prob ~ time | model, data = nd2, group = treatment, ylim = c(0, 1), xlab = "Time", par.settings = simpleTheme(col = col), auto.key = list(text = levels(nd2$treatment), points = FALSE, lines = TRUE, space = "top"), col = col, type = "l", ylab = "Probability (none or mild)") @ \caption{Toe nail data: Comparison of marginal probabilities obtained from a probit linear mixed-effects model and a logistic transformation model with marginal log-odds ratio treatment effect. % two logistic transformation % models (M2: with or M1: without marginal log-odds ratio treatment % effect). \label{fig:toenailprob}} \end{figure} \subsection{Random Intercept / Random Slope Models} Things get a bit less straightforward when a random slope is added to the model.
We switch back to the probit link allowing comparison of our implementation with other packages. Some implementations do not allow clusters consisting of a single observation, so we remove patients without follow-up <>= (rlev <- levels(toenail$patientID)[xtabs(~ patientID, data = toenail) == 1]) toenail_gr1 <- subset(toenail, !patientID %in% rlev) toenail_gr1$patientID <- toenail_gr1$patientID[, drop = TRUE] @ The two implementations of the Laplace approximation in packages \pkg{lme4} <>= toenail_glmer_RS <- glmer(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit")) summary(toenail_glmer_RS) toenail_glmer_RS@theta @ and \pkg{glmmTMB} <>= toenail_glmmTMB_RS_1 <- glmmTMB(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit")) summary(toenail_glmmTMB_RS_1) @ are in good agreement. The optimisation of the exact discrete likelihood in the transformation framework gives <>= m <- ctm(as.basis(~ outcome, data = toenail_gr1), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail_gr1, fixed = c("outcomemoderate or severe" = 0)) toenail_mtram_RS <- mtram(toenail_probit, ~ (1 + time | patientID), data = toenail_gr1) logLik(toenail_mtram_RS) coef(toenail_mtram_RS) @ Here, substantial differences for all parameters can be observed. Because the parameters have the same meaning in all three implementations, we can compare the three models in light of the exact discrete log-likelihood \citep[Equation~6 in][]{Hothorn_2019_mtram} evaluated at these parameters. The results are given in Table~\ref{tab:toenail}. For the random intercept models, AGQ, Laplace, and the discrete log-likelihood give the same results, the Laplace approximation seemed to fail. It was not possible to apply the AGQ approach to the random intercept / random slope model. 
The two implementations of the Laplace approximation in packages \pkg{lme4} and \pkg{glmmTMB} differed for the random intercept model but agreed for the random intercept / random slope model. The log-likelihood obtained by direct maximisation of (7) resulted in the best fitting model with the least extreme parameter estimates. Computing times for all procedures were comparable. \begin{table} \begin{center} %%%% coefs, logLiks, and timings in table <>= t1 <- system.time(toenail_glmer_RI_1 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 1)) t2 <- system.time(toenail_glmer_RI_2 <- glmer(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"), nAGQ = 20)) t3 <- system.time(toenail_glmmTMB_RI_3 <- glmmTMB(outcome ~ treatment * time + (1 | patientID), data = toenail, family = binomial(link = "probit"))) m <- ctm(as.basis(~ outcome, data = toenail), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail, fixed = c("outcomemoderate or severe" = 0)) t4 <- system.time(toenail_mtram_RI <- mtram(toenail_probit, ~ (1 | patientID), data = toenail)) t5 <- system.time(toenail_glmer_RS <- glmer(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit"))) t6 <- system.time(toenail_glmmTMB_RS_1 <- glmmTMB(outcome ~ treatment * time + (1 + time | patientID), data = toenail_gr1, family = binomial(link = "probit"))) m <- ctm(as.basis(~ outcome, data = toenail_gr1), shifting = ~ treatment * time, data = toenail, todistr = "Normal", negative = TRUE) toenail_probit <- mlt(m, data = toenail_gr1, fixed = c("outcomemoderate or severe" = 0)) t7 <- system.time(toenail_mtram_RS <- mtram(toenail_probit, ~ (1 + time | patientID), data = toenail_gr1)) ## ----output, echo = FALSE------------------------------------------------ tn_RI_glmer_L <- c(fixef(toenail_glmer_RI_1), 
toenail_glmer_RI_1@theta, 0, 0) tn_RI_glmer_A <- c(fixef(toenail_glmer_RI_2), toenail_glmer_RI_2@theta, 0, 0) tn_RI_glmmTMB <- c(fixef(toenail_glmmTMB_RI_3)$cond, sqrt(VarCorr(toenail_glmmTMB_RI_3)$cond$patientID), 0, 0) tn_RI_mlt <- c(coef(toenail_mtram_RI), 0, 0) tn_RS_glmer <- c(fixef(toenail_glmer_RS), toenail_glmer_RS@theta) tn_RS_glmmTMB <- c(fixef(toenail_glmmTMB_RS_1)$cond, chol(VarCorr(toenail_glmmTMB_RS_1)$cond$patientID)[c(1,3, 4)]) tn_RS_mlt <- coef(toenail_mtram_RS) tn <- cbind(tn_RI_glmer_L, tn_RI_glmer_A , tn_RI_glmmTMB, tn_RI_mlt , tn_RS_glmer, tn_RS_glmmTMB, tn_RS_mlt) logLik(toenail_glmer_RI_1) logLik(toenail_glmer_RI_2) logLik(toenail_glmmTMB_RI_3) logLik(toenail_mtram_RI) logLik(toenail_glmer_RS) logLik(toenail_glmmTMB_RS_1) logLik(toenail_mtram_RS) ll <- c( ### logLik of transformation model for glmer (Laplace) parameters logLik(toenail_mtram_RI, tn_RI_glmer_L[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model for glmer (AGQ) parameters logLik(toenail_mtram_RI, tn_RI_glmer_A[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model for glmmTMB (Laplace) parameters logLik(toenail_mtram_RI, tn_RI_glmmTMB[1:5] * c(-1, 1, 1, 1, 1)), ### logLik of transformation model logLik(toenail_mtram_RI), ### logLik of transformation model for glmer (Laplace) parameters logLik(toenail_mtram_RS, tn_RS_glmer * c(-1, rep(1, 6))), ### logLik of transformation model for glmmTMB (Laplace) parameters logLik(toenail_mtram_RS, tn_RS_glmmTMB * c(-1, rep(1, 6))), ### logLik of transformation model logLik(toenail_mtram_RS)) tm <- c(t1["user.self"], t2["user.self"], t3["user.self"], t4["user.self"], t5["user.self"], t6["user.self"], t7["user.self"]) tm <- formatC(tm, format = "f", digits = 2, width = 5) tn <- formatC(tn, format = "f", digits = 2, width = 5) ll <- formatC(ll, format = "f", digits = 2, width = 6) tn <- cbind(c("$\\alpha$", "$\\eshiftparm_1$", "$\\eshiftparm_2$", "$\\eshiftparm_3$", "$\\gamma_1$", "$\\gamma_2$", "$\\gamma_3$"), tn) ret <- c("
\\begin{tabular}{lrrrr|rrr} \\\\ \\hline & \\multicolumn{4}{c|}{RI} & \\multicolumn{3}{c}{RI + RS} \\\\ & \\texttt{glmer} & \\texttt{glmer} & \\texttt{glmmTMB} & & \\texttt{glmer} & \\texttt{glmmTMB} & \\\\ & L & AGQ & L & (7) & L & L & (7) \\\\ \\hline") ret <- c(ret, apply(tn, 1, function(x) c(paste(x, collapse = " & "), "\\\\"))) ret <- c(ret, "\\hline") ret <- c(ret, paste("LogLik &", paste(ll, collapse = "&"), "\\\\ "), paste("Time (sec) &", paste(tm, collapse = "&"), "\\\\ \\hline"), "\\end{tabular}") @ <>= cat(ret, sep = "\n") @ \caption{Toe nail data. Binary probit models featuring fixed intercepts $\alpha$, treatment effects $\eshiftparm_1$, time effects $\eshiftparm_2$, and time-treatment interactions $\eshiftparm_3$ are compared. Random intercept (RI) and random intercept/random slope (RI + RS) models were estimated by the Laplace (L) and Adaptive Gauss-Hermite Quadrature (AGQ) approximations to the likelihood (implemented in packages \pkg{lme4} and \pkg{glmmTMB}). In addition, the exact discrete log-likelihood (7) was used for model fitting and evaluation (the in-sample log-likelihood (7) for all models and timings of all procedures are given in the last two lines). \label{tab:toenail} } \end{center} \end{table} \section{Proportional Odds Models for Bounded Responses} \label{sec:logit} \cite{manuguerra_heller_2010} proposed a mixed-effects model for bounded responses whose fixed effects can be interpreted as log-odds ratios. We fit a transformation model to data from a randomised controlled trial on chronic neck pain treatment \citep{chow_heller_2006}. The data are visualised in Figure~\ref{fig:neck_pain}. Subjective neck pain levels were assessed on a visual analog scale, that is, on a bounded interval. 
\begin{figure}[t] <>= data("neck_pain", package = "ordinalCont") pain_df <- neck_pain idt <- xtabs(~ id, data = pain_df) miss <- names(idt)[idt < 3] pain_df <- subset(pain_df, !id %in% miss) pain_df$id <- factor(pain_df$id) levels(pain_df$laser) <- c("Placebo", "Active") levels(pain_df$time) <- c("Baseline", "7 weeks", "12 weeks") pain <- rbind(subset(pain_df, laser == levels(pain_df$laser)[1]), subset(pain_df, laser == levels(pain_df$laser)[2])) p1 <- xyplot(vas ~ time | laser, data = pain, groups = id, type = "l", col = rgb(.1, .1, .1, .1), lwd = 2, layout = c(2, 1), ylab = "Neck pain (on visual analog scale)", xlab = "Examinations") plot(p1) @ \caption{Neck pain: Trajectories of neck pain assessed on a visual analog scale with and without low-level laser therapy. \label{fig:neck_pain}} \end{figure} \cite{manuguerra_heller_2010} suggested the conditional model \begin{eqnarray*} & & \logit(\Prob(\text{pain} \le \ry \mid \text{treatment}, \text{time}, i)) = \\ & & \quad \h(\ry) + \eshiftparm_\text{Active} + \eshiftparm_\text{7 weeks} + \eshiftparm_\text{12 weeks} + \eshiftparm_\text{7 weeks, Active} + \eshiftparm_\text{12 weeks, Active} + \alpha_i \end{eqnarray*} with random intercepts $\tilde{\alpha}_i$ such that the odds at baseline, for example, are given by \begin{eqnarray*} \frac{\Prob(\text{pain} \le \ry \mid \text{Active}, \text{baseline}, i)} {\Prob(\text{pain} > \ry \mid \text{Active}, \text{baseline}, i)} = \exp(\eshiftparm_\text{Active}) \frac{\Prob(\text{pain} \le \ry \mid \text{Placebo}, \text{baseline}, i)} {\Prob(\text{pain} > \ry \mid \text{Placebo}, \text{baseline}, i)} \end{eqnarray*} <>= library("ordinalCont") @ <>= neck_ocm <- ocm(vas ~ laser * time + (1 | id), data = pain_df, scale = c(0, 1)) @ The results <>= summary(neck_ocm) @ suggest that there is a difference at baseline; the pain distribution of subjects in the placebo group on the odds scale is only $\Sexpr{round(exp(coef(neck_ocm)[2]) * 100, 1)}\%$ of the odds in the active group for 
any cut-off $\ry$: <>= exp(cbind(coef(neck_ocm)[2:6], confint(neck_ocm)[2:6,])) @ In contrast, there seems to be a very large treatment effect (at week 7, the odds in the placebo group is $\Sexpr{round(exp(coef(neck_ocm)[3]), 2)}$ times larger than in the active group). This levels off after 12 weeks, but the effect is still significant at the $5\%$ level. For comparison, we can fit a conditional mixed-effects transformation model with a different parametrisation of the transformation function $\h$ using a Laplace approximation of the likelihood \citep{Tamasi_Crowther_Puhan_2022}: <>= library("tramME") neck_ColrME <- ColrME(vas ~ laser * time + (1 | id), data = pain_df, bounds = c(0, 1), support = c(0, 1)) @ and coefficients <>= exp(coef(neck_ColrME)) @ The model is the same as \code{neck\_ocm}, but the parameter estimates for log-odds ratios differ quite substantially due to an alternative parameterisation of $\h$ and due to different estimation procedures being applied. Our marginally interpretable transformation model with the same transformation function as the model \code{neck\_ColrME} but with a completely different model formulation and optimisation procedure for maximising the log-likelihood, can be estimated by <>= neck_Colr <- Colr(vas ~ laser * time, data = pain_df, bounds = c(0, 1), support = c(0, 1), extrapolate = TRUE) neck_Colrmer <- mtram(neck_Colr, ~ (1 | id), data = pain_df) @ Based on this model, it is possible to derive the marginal distribution functions in the two groups, see Figure~\ref{fig:distr_pain}.
\begin{figure}[t] <>= nd <- expand.grid(laser = unique(pain_df$laser), time = unique(pain_df$time)) tmp <- as.mlt(neck_Colr) coef(tmp)[] <- coef(neck_Colrmer)[1:12] q <- 1:99/100 nd2 <- expand.grid(vas = q, laser = unique(pain_df$laser), time = unique(pain_df$time)) # tmp <- as.mlt(neck_Colr) sd <- sqrt(coef(neck_Colrmer)[13]^2 + 1) prb <- predict(tmp, newdata = nd, type = "distribution", q = q) nd2$prob <- c(pnorm(qnorm(prb) / sd)) p2 <- xyplot(prob ~ vas | time, data = nd2, groups = laser, type = "l", col = col, layout = c(3, 1), xlab = "Neck pain (on visual analog scale)", ylab = "Marginal distribution", par.settings = simpleTheme(col=col), auto.key = list(text = levels(nd2$laser), points = FALSE, lines = TRUE, space = "top")) plot(p2) ## M1 # neck_Colrmer <- mtram(neck_Colr, ~ (1 | id), data = pain_df, # Hessian = TRUE, standardise = FALSE) # logLik(neck_Colrmer) # # nd <- expand.grid(laser = unique(pain_df$laser), # time = unique(pain_df$time)) # q <- 1:99/100 # nd2 <- expand.grid(vas = q, laser = unique(pain_df$laser), # time = unique(pain_df$time)) # tmp <- as.mlt(neck_Colr) # coef(tmp)[] <- coef(neck_Colrmer)[1:12] # sd <- sqrt(coef(neck_Colrmer)[13]^2 + 1) # prb <- predict(tmp, newdata = nd, type = "distribution", q = q) # nd2$prob <- c(pnorm(qnorm(prb) / sd)) # p2 <- xyplot(prob ~ vas | time, data = nd2, groups = laser, type = "l", # col = col, ylim = c(-0.05, 1.05), # layout = c(3, 1), # xlab = "Neck pain (on visual analog scale)", # ylab = "Marginal distribution", # par.settings = simpleTheme(col=col), # auto.key = list(text = levels(nd2$laser), # points = FALSE, lines = TRUE, space = "top")) # plot(p2) @ \caption{Neck pain: Marginal distribution functions of chronic neck pain evaluated at three different time points under placebo or active low-level laser therapy. 
\label{fig:distr_pain}} \end{figure} We sample from the joint normal distribution of the maximum likelihood estimators $\hat{\eparm}_1, \dots, \hat{\eparm}_7$, $\hat{\eshiftparm}_\text{Active}, \hat{\eshiftparm}_\text{7 weeks}, \hat{\eshiftparm}_\text{12 weeks}, \hat{\eshiftparm}_\text{7 weeks, Active}, \hat{\eshiftparm}_\text{12 weeks, Active}, \hat{\alpha}_i$ and compute confidence intervals for the marginal treatment effect after 7 and 12 weeks <>= S <- vcov(neck_Colrmer) rbeta <- rmvnorm(10000, mean = coef(neck_Colrmer), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) t(apply(rbeta[, 8:12], 2, function(x) { quantile(exp(x),prob = c(.025, .5, .975))})) @ Because the model \code{neck\_Colrmer} has a marginal interpretation, we can derive the marginal probabilistic index and corresponding confidence intervals for the three time points as follows. In this case, the marginal probabilistic index obtained from model \code{neck\_Colrmer} is the probability that, for a randomly selected patient in the treatment group, the neck pain score at time $t$ is higher than the score for a subject in the placebo group randomly selected at the same time point. There are two possible ways to compute the marginal probabilistic index.
First, we consider the standardised version of the marginal treatment effects, that is: <>= beta <- coef(neck_Colrmer)[8:12] alpha <- coef(neck_Colrmer)[13] (std_beta <- cbind(beta / sqrt(1 + alpha^2))) @ Then we compute the marginal treatment effect for weeks $0, 7, 12$ by multiplying the shift vector with the following contrast matrix <>= ctr_mat <- matrix(c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), nrow = 3, byrow = TRUE) ctr_mat %*% std_beta @ We simulate from the asymptotic distribution of the parameters to obtain an empirical 95\% confidence interval and pass it to the \code{PI} function by specifying the correct link function <>= (ci_emp <- t(apply(ctr_mat %*% t(rbeta[, 8:12]), 1, function(x) { quantile(x, prob = c(.025, .5, .975))}))) PI(-ci_emp, link = "logistic") @ Alternatively, we can compute the probabilistic index by passing a \code{Colr} model to the \code{PI} function. However, we have to make sure that the marginal model has the correct coefficients as obtained by standardising the coefficients from the \code{mtram} model: <>= nd <- expand.grid(time = unique(pain_df$time), laser = unique(pain_df$laser)) neck_Colr_marg <- neck_Colr neck_Colr_marg$coef <- coef(neck_Colrmer)[1:12] / sqrt(coef(neck_Colrmer)[13]^2 + 1) (neck_Colr_PI <- PI(neck_Colr_marg, newdata = nd[1:3, ], reference = nd[4:6, ], one2one = TRUE, conf.level = .95))[1:3, 1:3] @ At baseline, we obtain a probabilistic index of $\Sexpr{round(neck_Colr_PI[1, 1], 2)}$. After 7 weeks, its value is $\Sexpr{round(neck_Colr_PI[2, 1], 2)}$ and after 12 weeks $\Sexpr{round(neck_Colr_PI[3, 1], 2)}$. These values reflect the effect of the low-level laser therapy for patients in the treatment group. Of course, the confidence intervals for the estimates of the probabilistic index differ slightly across the two methods, but the point estimates coincide. 
\section{Marginally Interpretable Weibull and Cox Models} The CAO/ARO/AIO-04 randomised clinical trial \citep{Roedel_Graeven_Fietkau_2015} compared Oxaliplatin added to fluorouracil-based preoperative chemoradiotherapy and postoperative chemotherapy to the same therapy using fluorouracil only for rectal cancer patients. Patients were randomised in the two treatment arms by block randomisation taking the study center, the lymph node involvement (negative, positive), and tumour grading (T1-3 vs.~T4) into account. The primary endpoint was disease-free survival, defined as the time between randomisation and non-radical surgery of the primary tumour (R2 resection), locoregional recurrence after R0/1 resection, metastatic disease or progression, or death from any cause, whichever occurred first. The observed outcomes are a mix of exact dates (time to death or incomplete removal of the primary tumour), right-censoring (end of follow-up or drop-out), and interval-censoring (local or distant metastases). We are interested in a clustered Cox or Weibull model for interval-censored survival times. The survivor functions, estimated separately for each of the four strata defined by lymph node involvement and tumour grading, are given in Figure~\ref{fig:CAO}. 
<>= dir <- system.file("rda", package = "TH.data") load(file.path(dir, "Primary_endpoint_data.rda")) @ \begin{figure}[t] <>= ra <- sort(unique(CAOsurv$randarm)) st <- sort(unique(CAOsurv$strat_t)) sn <- sort(unique(CAOsurv$strat_n)) su <- c(1, 1700) add <- c(0, max(CAOsurv$iDFS[, "time2"]) - su[2]) ylim <- c(-.05, 1.05) tmp <- as.mlt(Coxph(iDFS | 0 + strat_n:strat_t:randarm ~ 1, data = CAOsurv, support = su, add = add, log_first = TRUE)) nd <- expand.grid(strat_n = sn, strat_t = st, randarm = ra) q <- mkgrid(tmp, 100)[[1]] surv <- predict(tmp, newdata = nd, type = "survivor", q = q) nd <- nd[rep(1:nrow(nd), each = nrow(surv)),] nd$time <- q nd$surv <- c(surv) xyplot(surv ~ time | strat_t + strat_n, data = nd, groups = randarm, type = "l", ylim = c(0, 1), col = col, ylab = "Probability", xlab = "Time (in days)", par.settings = simpleTheme(col=col), auto.key = list(text = levels(nd$randarm), points = FALSE, lines = TRUE, space = "top")) @ \caption{Rectal cancer: Distribution of disease-free survival times for two treatments in the four strata defined by lymph node involvement (negative or positive) and tumor grading (T1-3 or T4). \label{fig:CAO}} \end{figure} The implementation of marginally interpretable linear transformation models is currently not able to deal with mixed exact and censored outcomes in the same cluster. We therefore recode exact event times as being interval-censored by adding a 4-day window to each exact event time (variable \code{iDFS2}). 
<>= ### convert "exact" event dates to interval-censoring (+/- two days) tmp <- CAOsurv$iDFS exact <- tmp[,3] == 1 tmp[exact,2] <- tmp[exact,1] + 2 tmp[exact,1] <- pmax(tmp[exact,1] - 2, 0) tmp[exact,3] <- 3 CAOsurv$iDFS2 <- tmp @ We start with the random intercept model \begin{eqnarray*} \Prob(\rY > \ry \mid \text{treatment}) = \exp\left(-\exp\left(\frac{\eparm_1 + \eparm_2 \log(\ry) - \eshiftparm_\text{5-FU + Ox}}{\sqrt{\gamma_1^2 + 1}}\right)\right) \end{eqnarray*} assuming a marginal Weibull model whose effects are scaled depending on the variance $\gamma_1^2$ of a block-specific (interaction of lymph node involvement, tumor grading, and study center) random intercept: <>= CAO_SR <- Survreg(iDFS2 ~ randarm, data = CAOsurv) CAO_SR_mtram <- mtram(CAO_SR, ~ (1 | Block), data = CAOsurv) logLik(CAO_SR_mtram) (cf <- coef(CAO_SR_mtram)) (OR <- exp(-cf["randarm5-FU + Oxaliplatin"] / sqrt(cf["gamma1"]^2 + 1))) @ We are, of course, interested in the marginal treatment effect, that is, the hazards ratio % \begin{eqnarray*} \exp\left(-\eshiftparm_\text{5-FU + Ox} / \sqrt{\gamma_1^2 + 1}\right).
\end{eqnarray*} % We simply sample from the joint normal distribution of the maximum likelihood estimators $\hat{\eparm}_1, \hat{\eparm}_2, \hat{\eshiftparm}_\text{5-FU + Ox}, \hat{\gamma}_1$ and compute confidence intervals for the marginal treatment effect $\Sexpr{round(OR, 2)}$ as <>= S <- vcov(CAO_SR_mtram) # sqrt(diag(S)) rbeta <- rmvnorm(10000, mean = coef(CAO_SR_mtram), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[, -ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(-rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ In a next step, we stratify with respect to lymph node involvement and tumor grading: For each of the four strata, the parameters $\eparm_1$ and $\eparm_2$ are estimated separately: <>= CAO_SR_2 <- Survreg(iDFS2 | 0 + strat_n:strat_t ~ randarm, data = CAOsurv) CAO_SR_2_mtram <- mtram(CAO_SR_2, ~ (1 | Block), data = CAOsurv) logLik(CAO_SR_2_mtram) (cf <- coef(CAO_SR_2_mtram)) (OR_2 <- exp(-cf["randarm5-FU + Oxaliplatin"] / sqrt(cf["gamma1"]^2 + 1))) @ The corresponding confidence interval for the marginal treatment effect is then <>= S <- vcov(CAO_SR_2_mtram) rbeta <- rmvnorm(10000, mean = coef(CAO_SR_2_mtram), sigma = S) s <- rbeta[, ncol(rbeta)] rbeta <- rbeta[, -ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(-rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ We now relax the Weibull assumption in the Cox model \begin{eqnarray*} \Prob(\rY > \ry \mid \text{treatment}) = \exp\left(-\exp\left(\frac{\basisy(\log(\ry))^\top \parm + \eshiftparm_\text{5-FU + Ox}}{\sqrt{\gamma_1^2 + 1}}\right)\right) \end{eqnarray*} (note the positive sign of the treatment effect). 
<>= CAO_Cox_2 <- Coxph(iDFS2 | 0 + strat_n:strat_t ~ randarm, data = CAOsurv, support = c(1, 1700), log_first = TRUE, order = 4) logLik(CAO_Cox_2) CAO_Cox_2_mtram <- mtram(CAO_Cox_2, ~ (1 | Block), data = CAOsurv) logLik(CAO_Cox_2_mtram) coef(CAO_Cox_2_mtram) @ with confidence interval <>= S <- vcov(CAO_Cox_2_mtram) rbeta <- rmvnorm(10000, mean = coef(CAO_Cox_2_mtram), sigma = S) s <- rbeta[,ncol(rbeta)] rbeta <- rbeta[,-ncol(rbeta)] / sqrt(s^2 + 1) quantile(exp(rbeta[, ncol(rbeta)]), prob = c(.025, .5, .975)) @ For the marginally interpretable models that can be derived from model \code{CAO\_Cox\_2\_mtram} we can compute the probabilistic index. This value means that, over all study centers, a randomly selected patient receiving Oxaliplatin has a $56\%$ probability of staying disease-free longer than a randomly selected patient receiving the standard treatment only, given that they both have the same lymph node state and tumor grading. <>= nd <- CAOsurv[1:2, ] tmp <- CAO_Cox_2 tmp$coef <- coef(CAO_Cox_2_mtram)[-22] / sqrt(coef(CAO_Cox_2_mtram)[22]^2 + 1) (CAO_Cox_PI <- PI(tmp, newdata = nd[2, ], reference = nd[1, ], one2one = TRUE, conf.level = .95))[1, ] @ but we can compute the same manually as follows: <>= ci_man <- quantile(-rbeta[, ncol(rbeta)], prob = c(.025, .5, .975)) (CAO_Cox_PIm <- PI(ci_man, link = "minimum extreme value")) @ We can fit mixed-effects transformation models \citep{tamasi2021tramme,Tamasi_Crowther_Puhan_2022} as follows: <>= CAO_Cox_2_tramME <- CoxphME(iDFS2 | 0 + strat_n:strat_t ~ randarm + (1 | Block), data = CAOsurv, log_first = TRUE) @ From this conditional model, we can obtain the conditional hazard ratio with confidence interval: <>= exp(coef(CAO_Cox_2_tramME)) exp(confint(CAO_Cox_2_tramME, parm = "randarm5-FU + Oxaliplatin", estimate = TRUE)) @ which is similar to the one of the marginally interpretable model.
% For both the mixed-effects transformation model and the marginally interpretable % transformation model, the estimated variance parameter $\gamma_1$ is not very large, % meaning that <>= sqrt(VarCorr(CAO_Cox_2_tramME)$Block$var) coef(CAO_Cox_2_mtram)["gamma1"] @ % Because the estimated variance parameter $\gamma_1$ is not very large, we % would expect to see similar results in a conditional Cox model with normal % frailty term <>= library("coxme") m <- coxme(DFS ~ randarm + (1 | Block), data = CAOsurv) summary(m) sd <- sqrt(diag(vcov(m))) exp(coef(m) + c(-1, 0, 1) * qnorm(.975) * sd) @ \bibliography{mlt,packages} \newpage \appendix \section{Simulations} Empirical results presented in Section 4 of \cite{Hothorn_2019_mtram} can be reproduced using <>= source(system.file("simulations", "mtram_sim.R", package = "tram"), echo = TRUE) @ (this takes quite some time). \section{Change Log} \begin{description} \item[1.2-0] \pkg{tram} utilises novel log-likelihood (\code{mvtnorm::lpRR()}) and score functions (\code{mvtnorm::slpRR()}) for reduced rank problems introduced in \pkg{mvtnorm} 1.3-2. The default optimiser is now \code{auglag::auglag()} with numerically approximated Hessians. Score functions are now available for both continuous and discrete response (but there is little hope that score functions for a mix of continuous and discrete responses will become available). Model fit is now faster and more accurate for models with discrete responses. Some parameter estimates and standard errors changed slightly, however, model interpretation was not affected. 
\end{description}
\newpage
<>=
sessionInfo()
@
<>=
## Build the BibTeX database (packages.bib) of package citations used by
## this vignette; start from a clean file on every build.
if (file.exists("packages.bib")) file.remove("packages.bib")
## Return the installed version string of package 'pkg'; as a side
## effect, pkgbib() appends the package's citation to packages.bib.
pkgversion <- function(pkg) {
    pkgbib(pkg)
    packageDescription(pkg)$Version
}
## Append a BibTeX entry for package 'pkg' to packages.bib, based on the
## package's auto-generated citation.
pkgbib <- function(pkg) {
    ## auto-generated citation object for the package
    x <- citation(package = pkg, auto = TRUE)[[1]]
    b <- toBibtex(x)
    ## LaTeX-escape the umlaut in "Buehlmann"
    b <- gsub("Buehlmann", "B{\\\\\"u}hlmann", b)
    ## deterministic citation key of the form pkg:<name>
    b[1] <- paste("@Manual{pkg:", pkg, ",", sep = "")
    if (is.na(b["url"])) {
        ## no URL field in the citation: put a CRAN URL field in place of
        ## the final line and re-append a closing brace
        b[length(b)] <- paste(" URL = {http://CRAN.R-project.org/package=", pkg, "}", sep = "")
        b <- c(b, "}")
    }
    cat(b, sep = "\n", file = "packages.bib", append = TRUE)
}
## Produce a LaTeX snippet "\pkg{<name>} \citep[version~<v>,][]{pkg:<name>}"
## for package 'pkg' (registering its citation in packages.bib on the way);
## returns NA when the version/citation lookup fails.
pkg <- function(pkg) {
    vrs <- try(pkgversion(pkg))
    if (inherits(vrs, "try-error")) return(NA)
    paste("\\\\pkg{", pkg, "} \\\\citep[version~", vrs, ",][]{pkg:", pkg, "}", sep = "")
}
pkg("mlt")
pkg("tram")
pkg("SparseGrid")
## manually composed BibTeX entry for the mlt.docreg package vignette
cat(c("@Manual{vign:mlt.docreg,",
      " title = {Most Likely Transformations: The mlt Package},",
      " author = {Torsten Hothorn},",
      paste(" year = ", substr(packageDescription("mlt.docreg")$Date, 1, 4), ",", sep = ""),
      paste(" note = {R package vignette version ", packageDescription("mlt.docreg")$Version, "},", sep = ""),
      " url = {https://CRAN.R-project.org/package=mlt.docreg},",
      "}"), file = "packages.bib", append = TRUE, sep = "\n")
@
\end{document}