\documentclass[nojss]{jss}
\usepackage{amsmath,amsfonts,amsthm,enumerate,bm}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\S}{\mathbb{S}}
\newcommand{\pFq}[2]{{}_{#1}F_{#2}}
\newcommand{\Ies}{I^\vee}
\newcommand{\norm}[1]{\lVert#1\rVert}
\newcommand{\argmax}{\mathop{\mathrm{arg max}}}
\newcommand{\fct}[1]{{\texttt{#1()}\index{#1@\texttt{#1()}}}}
\newenvironment{FIXME}{ \begin{quote}\strut\marginpar{FIXME}}{\end{quote}}
%% \usepackage{Sweave}
\SweaveOpts{eps=false, keep.source=true, width=8, height=4}
\setkeys{Gin}{width=\textwidth}
\author{Kurt Hornik\\WU Wirtschaftsuniversit\"at Wien \And
  Bettina Gr\"un\\WU Wirtschaftsuniversit\"at Wien}
\Plainauthor{Kurt Hornik, Bettina Gr\"un}
\title{\pkg{movMF}: An \proglang{R} Package for Fitting Mixtures of von Mises-Fisher Distributions}
\Plaintitle{movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions}
\Keywords{EM algorithm, finite mixture, hypergeometric function $\pFq{0}{1}$, modified Bessel function ratio, \proglang{R}, von Mises-Fisher distribution}
\Plainkeywords{EM algorithm, finite mixture, hypergeometric function 0F1, modified Bessel function ratio, R, von Mises-Fisher distribution}
\Abstract{
  This article is a (slightly) modified and shortened version of \cite{vMF:Hornik+Gruen:2014b}, published in the \emph{Journal of Statistical Software}.
  Finite mixtures of von Mises-Fisher distributions make it possible to apply model-based clustering methods to data of standardized length, i.e., where all data points lie on the unit sphere. The \proglang{R} package \pkg{movMF} contains functionality to draw samples from finite mixtures of von Mises-Fisher distributions and to fit these models using the expectation-maximization algorithm for maximum likelihood estimation. Special features are the possibility to use sparse matrix representations for the input data, different variants of the expectation-maximization algorithm, different methods for determining the concentration parameters in the M-step, and the possibility to impose constraints on the concentration parameters over the components. In this paper we describe the main fitting function of the package and illustrate its application. We also discuss the resolution of several numerical issues which occur when estimating the concentration parameters and when determining the normalizing constant of the von Mises-Fisher distribution.
}
\Address{
  Kurt Hornik\\
  Institute for Statistics and Mathematics\\
  WU Wirtschaftsuniversit\"at Wien\\
  Welthandelsplatz 1\\
  1020 Wien, Austria\\
  E-mail: \email{Kurt.Hornik@R-project.org}\\
  URL: \url{https://statmath.wu.ac.at/~hornik/}

  Bettina Gr\"un\\
  Institute for Statistics and Mathematics\\
  WU Wirtschaftsuniversit\"at Wien\\
  Welthandelsplatz 1\\
  1020 Wien, Austria\\
  E-mail: \email{Bettina.Gruen@R-project.org}\\
}

<<setup, echo=FALSE, results=hide>>=
set.seed(1234)
options(width=65, prompt = "R> ", continue = "+ ", useFancyQuotes = FALSE)
cache <- TRUE
library("slam")
library("lattice")
library("movMF")
palette(colorspace::heat_hcl(4))
ltheme <- canonical.theme("pdf", FALSE)
for (i in grep("padding", names(ltheme$layout.heights))) {
  ltheme$layout.heights[[i]] <- 0.2
}
for (i in grep("padding", names(ltheme$layout.widths))) {
  ltheme$layout.widths[[i]] <- 0
}
lattice.options(default.theme = ltheme)
@
<<plot-functions, echo=FALSE>>=
rotationMatrix <- function(mu) {
  beta <- asin(mu[1])
  alpha <- atan( - mu[2] / mu[3]) +
    sign(mu[2] * mu[3]) * (mu[3] < 0) * pi
  cosa <- cos(alpha); cosb <- cos(beta)
  sina <- sin(alpha); sinb <- sin(beta)
  matrix(c(cosb, sina * sinb, - cosa * sinb,
           0, cosa, sina,
           sinb, - sina * cosb, cosa * cosb), nrow = 3)
}
getIsolines <- function(d, mu, length = 100) {
  theta <- seq(0, 2*pi, length = length)
  sinphi <- sin(acos(d))
  tcrossprod(cbind(cos(theta) * sinphi, sin(theta) * sinphi, d),
             rotationMatrix(mu))
}
plotGlobe <- function(x, class, pars = NULL, main = "",
                      theta = 0, phi = 0, Q = c(0.5, 0.95), n = 100000,
                      xlim = c(-1, 1), ylim = c(-1, 1), zlim = c(-1, 1), ...) {
  pmat <- persp(0:1, 0:1, matrix(, 2, 2), theta = theta, phi = phi,
                xlim = xlim, ylim = ylim, zlim = zlim, scale = FALSE,
                xlab="x", ylab="y", zlab="z", main = main, ...)
  trans3d <- function(x, y, z) {
    tr <- cbind(x, y, z, 1) %*% pmat
    list(x = tr[, 1]/tr[, 4], y = tr[, 2]/tr[, 4])
  }
  x <- x / row_norms(x)
  x3D <- trans3d(x[,1], x[,2], x[,3])
  theta <- seq(0, 2*pi, length = 41)
  phi <- seq(0, pi/2, length = 21)
  x <- cos(theta) %o% sin(phi)
  y <- sin(theta) %o% sin(phi)
  z <- rep(1, length(theta)) %o% cos(phi)
  for (j in seq(phi)[-1])
    for (i in seq(theta)[-1]) {
      idx <- rbind(c(i-1,j-1), c(i,j-1), c(i,j), c(i-1,j))
      polygon(trans3d(x[idx], y[idx], z[idx]), border = "grey")
    }
  points(x3D$x, x3D$y, col = class, pch = 20)
  if (!is.null(pars)) {
    kappa <- row_norms(pars)
    d <- sapply(kappa, function(K) rmovMF(n, K * c(1, 0, 0))[,1])
    mu <- pars / kappa
    for (i in seq_along(Q)) {
      M <- apply(d, 2, quantile, 1-Q[i])
      isos <- lapply(seq_len(nrow(mu)),
                     function(i) getIsolines(M[i], mu[i,]))
      isostrans <- lapply(isos, function(x) trans3d(x[,1], x[,2], x[,3]))
      lapply(seq_along(isostrans),
             function(j) polygon(isostrans[[j]]$x, isostrans[[j]]$y,
                                 border = j, lwd = 2, lty = i))
    }
    mu3D <- trans3d(mu[,1], mu[,2], mu[,3])
    points(mu3D$x, mu3D$y, pch = 4, lwd = 2, col = seq_len(nrow(mu)))
  }
  invisible(pmat)
}
@
<<useR-data, echo=FALSE>>=
data("useR_2008_abstracts", package = "movMF")
@

\begin{document}
\sloppy
%\VignetteIndexEntry{movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions}
%\VignetteDepends{}
%\VignetteKeywords{EM algorithm, finite mixture, R, von Mises-Fisher distribution}
%\VignettePackage{movMF}

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Introduction}\label{sec:introduction}

Finite mixture models make it possible to cluster observations by assuming that for each component a suitable parametric distribution can be specified and that the mixture distribution is derived by convex combination of the
component distributions. \citet{vMF:McLachlan+Peel:2000} and \citet{vMF:Fruehwirth-Schnatter:2006} present overviews of the estimation of these models in a maximum likelihood as well as a Bayesian setting, together with different applications of finite mixture models. If the support of the data is given by the unit sphere, a natural choice for the component distributions is the von Mises-Fisher (vMF) distribution. The special case where the data is in $\mathbb{R}^2$, i.e., the observations lie on the unit circle, is referred to as the von Mises distribution. The vMF distribution has concentric contour lines similar to the multivariate normal distribution with the variance-covariance matrix being a multiple of the identity matrix. In fact, \citet[][page~173]{vMF:Mardia+Jupp:1999} point out that if $X$ follows a multivariate normal distribution with mean parameter $\mu$ which has length one and variance-covariance matrix $\kappa^{-1}I$, then the conditional distribution of $X$ given that it has length one is a vMF distribution with mean direction parameter~$\mu$ and concentration parameter~$\kappa$.

Finite mixtures of vMF distributions were introduced in \citet{vMF:Banerjee+Dhillon+Ghosh:2005} to cluster data on the unit sphere. They propose to use the expectation-maximization (EM) algorithm for maximum likelihood (ML) estimation and present two variants which they refer to as hard and soft clustering. The areas of application of the presented examples include text mining, where abstracts of scientific journals, news articles and newsgroup messages are categorized, and bioinformatics, using a yeast gene expression data set. In these applications, available data is typically high-dimensional, and the estimation methods employed need to take this into account. For finite mixtures of vMF distributions this is particularly relevant for the determination of the concentration parameters.

\citet{vMF:Tang+Chu+Huang:2009} use finite mixtures of vMF distributions for speaker clustering. The data is pre-processed by fitting a Gaussian mixture model (GMM) to the utterances and stacking the mean vectors of the mixture components to form the GMM mean supervector, which is then used as input for the mixture model that clusters the speakers. They compare the performance of finite mixtures of Gaussian distributions with that of finite mixtures of vMF distributions and conclude that for this application the latter outperform finite mixtures of multivariate normal distributions. Further possible areas of application for finite mixtures of vMF distributions are segmentation in high angular resolution diffusion imaging \citep{vMF:McGraw+Vemuri+Yezierski:2006} and clustering treatment beam directions in external radiation therapy \citep{vMF:Bangert+Hennig+Oelfke:2010}.

This paper is structured as follows. Section~\ref{sec:vmf-distribution} introduces the vMF distribution and shows how to draw samples from this distribution and how to determine the ML estimates of its parameters. The extension to finite mixture models is covered in Section~\ref{sec:finite-mixtures-vmf}, including again the methods for drawing samples and determining ML estimates. The \proglang{R} \citep{vMF:Team:2014} package \pkg{movMF} is presented in Section~\ref{sec:software} by describing the main fitting function \fct{movMF} in detail. The package is available from the Comprehensive \proglang{R} Archive Network at \url{https://CRAN.R-project.org/package=movMF}.
Numerical issues when evaluating the density or determining the ML estimates are discussed in Section~\ref{sec:numerics}. An illustrative application of the package using the abstracts from the talks at ``use\proglang{R}! 2008'', the 3rd international \proglang{R} user conference, in Dortmund, Germany, is given in Section~\ref{sec:appl-user-2008}. The paper concludes with a summary and an outlook on possible extensions.

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{The vMF distribution}\label{sec:vmf-distribution}

A random unit length vector in $\R^d$ has a \emph{von Mises-Fisher} (or Langevin) distribution with parameter~$\theta \in \R^d$ if its density with respect to the uniform distribution on the unit sphere $\S^{d-1} = \{x \in \mathbb{R}^d: \|x\| = 1 \}$ is given by
\begin{displaymath}
  f(x|\theta) = e^{\theta^{\top} x} / \pFq{0}{1}(; d/2; \|\theta\|^2 / 4),
\end{displaymath}
where
\begin{displaymath}
  \pFq{0}{1}(; \nu; z) = \sum_{n=0}^\infty \frac{\Gamma(\nu)}{\Gamma(\nu+n)}\frac{z^n}{n!}
\end{displaymath}
is the confluent hypergeometric limit function and is related to the modified Bessel function of the first kind $I_\nu$ via
\begin{equation}
  \label{eq:0F1-and-I}
  \pFq{0}{1}(; \nu + 1; z^2 / 4) = \frac{I_\nu(z) \Gamma(\nu + 1)}{(z / 2)^\nu}
\end{equation}
\citep[e.g.,][page 168]{vMF:Mardia+Jupp:1999}. The vMF distribution is commonly parametrized as $\theta = \kappa \mu$, where $\kappa = \|\theta\|$ and $\mu \in \S^{d-1}$ are the \emph{concentration} and \emph{mean direction} parameters, respectively (if $\theta \ne 0$, $\mu$ is uniquely determined as $\theta / \|\theta\|$). In what follows, it will be convenient to write
\begin{displaymath}
  C_d(\kappa) = 1 / \pFq{0}{1}(; d/2; \kappa^2 / 4)
\end{displaymath}
so that the density is given by
\begin{displaymath}
  f(x | \theta) = C_d(\|\theta\|) e^{\theta^{\top}x}.
\end{displaymath}

\subsection{Simulating vMF distributions}
\label{sec:simul-vmf-distr}

The following algorithm provides a rejection sampling scheme for drawing a sample from the vMF distribution with modal direction $(0,\ldots,0,1)$ and concentration parameter $\kappa \geq 0$ \citep[see Algorithm VM$^*$ in][]{vMF:Wood:1994}. The extension for simulating from the matrix Bingham vMF distribution is described in \citet{vMF:Hoff:2009} and also available in \proglang{R} through package \pkg{rstiefel} \citep{vMF:Hoff:2012}.
\begin{enumerate}[Step 1.]
\item Calculate $b$ using
  \begin{align*}
    b = \frac{d-1}{2 \kappa + \sqrt{4 \kappa^2 + (d - 1)^2}}.
  \end{align*}
  Note that this calculation of $b$ is algebraically equivalent to the one proposed in \citet{vMF:Wood:1994}, but numerically more stable. Put $x_0 = (1 - b) / (1 + b)$ and $c = \kappa x_0 + (d - 1) \log(1 - x_0^2)$.
\item\label{item:z} Generate $Z \sim \text{Beta}((d-1)/2, (d-1)/2)$ and $U \sim \text{Unif}([0, 1])$ and calculate
  \begin{align*}
    W &= \frac{1 - (1 + b)Z}{1 - (1 - b)Z}.
  \end{align*}
\item If
  \begin{align*}
    \kappa W + (d - 1) \log(1 - x_0 W) - c & < \log(U),
  \end{align*}
  go to Step~\ref{item:z}.
\item Generate a uniform $(d-1)$-dimensional unit vector $V$ and return
  \begin{align*}
    X &= (\sqrt{1 - W^2} V^{\top}, W)^{\top}.
  \end{align*}
\end{enumerate}
The uniform $(d-1)$-dimensional unit vector $V$ can be generated by simulating independent standard normal random variables and normalizing them \citep[see for example][]{vMF:Ulrich:1984}.
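For illustration, the steps above translate directly into \proglang{R}. The following is a minimal sketch of the sampler (the function name \code{rvmf\_pole} is ours and not part of the package; in practice, \fct{rmovMF} should be used):
%
<<wood-sampler, eval=FALSE>>=
## Minimal sketch of the rejection sampler (Algorithm VM* of Wood, 1994)
## for the modal direction (0, ..., 0, 1).
rvmf_pole <- function(n, kappa, d) {
  ## Step 1: set up the constants b, x0 and c.
  b <- (d - 1) / (2 * kappa + sqrt(4 * kappa^2 + (d - 1)^2))
  x0 <- (1 - b) / (1 + b)
  c0 <- kappa * x0 + (d - 1) * log(1 - x0^2)
  X <- matrix(0, n, d)
  for (i in seq_len(n)) {
    ## Steps 2 and 3: propose W, then accept or reject.
    repeat {
      Z <- rbeta(1, (d - 1) / 2, (d - 1) / 2)
      U <- runif(1)
      W <- (1 - (1 + b) * Z) / (1 - (1 - b) * Z)
      if (kappa * W + (d - 1) * log(1 - x0 * W) - c0 >= log(U)) break
    }
    ## Step 4: uniform direction on the unit sphere in R^{d-1}.
    V <- rnorm(d - 1)
    V <- V / sqrt(sum(V^2))
    X[i, ] <- c(sqrt(1 - W^2) * V, W)
  }
  X
}
@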
To get samples from a vMF distribution with arbitrary mean direction parameter $\mu$, $X$ is multiplied from the left with a matrix where the first $(d-1)$ columns consist of orthonormal basis vectors of the subspace orthogonal to $\mu$ and the last column is equal to $\mu$.
\pagebreak

\subsection{Estimating the parameters of the vMF distribution}\label{sec:estim-param-vmf}

Using the common parametrization by $\kappa$ and $\mu$, the log-likelihood of a sample $x_1, \ldots, x_n$ from the vMF distribution is given by
\begin{displaymath}
  n \log(C_d(\kappa)) + \kappa \mu^{\top} r,
\end{displaymath}
where $r = \sum_{i=1}^n x_i$ is the resultant vector (sum) of the $x_i$. The maximum likelihood estimates (MLEs) are obtained by solving the likelihood equations
\begin{displaymath}
  \hat{\mu} = r / \|r\|, \qquad
  - \frac{C_d'(\hat{\kappa})}{C_d(\hat{\kappa})} = \|r\| / n.
\end{displaymath}
Writing $A_d(\kappa) = - C_d'(\kappa) / C_d(\kappa)$ for the logarithmic derivative of $1 / C_d(\kappa)$ and $\rho = \|r\| / n$ for the average resultant length, the equation for the MLE of $\kappa$ becomes
\begin{displaymath}
  A_d(\hat{\kappa}) = \rho.
\end{displaymath}
Using recursions for the modified Bessel function \citep[e.g.,][page 71]{vMF:Watson:1995}, one can establish that
\begin{displaymath}
  A_d(\kappa) = - \frac{C_d'(\kappa)}{C_d(\kappa)} = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)}.
\end{displaymath}
It can be shown \citep[see for example][]{vMF:Schou:1978} that $A_d$ is a strictly increasing function which maps the interval $[0, \infty)$ onto the interval $[0, 1)$ and satisfies the Riccati equation $A_d'(\kappa) = 1 - A_d(\kappa)^2 - \frac{d-1}{\kappa} A_d(\kappa)$. As $A_d$ and hence its derivatives can efficiently be computed using continued fractions (see Section~\ref{sec:numerics} for details), the equation $A_d(\hat{\kappa}) = \rho$ can efficiently be solved by standard iterative techniques, provided that good starting approximations are employed.

\citet{vMF:Dhillon+Sra:2003} and subsequently \citet{vMF:Banerjee+Dhillon+Ghosh:2005} suggest the approximation
\begin{equation}
  \label{eq:kappa_hat_approx}
  \hat{\kappa} \approx \frac{\rho ( d - \rho^2)}{1 - \rho^2}
\end{equation}
obtained by truncating the Gauss continued fraction representation of $A_d$ and adding a correction term ``determined empirically''. The former reference also suggests using this approximation as the starting point of a Newton-Raphson iteration, using the above expression for $A_d'$. \citet{vMF:Tanabe+Fukumizu+Oba:2007} show that
\begin{displaymath}
  \hat{\kappa} = \frac{\rho (d - c)}{1 - \rho^2}
\end{displaymath}
for some suitable $0 \le c \le 2$. The approximation in Equation~\ref{eq:kappa_hat_approx} corresponds to $c \approx \rho^2$. They also suggest determining $\hat{\kappa}$ via the fixed-point iteration
\begin{displaymath}
  \kappa_{t+1} = \kappa_t \rho / A_d(\kappa_t)
\end{displaymath}
with a starting value in the solution range, e.g., using $c = 1$ or $c = \rho^2$. \citet{vMF:Sra:2012} introduces a ``truncated Newton approximation'' based on performing exactly two Newton iterations
\begin{displaymath}
  \kappa_{t+1} = \kappa_t - \frac{A_d(\kappa_t) - \rho}{A_d'(\kappa_t)}
\end{displaymath}
with $\kappa_0$ determined via the $c = \rho^2$ approximation.
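In \proglang{R}, this scheme can be sketched in a few lines, assuming $d$ and $\kappa$ are moderate so that $A_d$ may be computed naively as a ratio of exponentially scaled Bessel functions (the function names are ours; Section~\ref{sec:numerics} discusses the numerically stable alternatives used in the package):
%
<<kappa-newton, eval=FALSE>>=
## A_d as a ratio of exponentially scaled Bessel functions (the scaling
## factors cancel); only adequate for moderate d and kappa.
A <- function(kappa, d)
  besselI(kappa, d / 2, expon.scaled = TRUE) /
    besselI(kappa, d / 2 - 1, expon.scaled = TRUE)
## Two Newton steps for A_d(kappa) = rho, started at the approximation
## of Banerjee et al. (2005), with A_d' from the Riccati equation.
kappa_newton <- function(rho, d, iter = 2) {
  kappa <- rho * (d - rho^2) / (1 - rho^2)
  for (t in seq_len(iter)) {
    a <- A(kappa, d)
    kappa <- kappa - (a - rho) / (1 - a^2 - (d - 1) / kappa * a)
  }
  kappa
}
@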
\citet{vMF:Song+Liu+Wang:2012} suggest using a ``truncated Halley approximation'' based on performing exactly two Halley iterations
\begin{displaymath}
  \kappa_{t+1} = \kappa_t - \frac{2 (A_d(\kappa_t) - \rho)A_d'(\kappa_t)}
  {2 A_d'(\kappa_t)^2 - (A_d(\kappa_t) - \rho)A_d''(\kappa_t)}
\end{displaymath}
with $\kappa_0$ determined via the $c = \rho^2$ approximation and using that the second derivative can be given as a function of $A_d(\kappa_t)$, i.e.,
\begin{displaymath}
  A_d''(\kappa_t) = 2 A_d(\kappa_t)^3 + \frac{3(d - 1)}{\kappa_t} A_d(\kappa_t)^2 + \frac{d^2 - d - 2 \kappa_t^2}{\kappa_t^2} A_d(\kappa_t) - \frac{d - 1}{\kappa_t}.
\end{displaymath}

The results in \citet{vMF:Hornik+Gruen:2014a} yield the substantially improved bounds
\begin{equation}
  \label{eq:bounds-for-kappa-hat}
  \max \left( F_{d/2-1,d/2+1}(\rho), F_{(d-1)/2,\sqrt{d^2-1}/4}(\rho) \right) \le \hat{\kappa} \le F_{(d-1)/2,(d+1)/2}(\rho),
\end{equation}
valid for $0 \le \rho < 1$, where
\begin{displaymath}
  F_{\alpha,\beta}(\rho) = \frac{\rho}{1 - \rho^2} \left( \alpha + \sqrt{\rho^2 \alpha^2 + (1 - \rho^2) \beta^2} \right).
\end{displaymath}
The difference between the upper and lower bound is at most $3\rho / 2$ for all $0 \le \rho < 1$, and the difference between the lower bound and $\hat{\kappa}$ tends to 0 as $\rho \to 1-$. Convex combinations of the lower and the upper bounds can be employed as starting values for the above iteration schemes (which require a single starting point). In addition, as these bounds actually give an interval known to contain the unique root $\hat{\kappa}$ of the function $\kappa \mapsto A_d(\kappa) - \rho$, one can use them as starting values for root finding methods which iteratively refine intervals containing the solution, such as simple bisection (as provided by \fct{uniroot} in \proglang{R}), hybrid algorithms combining derivative-based (Newton or Halley) and bisection steps \citep[e.g.,][page~366]{vMF:Press+Teukolsky+Vetterling:2002}, or the Newton-Fourier method \citep[e.g.,][pages~62--64]{vMF:Atkinson:1989}. One can show that $A_d$ is concave \citep[e.g., using Theorem~11 in][ which establishes that $A_d = R_{d/2-1}$ is the pointwise minimum of concave functions, and hence concave]{vMF:Hornik+Gruen:2013}: hence, employing the above bounds and a variant of the Newton-Fourier method for strictly increasing concave functions yields a quadratically convergent iterative scheme for determining $\hat{\kappa}$.

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-data}

To illustrate the use of the vMF distribution to model data on the sphere, we use the \code{household} data set from package \pkg{HSAUR3} \citep{vMF:Everitt+Hothorn:2018}. The data are part of a data set collected from a survey on household expenditures and give the expenses of 20 single men and 20 single women on four commodity groups. In the following we will focus only on three of those commodity groups (housing, food and service) to have 3-dimensional data which is easier to visualize.
%
<<household, echo=FALSE>>=
data("household", package = "HSAUR3")
x <- as.matrix(household[, c(1:2, 4)])
gender <- household$gender
theta <- rbind(female = movMF(x[gender == "female", ], k = 1)$theta,
               male = movMF(x[gender == "male", ], k = 1)$theta)
set.seed(2008)
vMFs <- lapply(1:5, function(K)
    movMF(x, k = K, control = list(nruns = 20)))
@
%
<<household-stats, echo=FALSE>>=
kappa <- row_norms(theta)
tab2 <- table(max.col(vMFs[[2]]$P), household$gender)
mc2 <- min(c(sum(diag(tab2)), sum(tab2) - sum(diag(tab2))))
tab3 <- table(max.col(vMFs[[3]]$P), household$gender)
@
%
The data points are projected onto the sphere by normalizing them to have length one. Thus, in the following analysis we are interested in finding groups of households which lie in a similar direction, i.e., where the angle between the observations is small, and we are not interested in differences in their total absolute expenses. The data points on the sphere are visualized in Figure~\ref{fig:household-example} on the top left. Using the gender information, vMF distributions are fitted to the male and the female observations separately. The fitted distributions are visualized together with the mean direction indicated by a cross and with confidence circles of probability 50\% (solid lines) and 95\% (dashed lines). Clearly, the females have a smaller dispersion, as indicated by their estimated $\kappa$ of \Sexpr{round(kappa[1],digits=1)}, compared to the males' $\kappa$ of \Sexpr{round(kappa[2],digits=1)}.

\begin{figure}[t!]
\centering
<<household-globes, fig=TRUE, echo=FALSE, width=8, height=8>>=
par(mar = 0.1 + c(0, 0.5, 2, 0), mfrow = c(2, 2))
plotGlobe(x, household$gender, main = "Data",
          theta = -30, phi = 10, zlim = c(0, 1))
plotGlobe(x, household$gender, theta, main = "Known group membership",
          theta = -30, phi = 10, zlim = c(0, 1))
mu <- lapply(vMFs, function(x) x$theta / row_norms(x$theta))
ordering <- lapply(mu, function(x) order(x[,1], decreasing = TRUE))
class <- alpha <- kappa <- theta <- vector("list", length(ordering))
for (i in seq_along(ordering)) {
  alpha[[i]] <- vMFs[[i]]$alpha[ordering[[i]]]
  mu[[i]] <- mu[[i]][ordering[[i]],,drop=FALSE]
  theta[[i]] <- vMFs[[i]]$theta[ordering[[i]],,drop=FALSE]
  kappa[[i]] <- row_norms(vMFs[[i]]$theta[ordering[[i]],,drop=FALSE])
  class[[i]] <- max.col(vMFs[[i]]$P[,ordering[[i]]])
}
for (k in 2:3) {
  plotGlobe(x, class[[k]], theta[[k]],
            main = paste("Mixtures of vMFs with K =", k),
            theta = -30, phi = 10, zlim = c(0, 1))
}
@
\setkeys{Gin}{width=0.8\textwidth}
\caption{Household expenses data with gender indicated by color after projection to the sphere at the top left, estimated vMF distributions with confidence circles for each gender group separately at the top right and the estimated mixtures of vMF distributions with $K = 2$ and $K = 3$ with confidence circles at the bottom.}
\label{fig:household-example}
\end{figure}

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Finite mixtures of vMF distributions}\label{sec:finite-mixtures-vmf}

The mixture model with $K$ components is given by
\begin{align*}
  h(x | \Theta) & = \sum_{k = 1}^K \alpha_k f(x | \theta_k),
\end{align*}
where $h(\cdot | \cdot)$ is the mixture density, $\Theta$ the vector with all $\alpha$ and $\theta$ parameters, and $f(x | \theta_k)$ the density of the vMF distribution with parameter $\theta_k$. Furthermore, the component weights $\alpha_k$ are positive for all $k$ and sum to one.
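In package \pkg{movMF}, \fct{dmovMF} evaluates this mixture density and \fct{rmovMF} draws from it (both are described in Section~\ref{sec:software}). A small sketch of their use, with illustrative parameter values:
%
<<mixture-density, eval=FALSE>>=
## A two-component mixture on the sphere in R^3: the rows of theta are
## theta_k = kappa_k * mu_k, alpha contains the component weights.
theta <- rbind(c(4, 0, 0), c(0, 8, 0))
alpha <- c(0.3, 0.7)
x <- rbind(c(1, 0, 0), c(0, 1, 0))   # unit-length evaluation points
dmovMF(x, theta, alpha)              # h(x | Theta) for each row of x
@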
\subsection{Simulating mixtures of vMF distributions}

This is straightforwardly achieved by first sampling class ids $z \in \{1, \ldots, K\}$ with the mixture class probabilities $\alpha_1, \ldots, \alpha_K$, and then sampling the data from the respective vMF distributions with parameter $\theta_k$.

\subsection{Estimating the parameters of mixtures of vMF distributions}\label{sec:estim-param-mixt}

EM algorithms for ML estimation of the parameters of mixtures of vMF distributions are given in \citet{vMF:Dhillon+Sra:2003} and \citet{vMF:Banerjee+Dhillon+Ghosh:2005}. The EM algorithm exploits the fact that the complete-data log-likelihood, in which the component memberships of the observations are known, is easier to maximize than the observed-data log-likelihood. The EM algorithm for fitting mixtures of vMF distributions consists of the following steps:
\begin{enumerate}
\item Initialization: use either of the following two options:
  \begin{enumerate}[(a)]
  \item Assign values to $\alpha_k$ and $\theta_k$ for $k=1,\ldots,K$, where $\alpha_k > 0$ and $\sum_{k=1}^K \alpha_k = 1$ and $\theta_k \neq \theta_l$ for all $k \neq l$ and $k,l=1,\ldots,K$. Start the EM algorithm with an E-step.
  \item Assign (probabilities of) component memberships to each of the $n$ observations. E.g., the output from spherical $k$-means \citep{vMF:Hornik+Feinerer+Kober:2012} can be used. Start the EM algorithm with an M-step.
  \end{enumerate}
\item Repeat the following steps until the maximum number of iterations is reached or the convergence criterion is met.
  \begin{description}
  \item[E-step:] Because the complete-data log-likelihood is linear in the missing data which correspond to the component memberships, the E-step consists only of calculating the a-posteriori probabilities, the probabilities of belonging to a component conditional on the observed values, using
    \begin{align*}
      p(k|x_i, \Theta) &\propto \alpha_k f(x_i | \theta_k).
    \end{align*}
  \item[M-step:] Maximize the expected complete-data log-likelihood by determining separately for each $k$:
    \begin{align*}
      \hat{\alpha}_k &= \frac{1}{n} \sum_{i=1}^n p(k|x_i, \Theta),\\
      \hat{\mu}_k &= \frac{\sum_{i=1}^n p(k|x_i, \Theta) x_i}{\|\sum_{i=1}^n p(k|x_i, \Theta) x_i\|},&
      - \frac{C_d'(\hat{\kappa}_k)}{C_d(\hat{\kappa}_k)} &= \frac{\|\sum_{i=1}^n p(k|x_i, \Theta) x_i\|}{\sum_{i=1}^n p(k|x_i, \Theta)}.
    \end{align*}
    $\hat{\kappa}_k$ can be determined via the approximation of Equation~\ref{eq:kappa_hat_approx}, or one of the improved methods discussed in Section~\ref{sec:estim-param-vmf}.
  \item[Convergence check:] Assess convergence by checking (either or both)
    \begin{enumerate}[(a)]
    \item if the relative absolute change in the log-likelihood values is smaller than a threshold $\epsilon_1$;
    \item if the relative absolute change in parameters is smaller than a threshold $\epsilon_2$.
    \end{enumerate}
    If converged, stop the algorithm.
  \end{description}
\end{enumerate}
This corresponds to the soft-movMF algorithm on page 1357 in \citet{vMF:Banerjee+Dhillon+Ghosh:2005}. In addition, they propose the hard-movMF algorithm on page 1358. The algorithm above can be modified to the hard-movMF algorithm by adding a hardening step between E- and M-step:
\begin{description}
\item[H-step:] Replace the a-posteriori probabilities by assigning each observation with probability 1 to one of the components where its a-posteriori probability is maximum.
Assuming the maximum is unique, this corresponds to
  \begin{align*}
    p(k|x_i, \Theta) &= \left\{\begin{array}{rl}
        1,&\text{if } k = \argmax_h p(h|x_i, \Theta),\\
        0,&\text{otherwise}.
      \end{array}
    \right.
  \end{align*}
  If the maximum is not unique, the assignment is made at random with equal probability to one of the $k \in \argmax_h p(h|x_i, \Theta)$, i.e., ties are broken at random.
\end{description}
This algorithm is also referred to as the classification EM algorithm in the literature \citep{vMF:Celeux+Govaert:1992}. A further variant of the EM algorithm, also considered for example in \cite{vMF:Celeux+Govaert:1992}, is the stochastic EM, where instead of a hardening step a stochastic step is added between E- and M-step:
\begin{description}
\item[S-step:] Assign each observation at random to one component with probability equal to its a-posteriori probability.
\end{description}

The algorithm above determines the parameter estimates if the concentration parameters are allowed to vary freely over components. An alternative model specification could be to impose the restriction that the concentration parameters are the same for all components. This has the advantage that the clusters will be of comparable compactness and that spurious solutions containing small components with very large concentration parameters are eliminated. In the following we derive how the M-step needs to be modified if the concentration parameters are restricted to be the same over components. From Appendix A.2 of \citet{vMF:Banerjee+Dhillon+Ghosh:2005} the optimal unconstrained $\kappa_k$ can be obtained by solving
\begin{displaymath}
  A_d(\kappa_k) = \frac{\left\| \sum_i p(k|x_i, \Theta) x_i\right\|}{\sum_i p(k|x_i, \Theta)}.
\end{displaymath}
If the $\kappa_k$ are constrained to be equal (but are not given), the optimal common $\kappa$ can be obtained as follows. Using Equation A.12 in \citet{vMF:Banerjee+Dhillon+Ghosh:2005}, the modified Lagrangian becomes
\begin{eqnarray*}
  \lefteqn{
    \sum_k \left( \sum_i \log(C_d(\kappa)) p(k|x_i,\Theta)
      + \sum_i \kappa \mu_k^{\top} x_i p(k|x_i,\Theta)
      + \lambda_k (1 - \mu_k^{\top} \mu_k) \right)} \\
  &=& \log(C_d(\kappa)) \sum_{k,i} p(k|x_i,\Theta)
      + \kappa \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta)
      + \sum_k \lambda_k (1 - \mu_k^{\top} \mu_k) \\
  &=& n \log(C_d(\kappa))
      + \kappa \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta)
      + \sum_k \lambda_k (1 - \mu_k^{\top} \mu_k).
\end{eqnarray*}
Setting partials with respect to $\mu_k$ and $\lambda_k$ to zero as in the reference gives (again)
\begin{displaymath}
  \mu_k = \frac{\sum_i x_i p(k|x_i,\Theta)}{\|\sum_i x_i p(k|x_i,\Theta)\|}
\end{displaymath}
and with these $\mu_k$ we obtain for $\kappa$ that
\begin{displaymath}
  0 = - A_d(\kappa) n + \sum_{k,i} \mu_k^{\top} x_i p(k|x_i,\Theta)
    = - A_d(\kappa) n + \sum_k \left\|\sum_i x_i p(k|x_i,\Theta)\right\|,
\end{displaymath}
i.e., $\kappa$ needs to solve the equation
\begin{displaymath}
  A_d(\kappa) = \frac{1}{n} \sum_k \left\|\sum_i x_i p(k|x_i,\Theta)\right\|.
\end{displaymath}

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-movMF}

In the following the gender information is not used, and it is investigated if finite mixtures allow one to uncover a distinction between male and female respondents in their household expenses. Assuming that it is known that there are two underlying unobserved groups, a mixture with two components is fitted. The results are visualized in Figure~\ref{fig:household-example} at the bottom left.
The colors correspond to the component assignments obtained using the maximum a-posteriori probabilities. These assignments lead to \Sexpr{c("one")[mc2]} misclassification, i.e., one female is assigned to the component with the higher dispersion. The estimated parameters and the BIC value of the model are given in Table~\ref{tab:household-expenses}.

If the number of components is assumed to be unknown, the Bayesian information criterion (BIC) can be used to select a suitable number of components \citep[see for example][Chapter 6]{vMF:McLachlan+Peel:2000}. In this case the minimum BIC is obtained for three components if models in the range of $K = 1,\ldots,5$ are considered. The results of the mixture with $K = 3$ components are visualized in Figure~\ref{fig:household-example} at the bottom right and the estimated parameters and the BIC value are given in Table~\ref{tab:household-expenses}. In this case the male respondents are split into two groups with less dispersion each and different mean directions. The \proglang{R} code for reproducing these results is provided in Section~\ref{sec:illustr-exampl-hous-code} after introducing package \pkg{movMF}.
%
<<household-table, echo=FALSE>>=
X2 <- cbind(alpha[[2]], mu[[2]], kappa[[2]], c(BIC(vMFs[[2]]), NA))
X2 <- format(round(X2, digits = 2), nsmall = 2)
X2[1,6] <- paste("$", X2[1,6], "$", sep = "")
X2 <- gsub("NA", " ", X2)
X2 <- apply(cbind(c("$K = 2$", ""), X2), 1, paste, collapse = "&")
X3 <- cbind(alpha[[3]], mu[[3]], kappa[[3]], c(BIC(vMFs[[3]]), rep(NA, 2)))
X3 <- format(round(X3, digits = 2), nsmall = 2)
X3[1,6] <- paste("$", X3[1,6], "$", sep = "")
X3 <- gsub("NA", " ", X3)
X3 <- apply(cbind(c("$K = 3$", rep("", 2)), X3), 1, paste, collapse = "&")
@
\begin{table}[t!]
  \centering
  \begin{tabular}{lccccrc}
    \hline
    &$\alpha$&\Sexpr{paste(colnames(x), collapse = "&")}&\multicolumn{1}{c}{$\kappa$}&BIC\\
    \hline
    \Sexpr{X2[1]}\\
    \Sexpr{X2[2]}\\
    \hline
    \Sexpr{X3[1]}\\
    \Sexpr{X3[2]}\\
    \Sexpr{X3[3]}\\
    \hline
  \end{tabular}
  \caption{Results of fitting mixtures of vMF distributions to the household expenses example.}
  \label{tab:household-expenses}
\end{table}
\pagebreak

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section[Software]{Software}\label{sec:software}

\subsection[Main fitting function movMF()]{Main fitting function \fct{movMF}}

The main function in package \pkg{movMF} for fitting mixtures of vMF distributions is \fct{movMF}, with synopsis
%
<<movMF-synopsis, echo=FALSE>>=
cat(paste("R> ", prompt(movMF, filename = NA)$usage[[2]]))
@
%
The arguments for this function are as follows.
\begin{description}
\item[\normalfont \code{x}:] A numeric data matrix, with rows corresponding to observations. If necessary, the data is standardized to unit row lengths. Furthermore, the matrix can be stored either as a dense matrix, a simple triplet matrix (defined in package \pkg{slam}), or a general sparse triplet matrix of class `\code{dgTMatrix}' (from package \pkg{Matrix}).
\item[\normalfont \code{k}:] An integer indicating the number of components.
\item[\normalfont \code{control}:] A list of control parameters consisting of
  \begin{description}
  \item[\code{E}:] Specifies the variant of the EM algorithm used with possible values \code{"softmax"} (default), \code{"hardmax"} (classification EM), and \code{"stochmax"} (stochastic EM).
  \item[\code{kappa}:] This argument specifies how the concentration parameters are determined.
\begin{itemize}
    \item If numbers are given, the concentration parameters are assumed to be fixed and are not estimated in the EM algorithm.
    \item The method for solving for the concentration parameters can be \mbox{specified} by one of \code{"Banerjee_et_al_2005"}, \code{"Tanabe_et_al_2007"}, \code{"Sra_2012"}, \code{"Song_et_al_2012"}, \code{"uniroot"}, \code{"Newton"}, \code{"Halley"}, \code{"hybrid"} and \code{"Newton_Fourier"} (default). For more details on these methods see Section~\ref{sec:estim-param-vmf}.
    \item For common concentration parameters, a list with elements \code{common = TRUE} and a character string giving the estimation method needs to be provided.
  \end{itemize}
  \item[\code{converge}:] Logical indicating if convergence of the algorithm should be checked and, in this case, if the algorithm should be stopped before the maximum number of iterations is reached. For \code{E} equal to \code{"stochmax"} this argument is set by default to \code{FALSE}. Note that only condition (a) of the convergence check (see Section~\ref{sec:estim-param-mixt}) is assessed, i.e., the relative change in the log-likelihood values.
  \item[\code{maxiter}:] Integer indicating the maximum number of iterations of the EM algorithm. (Default: 100.)
  \item[\code{reltol}:] If the relative change in the log-likelihood falls below this threshold, the EM algorithm is stopped if \code{converge} is \code{TRUE}. (Default: \verb|sqrt(.Machine$double.eps)|.)
  \item[\code{verbose}:] Logical indicating if information on the progress of the fitting process should be printed during the estimation.
  \item[\code{ids}:] Either indicates the class memberships of the observations or, if equal to \code{TRUE}, specifies that the class memberships should be obtained from the attributes of the data. Class memberships are, for example, stored as an attribute if the data is generated using function \code{rmovMF()}. If this argument is specified, the EM algorithm is stopped after one iteration, i.e., the parameter estimates are determined conditional on the known true class memberships.
  \item[\code{start}:] Specifies the starting values used for initializing the EM algorithm, which then starts with an M-step. It can either be a list of matrices where each matrix contains the a-posteriori probabilities of the observations or a list of vectors containing component assignments for the observations. Alternatively, it can be a character vector with entries \code{"i"}, \code{"p"}, \code{"S"} or \code{"s"}. The length of the vector specifies how many different initializations are made. \code{"i"} indicates that component memberships are randomly assigned to the observations. The latter three draw observations as prototypes and determine a-posteriori probabilities by taking the implied cosine dissimilarities between observations and prototypes. \code{"p"} randomly picks observations as prototypes, \code{"S"} takes the first prototype to minimize the total cosine dissimilarity to the observations, and then successively picks observations farthest away from the already picked prototypes. For \code{"s"} one takes a randomly chosen observation as the first prototype, and then proceeds as for \code{"S"}. For more details on these initialization methods see package \pkg{skmeans} \citep{vMF:Hornik+Feinerer+Kober:2012}, which uses the same initialization schemes.
  \item[\code{nruns}:] An integer indicating the number of repeated runs of the EM algorithm with random initialization. This argument is ignored if either \code{ids} or \code{start} is specified.
\item[\code{minalpha}:] Components with size below the threshold indicated by \code{minalpha} are omitted during the estimation with the EM algorithm. This avoids estimation problems in the M-step if only very few observations are assigned to one component. The disadvantage is that the initial number of components is not necessarily equal to the number of components of the returned model.
  \end{description}
\end{description}

\subsection[Additional functionality in movMF]{Additional functionality in \pkg{movMF}}

The object returned by \fct{movMF} has an \proglang{S}3 class called `\code{movMF}' with methods \fct{print}, \fct{coef}, \fct{logLik} and \fct{predict} (which yields either the component assignments or the matrix of a-posteriori probabilities). Additional functionality available in the package includes \fct{rmovMF} for drawing from a mixture of vMF distributions and \fct{dmovMF} for evaluating the mixture density.

\subsection{Illustrative example: Household expenses}\label{sec:illustr-exampl-hous-code}

In the following, the code for reproducing the results presented in Section~\ref{sec:illustr-exampl-hous-movMF} is provided. After loading the data set, the three columns of interest are selected from the expenses and stored in variable \code{x}. Then the classification variable \code{gender} is also extracted. First, \code{movMF()} is used to fit a single vMF distribution to each of the two gender sub-samples, and then mixtures are fitted with the number of components varying from 1 to 5. To avoid reporting sub-optimal solutions where the EM algorithm was trapped in a local optimum, the best result from 20 random initializations is returned.
%
<<eval=FALSE>>=
<<household>>
@
%
The BIC values for the different mixtures can be compared using
<<>>=
sapply(vMFs, BIC)
@

\section{Numerical issues}
\label{sec:numerics}

In what follows it will be convenient to write
\begin{displaymath}
  H_\nu(\kappa) = \pFq{0}{1}(; \nu + 1; \kappa^2 / 4) = \frac{\Gamma(\nu + 1)}{(\kappa / 2)^\nu} I_\nu(\kappa).
\end{displaymath}
As shown in Section~\ref{sec:vmf-distribution}, computing log-likelihoods for (mixtures of) vMF distributions on $\R^d$ requires the computation of $\log(H_{d/2-1})$, and ML estimation of concentration parameters amounts to solving equations of the form $A_d(\kappa)=\rho$, where
\begin{displaymath}
  A_d(\kappa) = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)} = \frac{\kappa}{d} \frac{H_{d/2}(\kappa)}{H_{d/2-1}(\kappa)}
\end{displaymath}
is the logarithmic derivative of $H_{d/2-1}$. As $H_\nu(\kappa) \to \infty$ as $\kappa \to \infty$ (in fact, quite rapidly, see below), it clearly is a bad idea to try to compute $\log(H)$ as the logarithm of $H$, or $A$ as the ratio of $H$ functions. Similar considerations apply for using logarithms or ratios of the modified Bessel functions~$I$. One might wonder whether one could successfully take advantage of the fact that the Bessel functions provided by \proglang{R} are based on the \pkg{SPECFUN} package of \citet{vMF:Cody:1993} and hence also provide the exponentially scaled modified Bessel function $e^{-\kappa} I_\nu(\kappa)$ (the scaling is motivated by the asymptotic expansion $I_\nu(\kappa) \sim e^\kappa (2\pi\kappa)^{-1/2} \sum_m a_m(\nu) / \kappa^m$ for $\kappa \to \infty$). However, exponential scaling does not help in situations where $1 \ll \kappa \ll \nu$: e.g., for $\kappa = 6000$ and $\nu = 10000$, $H$ overflows whereas $I$ underflows (even though \proglang{R} gives \verb|Inf| and \verb|0| for the cases without and with exponential scaling, respectively).
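The last claim is easily checked at the \proglang{R} prompt:
%
<<bessel-scaling, eval=FALSE>>=
## kappa = 6000, nu = 10000: the true value of I underflows, yet R
## reports Inf without exponential scaling and 0 with it.
besselI(6000, 10000)
besselI(6000, 10000, expon.scaled = TRUE)
@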
\subsection[Computing A]{Computing $A_d$}

One can use the Gauss continued fraction
\begin{displaymath}
  \frac{I_\nu(z)}{I_{\nu - 1}(z)} = \frac{1}{2\nu/z + {}} \frac{1}{2(\nu+1)/z + {}} \frac{1}{2(\nu+2)/z + {}} \cdots
\end{displaymath}
for the ratio of modified Bessel functions (\url{https://dlmf.nist.gov/10.33.E1}) to compute $A$ as
\begin{displaymath}
  A_d(\kappa) = \frac{I_{d/2}(\kappa)}{I_{d/2-1}(\kappa)} = \frac{1}{d/\kappa + {}}\frac{1}{(d+2)/\kappa + {}} \frac{1}{(d+4)/\kappa + {}} \cdots
\end{displaymath}
\citep[Equation~4.3 in][]{vMF:Banerjee+Dhillon+Ghosh:2005}, using, e.g., Steed's method (e.g., \url{https://dlmf.nist.gov/3.10}) for evaluation. However, as pointed out by \citet{vMF:Gautschi+Slavik:1978} and \citet{vMF:Tretter+Walster:1980} \citep[and, quite recently, re-iterated by][]{vMF:Song+Liu+Wang:2012}, the Perron continued fraction
\begin{displaymath}
  \frac{I_\nu(z)}{I_{\nu - 1}(z)} = \frac{z}{2\nu + z - {}} \frac{(2\nu+1)z}{2 \nu + 1 + 2z - {}} \frac{(2\nu+3)z}{2 \nu + 2 + 2z - {}} \frac{(2\nu+5)z}{2 \nu + 3 + 2z - {}} \cdots
\end{displaymath}
is numerically more stable (as computing it by forward recursion only accumulates positive terms, whereas for the Gauss continued fraction the terms alternate in sign), and converges substantially faster for positive $z \gg \nu$. Hence, by default we compute $A$ via the Perron continued fraction \citep[with implementation based on Equation~3.3$'$ in][]{vMF:Gautschi+Slavik:1978}, and additionally provide computation via the Gauss continued fraction \citep[with implementation based on Equation~3.2$'$ in][]{vMF:Gautschi+Slavik:1978} and using exponentiation of $\log(H)$ differences as alternatives.

For $\kappa$ close to zero it is better (and necessary for $\kappa = 0$) to use the approximation
\begin{displaymath}
  A_d(\kappa) = \frac{1}{d} \kappa - \frac{1}{d^2(d+2)} \kappa^3 + O(\kappa^5), \qquad \kappa \to 0
\end{displaymath}
\citep[Equation 5]{vMF:Schou:1978}. The $O(\kappa^5)$ term can be made more precise by using the series representation $I_\nu(z) = \sum_{n=0}^\infty (z/2)^{2n + \nu} / (n! \Gamma(n + \nu + 1))$ so that for $\kappa \to 0$,
\begin{eqnarray*}
  A_d(\kappa)
  &=& \frac{\displaystyle (\kappa/2)^{d/2} \left( \frac{1}{\Gamma(d/2 + 1)} + \frac{\kappa^2/4}{\Gamma(d/2 + 2)} + \frac{\kappa^4/32}{\Gamma(d/2 + 3)} + O(\kappa^6) \right)}{\displaystyle (\kappa/2)^{d/2-1} \left( \frac{1}{\Gamma(d/2)} + \frac{\kappa^2/4}{\Gamma(d/2 + 1)} + \frac{\kappa^4/32}{\Gamma(d/2 + 2)} + O(\kappa^6) \right)} \\
  &=& \frac{\displaystyle \kappa}{2} \frac{\frac{2}{d} + \frac{\kappa^2}{d(d+2)} + \frac{\kappa^4}{4d(d+2)(d+4)} + O(\kappa^6)}{\displaystyle 1 + \frac{\kappa^2}{2d} + \frac{\kappa^4}{8d(d+2)} + O(\kappa^6)}
\end{eqnarray*}
from which the coefficient of $\kappa^5$ can straightforwardly be determined as
\begin{eqnarray*}
  \frac{1}{2} \left( \frac{1}{4d(d+2)(d+4)} - \frac{1}{2d^2(d+2)} - \frac{2}{8d^2(d+2)} + \frac{2}{4d^3} \right)
  &=& \frac{2}{d^3(d+2)(d+4)}
\end{eqnarray*}
so that
\begin{displaymath}
  A_d(\kappa) = \frac{1}{d} \kappa - \frac{1}{d^2(d+2)} \kappa^3 + \frac{2}{d^3(d+2)(d+4)} \kappa^5 + O(\kappa^7), \qquad \kappa \to 0.
\end{displaymath}
From this, approximations for $A'$ and $A''$ can be obtained by term-wise differentiation and also used for $\kappa$ close to 0.
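For concreteness, the series approximation can be coded directly; a minimal sketch (the function name is ours):
%
<<Ad-series, eval=FALSE>>=
## Small-kappa series for A_d as derived above; useful near (and
## necessary at) kappa = 0.
A_series <- function(kappa, d)
  kappa / d - kappa^3 / (d^2 * (d + 2)) +
    2 * kappa^5 / (d^3 * (d + 2) * (d + 4))
## Agrees closely with the Bessel function ratio for small kappa:
A_series(0.1, 5)
besselI(0.1, 5 / 2, expon.scaled = TRUE) /
  besselI(0.1, 5 / 2 - 1, expon.scaled = TRUE)
@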
\subsection[Computing log(H)]{Computing $\log(H_{\nu})$} Write \begin{displaymath} R_\nu(\kappa) = \frac{I_{\nu+1}(\kappa)}{I_\nu(\kappa)} \end{displaymath} for the Bessel function ratio (so that $A_d = R_{d/2 - 1}$) and \begin{displaymath} G_{\alpha,\beta}(\kappa) = \frac{\kappa}{\alpha + \sqrt{\kappa^2 + \beta^2}}. \end{displaymath} \citet[][Equations 11 and 16]{vMF:Amos:1974} shows that for all non-negative $\kappa$ and $\nu$, $G_{\nu+1/2,\nu+3/2}(\kappa) \le R_\nu(\kappa) \le G_{\nu,\nu+2}(\kappa)$. With $\beta_{SS}(\nu) = \sqrt{(\nu+1/2)(\nu+3/2)}$, Theorem~2 of \citet{vMF:Simpson+Spector:1984} implies the upper bound $R_\nu(\kappa) \le G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)$ for all non-negative $\kappa$ and $\nu$. We thus have \begin{equation} \label{eq:bounds-for-R} G_{\nu+1/2,\nu+3/2}(\kappa) \le R_\nu(\kappa) \le \min \left( G_{\nu,\nu+2}(\kappa), G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa) \right) \end{equation} (from which the bounds of Equation~\ref{eq:bounds-for-kappa-hat} are obtained by inversion). In addition, using results in \citet[][Equations~5 and 6]{vMF:Schou:1978}, one can show that $G_{\nu,\nu+2}$ and $G_{\nu+1/2,\beta_{SS}(\nu)}$ are second order exact approximations for $\kappa \to 0$ and $\kappa \to \infty$, respectively \citep{vMF:Hornik+Gruen:2013}. As $\log(H_\nu)' = R_\nu$ (and $H_\nu(0) = 1$), integration gives \begin{displaymath} \log(H_\nu(\kappa)) = \int_0^\kappa R_\nu(t) \,dt. \end{displaymath} Thus, the bounds for the Bessel function ratio $R_\nu$ can be used to obtain bounds for $H_\nu$. Writing \begin{displaymath} S_{\alpha,\beta}(\kappa) = \sqrt{\kappa^2 + \beta^2} - \alpha \log(\alpha + \sqrt{\kappa^2 + \beta^2}) - \beta + \alpha \log(\alpha + \beta), \end{displaymath} it is easily verified that $S_{\alpha,\beta}' = G_{\alpha,\beta}$ and $S_{\alpha,\beta}(0) = 0$. Using the Amos-type bounds from Equation~\ref{eq:bounds-for-R}, we thus obtain that for $\nu \ge 0$ and $\kappa \ge 0$, \begin{equation} \label{eq:bounds-for-log-H} S_{\nu+1/2,\nu+3/2}(\kappa) \le \log(H_\nu(\kappa)) \le \min \left( S_{\nu,\nu+2}(\kappa), S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa) \right). \end{equation} Where a single approximating value is sought, we prefer to use the upper bound which is based on the combination of upper Amos-type bounds which are second order exact at zero and infinity. Again, the bounds in Equation~\ref{eq:bounds-for-log-H} are surprisingly tight. \par\medskip\noindent \textbf{Result.} \begin{em} Let \begin{displaymath} s(\nu) = (\nu+3/2) - \beta_{SS}(\nu) - (\nu+1/2) \log \frac{2 (\nu + 1)}{\nu + 1/2 + \beta_{SS}(\nu)}, \end{displaymath} with $\nu \geq 0$. Then $s$ is non-increasing on $[0, \infty)$ with $s(0) = (3 - \sqrt{3} + \log((1 + \sqrt{3}) / 4)) / 2 = 0.4433537$ and $\lim_{\nu \to \infty} s(\nu) = 1/4$. For $\nu_0 \ge 0$, \begin{displaymath} \sup_{\kappa \ge 0, \nu \ge \nu_0} (S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa) - S_{\nu+1/2,\nu+3/2}(\kappa)) = s(\nu_0). \end{displaymath} For $\beta_{SS}(\nu) \le \beta \le \nu + 3/2$, \begin{displaymath} \sup_{\kappa \ge 0} |\log(H_\nu(\kappa)) - S_{\nu+1/2,\beta}(\kappa)| \le s(\nu). \end{displaymath} \end{em} \begin{proof} For simplicity, write $\alpha = \nu + 1/2$, $\beta_L = \nu + 3/2$ and $\beta_U = \beta_{SS}(\nu)$, omitting the dependence on $\nu$. We have $\beta_U \le \beta_L$ and $G_{\alpha,\beta_U} \ge G_{\alpha,\beta_L}$. 
Hence, \begin{displaymath} S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa) = \int_0^\kappa (G_{\alpha,\beta_U}(t) - G_{\alpha,\beta_L}(t)) \, dt \end{displaymath} is non-decreasing in $\kappa$, and attains its supremum for $\kappa \to \infty$. Now, \begin{eqnarray*} \lefteqn{S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa)} \\ &=& \sqrt{\kappa^2 + \beta_U^2} - \sqrt{\kappa^2 + \beta_L^2} - \alpha \log \left( \frac{\alpha + \sqrt{\kappa^2 + \beta_U^2}}{\alpha + \sqrt{\kappa^2 + \beta_L^2}} \right) + (\beta_L - \beta_U) - \alpha \log \frac{\alpha + \beta_L}{\alpha + \beta_U}. \end{eqnarray*} As \begin{displaymath} \sqrt{\kappa^2 + \beta_U^2} - \sqrt{\kappa^2 + \beta_L^2} = \frac{(\kappa^2 + \beta_U^2) - (\kappa^2 + \beta_L^2)}{ \sqrt{\kappa^2 + \beta_U^2} + \sqrt{\kappa^2 + \beta_L^2}} = \frac{\beta_U^2 - \beta_L^2}{ \sqrt{\kappa^2 + \beta_U^2} + \sqrt{\kappa^2 + \beta_L^2}}, \end{displaymath} we have \begin{displaymath} \lim_{\kappa \to \infty} (S_{\alpha,\beta_U}(\kappa) - S_{\alpha,\beta_L}(\kappa)) = (\beta_L - \beta_U) - \alpha \log \frac{\alpha + \beta_L}{\alpha + \beta_U} = s(\nu). \end{displaymath} The value of $s(0)$ is obtained by insertion, and $\lim_{\nu \to \infty} s(\nu)$ can be obtained as follows. We have \begin{displaymath} \beta_L - \beta_U = \sqrt{\alpha + 1} (\sqrt{\alpha + 1} - \sqrt{\alpha}) = \frac{\sqrt{\alpha + 1}}{\sqrt{\alpha + 1} + \sqrt{\alpha}} \to 1/2 \end{displaymath} as $\nu \to \infty$, and \begin{eqnarray*} \frac{\alpha + \beta_U}{\alpha + \beta_L} &=& \frac{\alpha + \sqrt{\alpha (\alpha + 1)}}{2\alpha + 1} \\ &=& \frac{(1 + \sqrt{1 + 1 / \alpha}) / 2}{1 + 1 / (2\alpha)} \\ &=& \frac{1}{2} \left(1 + 1 + \frac{1}{2\alpha} + O(\alpha^{-2}) \right) \left(1 - \frac{1}{2\alpha} + O(\alpha^{-2}) \right) \\ &=& 1 - \frac{1}{4\alpha} + O(\alpha^{-2}) \end{eqnarray*} so that \begin{displaymath} \alpha \log \frac{\alpha + \beta_U}{\alpha + \beta_L} = \alpha \left( - \frac{1}{4\alpha} + O(\alpha^{-2}) \right) = - \frac{1}{4} + O(\alpha^{-1}) \to -1/4 \end{displaymath} as $\nu \to \infty$. Hence, $\lim_{\nu \to \infty} s(\nu) = 1/4$. To show that $s$ is non-increasing, note that we have \begin{displaymath} \alpha + \beta_L = 2 \alpha + 1, \qquad \frac{d\alpha}{d\nu} = \frac{d\beta_L}{d\nu} = 1, \qquad \frac{d\beta_U}{d\nu} = \frac{2\alpha + 1}{2\beta_U} \end{displaymath} and hence \begin{displaymath} \frac{ds}{d\nu} = 1 - \frac{2\alpha + 1}{2\beta_U} - \log \frac{2\alpha + 1}{\alpha + \beta_U} - \alpha \left( \frac{2}{2\alpha+1} - \frac{1}{\alpha + \beta_U} \left( 1 + \frac{2\alpha + 1}{2\beta_U} \right) \right). \end{displaymath} For non-negative $t$ and $\tau$, \begin{displaymath} \log \frac{t+\tau}{t} = \int_0^\tau \frac{1}{t+s} \, ds \ge \frac{1}{t + \tau} \int_0^\tau ds = \frac{\tau}{t + \tau}. 
\end{displaymath}
Hence, with $t = \alpha + \beta_U$ and $\tau = \beta_L - \beta_U$,
\begin{displaymath}
  \log \frac{2\alpha + 1}{\alpha + \beta_U} \ge \frac{\beta_L - \beta_U}{2\alpha + 1}
\end{displaymath}
and
\begin{eqnarray*}
  \frac{ds}{d\nu}
  &\le& 1 - \frac{2\alpha + 1}{2\beta_U} + \frac{\beta_U - \beta_L}{2\alpha + 1} + \frac{\alpha}{\alpha + \beta_U} \left( 1 + \frac{2\alpha + 1}{2\beta_U} \right) - \frac{2\alpha}{2\alpha+1} \\
  &=& \frac{2\alpha+1}{2\beta_U} \left( \frac{\alpha}{\alpha + \beta_U} - 1 \right) + \frac{\alpha}{\alpha + \beta_U} + \frac{2\alpha + 1 + \beta_U - \beta_L - 2\alpha}{2\alpha + 1} \\
  &=& - \frac{2\alpha + 1}{2(\alpha + \beta_U)} + \frac{\alpha}{\alpha + \beta_U} + \frac{\beta_U - \alpha}{2\alpha + 1} \\
  &=& - \frac{1}{2(\alpha + \beta_U)} + \frac{\beta_U - \alpha}{2\alpha + 1} \\
  &=& \frac{-(2\alpha + 1) + 2 (\beta_U^2 - \alpha^2)} {2(\alpha + \beta_U)(2\alpha + 1)} \\
  &=& \frac{-1}{2(\alpha + \beta_U)(2\alpha + 1)} \\
  &\le& 0,
\end{eqnarray*}
establishing that $s$ is non-increasing. Finally, for $\beta_{SS}(\nu) \le \beta \le \nu + 3/2$, both $\log(H_\nu)$ and $S_{\nu+1/2,\beta}$ are bounded below by $S_{\nu+1/2,\nu+3/2}$ and above by $S_{\nu+1/2,\beta_{SS}(\nu)}$, so that $| \log(H_\nu) - S_{\nu+1/2,\beta} | \le (S_{\nu+1/2,\beta_{SS}(\nu)} - S_{\nu+1/2,\nu+3/2}) \le s(\nu)$, completing the proof.
\end{proof}

These results can be used to derive the following approach to computing $\log(H_\nu)$. Choose a threshold $\theta$ such that $e^\theta$ does not overflow and $e^{-\theta}$ does not underflow. Using IEEE 754 double precision floating point computations, we can take $\theta = 700$. Choose an approximation $L_\nu$ for $\log(H_\nu)$ for which $S_{\nu+1/2,\nu+3/2} \le L_\nu \le S_{\nu+1/2,\beta_{SS}(\nu)}$ for all $\kappa \ge 0$. By the above, this has approximation error at most $s(0) < 1/2$. Possible choices for $L_\nu$ are convex combinations of the lower and upper bounds in Equation~\ref{eq:bounds-for-log-H} (e.g., simply take the upper bound) or $S_{\nu+1/2,\beta}(\kappa)$ with some $\beta_{SS}(\nu) \le \beta \le \nu + 3/2$ (e.g., $\beta = \nu + 1$); in the package we use
\begin{eqnarray*}
  L_\nu(\kappa)
  &=& \int_0^\kappa \min(G_{\nu,\nu+2}(t), G_{\nu+1/2,\beta_{SS}(\nu)}(t)) \, dt \\
  &=& S_{\nu+1/2,\beta_{SS}(\nu)}(\kappa) + (S_{\nu,\nu+2}(\min(\kappa,\kappa_\nu)) - S_{\nu+1/2,\beta_{SS}(\nu)}(\min(\kappa,\kappa_\nu))),
\end{eqnarray*}
where $\kappa_\nu = \sqrt{(3\nu+11/2) (\nu+3/2)}$ is the positive root of the equation $G_{\nu,\nu+2}(\kappa) = G_{\nu+1/2,\beta_{SS}(\nu)}(\kappa)$.

Then if $L_\nu(\kappa) \le \theta - 1/2$ (so that $\log(H_\nu(\kappa)) \le \theta$), compute $H_\nu(\kappa)$ by its series expansion, and take the logarithm of this. If $\theta - 1/2 < L_\nu(\kappa) \le 2\theta - 1$, so that
\begin{displaymath}
  |\log(H_\nu(\kappa)) - L_\nu(\kappa) / 2| \le |\log(H_\nu(\kappa)) - L_\nu(\kappa)| + L_\nu(\kappa) / 2 \le \theta,
\end{displaymath}
and hence $e^{-L_\nu(\kappa)/2} H_\nu(\kappa)$ does not over- or underflow, write
\begin{displaymath}
  H_\nu(\kappa) = e^{L_\nu(\kappa) / 2} e^{- L_\nu(\kappa) / 2} H_\nu(\kappa) = e^{L_\nu(\kappa) / 2} \sum_{m=0}^\infty \frac{e^{-L_\nu(\kappa)/2}\Gamma(\nu+1)}{\Gamma(\nu+1+m)}\frac{(\kappa^2/4)^m}{m!},
\end{displaymath}
and compute $\log(H_\nu(\kappa))$ as $L_\nu(\kappa)/2$ plus the logarithm of the sum of the scaled series. Otherwise, use the approximation $L_\nu(\kappa)$ (note that with $\theta = 700$, this has a relative approximation error of at most about $1/2 \cdot 1/1400 \le 0.0004$).
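The bound functions involved are cheap to evaluate; for concreteness, a sketch of $S_{\alpha,\beta}$ and of the approximation $L_\nu$ (function names ours):
%
<<logH-bounds, eval=FALSE>>=
## S_{alpha,beta} is the antiderivative of G_{alpha,beta} with S(0) = 0.
S <- function(kappa, alpha, beta) {
  u <- sqrt(kappa^2 + beta^2)
  u - alpha * log(alpha + u) - beta + alpha * log(alpha + beta)
}
## L_nu combines the two upper Amos-type bounds, switching at their
## crossover point kappa_nu.
L_nu <- function(kappa, nu) {
  bSS <- sqrt((nu + 1/2) * (nu + 3/2))
  k_nu <- sqrt((3 * nu + 11/2) * (nu + 3/2))
  S(kappa, nu + 1/2, bSS) +
    S(pmin(kappa, k_nu), nu, nu + 2) -
    S(pmin(kappa, k_nu), nu + 1/2, bSS)
}
@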
This is the approach for computing $\log(H_\nu)$ currently implemented in package \pkg{movMF}.

Alternatively, one can use the bounds to obtain refined strategies of computing $\log(H_\nu)$ using available codes for modified Bessel functions. We have
\begin{displaymath}
  \log(H_\nu(\kappa)) = \log(I_\nu(\kappa)) - \nu \log(\kappa/2) + \log(\Gamma(\nu + 1))
\end{displaymath}
and hence, using Stirling's approximation $\log(\Gamma(z)) \approx (z - 1/2) \log(z) - z + \log(2\pi) / 2$ and writing $L_\nu = S_{\nu+1/2,\nu+1}$ for notational convenience, we obtain
\begin{eqnarray*}
  \lefteqn{\log(I_\nu(\kappa))} \\
  &=& \log(H_\nu(\kappa)) + \nu \log(\kappa/2) - \log(\Gamma(\nu + 1)) \\
  &\approx& L_\nu(\kappa) + \nu \log(\kappa/2) - (\nu + 1/2) \log(\nu + 1) + (\nu + 1) - \frac{\log(2\pi)}{2} \\
  &=& \sqrt{\kappa^2 + (\nu + 1)^2} + (\nu + 1/2) \log \frac{\kappa}{\nu + 1/2 + \sqrt{\kappa^2 + (\nu + 1)^2}} \\
  & & \quad - \, \frac{\log(\kappa/2)}{2} + (\nu + 1/2) \log\frac{2\nu + 3/2}{2(\nu + 1)} - \frac{\log(2\pi)}{2}.
\end{eqnarray*}
This implies
\begin{eqnarray*}
  \log(I_\nu(\kappa)) &=& \sqrt{\kappa^2 + (\nu + 1)^2} + (\nu + 1/2) \log \frac{\kappa}{\nu + 1/2 + \sqrt{\kappa^2 + (\nu + 1)^2}} - \, \frac{\log(\kappa)}{2} + O(1),
\end{eqnarray*}
where the $O(1)$ can be made more precise, giving a first-order variant of the large $\nu$ uniform asymptotic approximation for $I_\nu$ given by \citet{vMF:Olver:1954}. (For $\nu \to \infty$, the error in the approximation for $\log(\Gamma)$ tends to zero, and as $(\nu + 1/2) \log ((2\nu + 3/2) / (2 \nu + 2)) \to -1/4$, the $O(1)$ becomes $-\log(\pi)/2 - 1 / 4 + o(1)$, modulo the error in the approximation of $\log(H_\nu(\kappa))$ by $L_\nu(\kappa)$, which is at most $1/4 + o(1)$.)

From the above, we can see that when $\kappa \gg \nu$, $I_\nu(\kappa)$ overflows quite rapidly; on the other hand, for $\kappa = o(\nu)$ and $\nu \to \infty$, $I_\nu(\kappa)$ underflows quite rapidly. For computing logarithms, overflow can be avoided by employing the exponentially scaled modified Bessel function $e^{-\kappa} I_\nu(\kappa)$ \citep[as commonly available in codes for computing Bessel functions, such as the \pkg{SPECFUN} package of][which is used by \proglang{R}]{vMF:Cody:1993} and computing $\log(I_\nu(\kappa)) = \kappa + \log(e^{-\kappa} I_\nu(\kappa))$. However, this clearly does not help to avoid underflow.

This motivates the following strategy for computing $\log(I_\nu(\kappa))$ for a wide range of values. Start by computing a quick approximation $T_\nu(\kappa)$ to $\log(I_\nu)$, either using the above $L_\nu(\kappa) + \nu \log(\kappa/2) - \log(\Gamma(\nu + 1))$ or the leading term of the large $\nu$ uniform asymptotic approximation (in \proglang{R}, the latter is available via function \verb|besselI.nuAsym()| in package \pkg{Bessel}; \citealt{vMF:Maechler:2012}). If this is ``sufficiently small'' in absolute value (so that $I_\nu(\kappa)$ will neither over- nor underflow), compute $\log(I_\nu(\kappa))$ directly as the logarithm of $I_\nu(\kappa)$. Otherwise, if the approximation is ``too large'', but $T_\nu(\kappa) - \kappa$ is sufficiently small in absolute value so that the exponentially scaled $e^{-\kappa} I_\nu(\kappa)$ can be computed without over- or underflow, compute $\log(I_\nu(\kappa))$ as $\kappa + \log(e^{-\kappa} I_\nu(\kappa))$. Otherwise, use the quick approximation, or the large $\nu$ uniform asymptotic approximation with additional terms (package \pkg{Bessel} allows up to 4 additional terms). This strategy can readily be translated into a strategy for computing $\log(H_\nu)$.
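A compact sketch of this strategy, reusing \code{L\_nu} from the previous sketch and the threshold $\theta = 700$ (again a sketch under these assumptions, not the package implementation):
%
<<logI-strategy, eval=FALSE>>=
## Hybrid strategy for log(I_nu(kappa)): direct evaluation where I_nu
## is representable, exponential scaling where e^{-kappa} I_nu is, and
## the quick approximation T otherwise.
log_besselI <- function(kappa, nu, theta = 700) {
  T <- L_nu(kappa, nu) + nu * log(kappa / 2) - lgamma(nu + 1)
  if (abs(T) < theta)
    log(besselI(kappa, nu))
  else if (abs(T - kappa) < theta)
    kappa + log(besselI(kappa, nu, expon.scaled = TRUE))
  else
    T
}
@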
\newpage
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\setkeys{Gin}{width=\textwidth}
\section{Application: useR! 2008 abstracts}\label{sec:appl-user-2008}

In 2008, ``use\proglang{R}! 2008'', the 3rd international \proglang{R} user conference, took place in Dortmund, Germany. In total, \Sexpr{nrow(useR_2008_abstracts)} abstracts were submitted and accepted for presentation at the conference. The abstracts, together with additional information such as title, author, session, and keywords, were originally made available in the \proglang{R} data package \pkg{corpus.useR.2008.abstracts} available from the repository at \url{https://datacube.wu.ac.at/}. The dataset is also included in package \pkg{movMF}. The following code loads the data contained in package \pkg{movMF}.
%
<<eval=FALSE>>=
<<useR-data>>
@
%
Using the \pkg{tm} package \citep{vMF:Feinerer+Hornik+Meyer:2008, vMF:Feinerer:2012} the data can be pre-processed by (1) generating a corpus from the vector of abstracts and (2) building a document-term matrix from the corpus. For constructing the document-term matrix, each abstract needs to be tokenized (i.e., split into words, e.g., by using white space characters as separators) and transformed to lower case. Punctuation as well as numbers can be removed and the words can be stemmed (i.e., inflected words are reduced to a base form). In addition, a minimum length can be imposed on the words, and a minimum and maximum frequency within an abstract can be required. The vector of abstracts is transformed to an object of a class extending `\code{Source}' using \fct{VectorSource}. This object is used as input for \fct{Corpus} to generate the corpus. The mapping from the corpus to the document-term matrix is performed using \fct{DocumentTermMatrix}. The \code{control} argument of \fct{DocumentTermMatrix} specifies which pre-processing steps are applied to determine the frequency vectors of term occurrences in each abstract. We use the titles and the abstracts together to construct the document-term matrix.
%
<<>>=
library("tm")
abstracts_titles <-
    apply(useR_2008_abstracts[,c("Title", "Abstract")],
          1, paste, collapse = " ")
useR_2008_abstracts_corpus <- VCorpus(VectorSource(abstracts_titles))
useR_2008_abstracts_DTM <-
    DocumentTermMatrix(useR_2008_abstracts_corpus,
                       control = list(tokenize = "MC",
                                      stopwords = TRUE,
                                      stemming = TRUE,
                                      wordLengths = c(3, Inf)))
@
Method \code{"MC"} was used for tokenizing. This method aims at producing the same results as the \pkg{MC} toolkit for creating vector models from text documents \citep{vMF:Dhillon+Modha:2001, vMF:Dhillon+Fan+Guan:2001}. In addition, the words are stemmed, a set of stop words is removed, and only words with a length of at least 3 are kept. The resulting document-term matrix has \Sexpr{nrow(useR_2008_abstracts_DTM)} rows and \Sexpr{ncol(useR_2008_abstracts_DTM)} columns. The ten most frequent terms (occurring in different abstracts) are the following.
%
<<>>=
library("slam")
ColSums <- col_sums(useR_2008_abstracts_DTM > 0)
sort(ColSums, decreasing = TRUE)[1:10]
@
%
To reduce the dimension of the problem and omit terms which occur too frequently or too infrequently in the corpus to be of use when clustering the abstracts, we omit all terms which occur in fewer than 5 abstracts or in more than 90 abstracts.
%
<<>>=
useR_2008_abstracts_DTM <-
  useR_2008_abstracts_DTM[, ColSums >= 5 & ColSums <= 90]
useR_2008_abstracts_DTM
@
%
The data is transformed using TF-IDF (term frequency-inverse document
frequency) weighting, which reduces the weight of terms occurring in
many abstracts.
%
<<>>=
useR_2008_abstracts_DTM <- weightTfIdf(useR_2008_abstracts_DTM)
@
%
In the following, different mixtures of vMF distributions are fitted to
training data using 10-fold cross-validation and compared based on the
predictive log-likelihoods on the hold-out data in order to select a
suitable model. The number of components is varied, and the mixtures
are fitted both with the concentration parameters constrained to be
equal across components and with the concentration parameters allowed
to vary freely across components. For each training data set, the EM
algorithm is repeated 20 times with different random initializations.
%
<<movMF-fit, eval=FALSE>>=
set.seed(2008)
library("movMF")
Ks <- c(1:5, 10, 20)
splits <- sample(rep(1:10, length.out = nrow(useR_2008_abstracts_DTM)))
useR_2008_movMF <- lapply(Ks, function(k)
  sapply(1:10, function(s) {
    m <- movMF(useR_2008_abstracts_DTM[splits != s,], k = k,
               nruns = 20)
    logLik(m, useR_2008_abstracts_DTM[splits == s,])
  }))
useR_2008_movMF_common <- lapply(Ks, function(k)
  sapply(1:10, function(s) {
    m <- movMF(useR_2008_abstracts_DTM[splits != s,], k = k,
               nruns = 20, kappa = list(common = TRUE))
    logLik(m, useR_2008_abstracts_DTM[splits == s,])
  }))
@
%
<<movMF-cache, echo=FALSE>>=
if(cache & file.exists("movMF.rda")) {
  load("movMF.rda")
  library("movMF")
  Ks <- c(1:5, 10, 20)
} else {
<<movMF-fit>>
  if(cache) {
    save(useR_2008_movMF, useR_2008_movMF_common, file = "movMF.rda")
  } else {
    if(file.exists("movMF.rda")) file.remove("movMF.rda")
  }
}
@
%
In Figure~\ref{fig:logliks} the fitted models are compared using the
cross-validated predictive log-likelihoods. The results for the models
with free concentration parameters are shown on the left, those for the
models with common concentration parameters on the right. The
predictive log-likelihoods on the hold-out data indicate that the best
solutions have between 1 and 5 components and that the models with more
components have considerably worse predictive log-likelihoods.

\begin{figure}[t!]
  \centering
<<logliks, echo=FALSE, fig=TRUE>>=
logLiks <- data.frame(logLik = c(unlist(useR_2008_movMF),
                        unlist(useR_2008_movMF_common)),
                      K = c(rep(Ks, sapply(useR_2008_movMF, length)),
                        rep(Ks, sapply(useR_2008_movMF_common, length))),
                      Dataset = seq_len(length(useR_2008_movMF[[1]])),
                      Method = factor(rep(1:2,
                        each = length(unlist(useR_2008_movMF))),
                        1:2, c("free", "common")))
logLiks$logLik <- logLiks$logLik -
  rep(rep(with(logLiks, tapply(logLik, Dataset, mean)), length(Ks)), 2)
print(xyplot(logLik ~ K | Method, data = logLiks, groups = Dataset,
             type = "l", lty = 1, xlab = "Number of components",
             ylab = "Predictive log-likelihood",
             strip = strip.custom(factor.levels =
               expression(paste("Free ", kappa),
                          paste("Common ", kappa)))))
@
\caption{Predictive log-likelihoods for free and common concentration
  parameters $\kappa$ of the mixtures of vMF distributions fitted to
  the ``use\proglang{R}! 2008'' abstracts.}
\label{fig:logliks}
\end{figure}

Following the conclusions of the comparison of the predictive
log-likelihood values, we further investigate the model where the
concentration parameters are constrained to be equal over components
and the number of components is equal to 2.
%
<<>>=
set.seed(2008)
best_model <- movMF(useR_2008_abstracts_DTM, k = 2, nruns = 20,
                    kappa = list(common = TRUE))
@
%
In the following, we look at the 10 most important words of each fitted
component:
%
<<>>=
apply(coef(best_model)$theta, 1, function(x)
      colnames(coef(best_model)$theta)[order(x, decreasing = TRUE)[1:10]])
@
%
Clearly, one component deals with issues related to infrastructure or
implementation, while the other component focuses more on statistical
modeling. The clustering obtained from the a posteriori probabilities
is analyzed by comparing the cluster memberships with the keywords
assigned to the abstracts. Because each abstract might have several
keywords assigned, the abstracts and their cluster assignments are
suitably repeated.
%
<<>>=
clustering <- predict(best_model)
keywords <- useR_2008_abstracts[, "Keywords"]
keywords <- sapply(keywords, function(x)
  sapply(strsplit(x, ", ")[[1]], function(y) strsplit(y, "-")[[1]][1]))
tab <- table(Keyword = unlist(keywords),
             Component = rep(clustering, sapply(keywords, length)))
@
%
In the following, only keywords which have more than 8 abstracts
assigned to them are shown.
%
<<>>=
(tab <- tab[rowSums(tab) > 8, ])
@
%
The table is also visualized in Figure~\ref{fig:mosaic}. Abstracts
whose keywords relate to infrastructure or implementation issues, such
as connectivity, high performance computing, and user interfaces, are
associated with one component, whereas abstracts with keywords related
to statistical modeling issues, such as bioinformatics, biostatistics,
and modeling, are more likely to be assigned to the other component.

\begin{figure}[t!]
  \centering
<<mosaic, echo=FALSE, fig=TRUE>>=
library("vcd")
mosaic(tab, shade = TRUE,
       labeling_args = list(rot_labels = 0,
         just_labels = c("center", "right"),
         pos_varnames = c("left", "right"), rot_varnames = 0))
@
\caption{Cross-tabulation of the keywords and the cluster assignments
  for keywords which were assigned to more than 8 abstracts.}
\label{fig:mosaic}
\end{figure}

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section{Summary}\label{sec:summary}

An \proglang{R} package for fitting finite mixtures of vMF
distributions is presented. Special focus has been given to the
numerical issues arising when evaluating the log-likelihood as well as
to methods for ML estimation of the concentration parameters. A
possible extension of package \pkg{movMF} would be to allow for more
flexible component distributions which also have the unit sphere as
their support. The vMF distribution is the analogue of the isotropic
multivariate normal distribution, i.e., the multivariate normal
distribution whose variance-covariance matrix is a multiple of the
identity matrix. On the two-dimensional unit sphere in $\mathbb{R}^3$,
the generalization of the vMF distribution which is the analogue of the
general multivariate normal distribution is the Fisher-Bingham (or
Kent) distribution \citep{vMF:Kent:1982}. Finite mixtures of
Fisher-Bingham distributions were used by
\cite{vMF:Peel+Whiten+McLachlan:2001} to identify joint sets present in
rock masses. They also allowed for an additional component following
the uniform distribution on the unit sphere. However, no generalization
to higher dimensions is available, and
\cite{vMF:Dortet-Bernadet+Wicker:2008} therefore propose using inverse
stereographic projections of multivariate normal distributions to
cluster gene expression profiles.
%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\section*{Acknowledgments}

This research was supported by the Austrian Science Fund (FWF) under
Elise-Richter grant V170-N18. We would like to thank Achim Zeileis and
Uwe Ligges for their help in generating the corpus of
``use\proglang{R}! 2008'' abstracts.

%%------------------------------------------------------------------------
%%------------------------------------------------------------------------
\bibliography{vMF}

\end{document}