VariableSelection()
.VariableSelection()
can accept either a
BranchGLM
object or a formula along with the data and the
desired family and link to perform the variable selection.VariableSelection()
returns some information about the
search, more detailed information about the best models can be seen by
using the summary()
function.VariableSelection()
will properly handle
interaction terms and categorical variables.keep
can also be specified if any set of variables are
desired to be kept in every model.plot
function can be used to see the path taken by
the stepwise algorithms# Loading BranchGLM package
library(BranchGLM)
# Fitting gamma regression model
cars <- mtcars
# Fitting gamma regression with inverse link
GammaFit <- BranchGLM(mpg ~ ., data = cars, family = "gamma", link = "inverse")
# Forward selection with mtcars
forwardVS <- VariableSelection(GammaFit, type = "forward")
forwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using forward selection with AIC
#> The top model found had AIC = 142.2
#> Number of models fit: 28
#> Variables that were kept in each model: (Intercept)
#> Order the variables were added to the model:
#>
#> 1). wt
#> 2). hp
## Getting final coefficients
coef(forwardVS, which = 1)
#> Model1
#> (Intercept) 8.922600e-03
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 8.887336e-05
#> drat 0.000000e+00
#> wt 9.826436e-03
#> qsec 0.000000e+00
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear 0.000000e+00
#> carb 0.000000e+00
## Plotting path
plot(forwardVS)
# Backward elimination with mtcars
backwardVS <- VariableSelection(GammaFit, type = "backward")
backwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using backward elimination with AIC
#> The top model found had AIC = 141.9
#> Number of models fit: 50
#> Variables that were kept in each model: (Intercept)
#> Order the variables were removed from the model:
#>
#> 1). vs
#> 2). drat
#> 3). am
#> 4). disp
#> 5). carb
#> 6). cyl
## Getting final coefficients
coef(backwardVS, which = 1)
#> Model1
#> (Intercept) 4.691154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.284129e-05
#> drat 0.000000e+00
#> wt 9.484597e-03
#> qsec -1.298959e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -2.662008e-03
#> carb 0.000000e+00
## Plotting path
plot(backwardVS)
Fast backward elimination is equivalent to traditional backward elimination, except much faster. The results from the two algorithms may differ if many of the larger models in the candidate set of models are difficult to fit numerically.
# Fast backward elimination with mtcars
fastbackwardVS <- VariableSelection(GammaFit, type = "fast backward")
fastbackwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using fast backward elimination with AIC
#> The top model found had AIC = 141.9
#> Number of models fit: 21
#> Variables that were kept in each model: (Intercept)
#> Order the variables were removed from the model:
#>
#> 1). vs
#> 2). drat
#> 3). am
#> 4). disp
#> 5). carb
#> 6). cyl
## Getting final coefficients
coef(fastbackwardVS, which = 1)
#> Model1
#> (Intercept) 4.691154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.284129e-05
#> drat 0.000000e+00
#> wt 9.484597e-03
#> qsec -1.298959e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -2.662008e-03
#> carb 0.000000e+00
## Plotting path
plot(fastbackwardVS)
We got the same model that we got from traditional backward elimination, but we only had to fit 21 models while we had to fit 50 models using traditional backward elimination.
One of the double backward elimination variants could also be used to (potentially) find higher quality models than what is found by traditional backward elimination.
# Fast double backward elimination with mtcars
fastdoublebackwardVS <- VariableSelection(GammaFit, type = "fast double backward")
fastdoublebackwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using fast double backward elimination with AIC
#> The top model found had AIC = 141.9
#> Number of models fit: 22
#> Variables that were kept in each model: (Intercept)
#> Order the variables were removed from the model:
#>
#> 1). drat and vs
#> 2). disp and am
#> 3). cyl and carb
## Getting final coefficients
coef(fastdoublebackwardVS, which = 1)
#> Model1
#> (Intercept) 4.691154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.284129e-05
#> drat 0.000000e+00
#> wt 9.484597e-03
#> qsec -1.298959e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -2.662008e-03
#> carb 0.000000e+00
## Plotting path
plot(fastdoublebackwardVS)
However, in this case they get the same final model and the double backward elimination algorithm fits slightly more models.
showprogress
is true, then progress of the branch
and bound algorithm will be reported occasionally.# Branch and bound with mtcars
VS <- VariableSelection(GammaFit, type = "branch and bound", showprogress = FALSE)
VS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 1 model within 0 AIC of the best AIC(141.9)
#> Number of models fit: 50
#> Variables that were kept in each model: (Intercept)
## Getting final coefficients
coef(VS, which = 1)
#> Model1
#> (Intercept) 4.691154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.284129e-05
#> drat 0.000000e+00
#> wt 9.484597e-03
#> qsec -1.298959e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -2.662008e-03
#> carb 0.000000e+00
BranchGLM
object.# Can also use a formula and data
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma",
link = "inverse", type = "branch and bound",
showprogress = FALSE, metric = "AIC")
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 1 model within 0 AIC of the best AIC(141.9)
#> Number of models fit: 50
#> Variables that were kept in each model: (Intercept)
## Getting final coefficients
coef(formulaVS, which = 1)
#> Model1
#> (Intercept) 4.691154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.284129e-05
#> drat 0.000000e+00
#> wt 9.484597e-03
#> qsec -1.298959e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -2.662008e-03
#> carb 0.000000e+00
# Finding top 10 models
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma",
link = "inverse", type = "branch and bound",
showprogress = FALSE, metric = "AIC",
bestmodels = 10)
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (141.9, 143.59)
#> Number of models fit: 98
#> Variables that were kept in each model: (Intercept)
## Plotting results
plot(formulaVS, type = "b")
## Getting all coefficients
coef(formulaVS, which = "all")
#> Model1 Model2 Model3 Model4
#> (Intercept) 4.691154e-02 8.922600e-03 2.896032e-02 1.972585e-02
#> cyl 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> disp 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> hp 6.284129e-05 8.887336e-05 5.590216e-05 9.987066e-05
#> drat 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> wt 9.484597e-03 9.826436e-03 1.105126e-02 8.373894e-03
#> qsec -1.298959e-03 0.000000e+00 -1.071038e-03 0.000000e+00
#> vs 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> am 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> gear -2.662008e-03 0.000000e+00 0.000000e+00 -2.092484e-03
#> carb 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> Model5 Model6 Model7 Model8 Model9
#> (Intercept) 6.463732e-02 7.472237e-03 4.763986e-02 0.048676830 2.578977e-02
#> cyl -1.412185e-03 1.087312e-03 0.000000e+00 0.000000000 0.000000e+00
#> disp 0.000000e+00 0.000000e+00 0.000000e+00 0.000000000 0.000000e+00
#> hp 7.523221e-05 7.196928e-05 5.802906e-05 0.000000000 8.564089e-05
#> drat 0.000000e+00 0.000000e+00 0.000000e+00 0.000000000 0.000000e+00
#> wt 1.037319e-02 8.958479e-03 8.858489e-03 0.013362765 7.595784e-03
#> qsec -1.815606e-03 0.000000e+00 -1.147336e-03 -0.002136415 0.000000e+00
#> vs 0.000000e+00 0.000000e+00 0.000000e+00 0.000000000 0.000000e+00
#> am 0.000000e+00 0.000000e+00 0.000000e+00 0.000000000 0.000000e+00
#> gear -3.860851e-03 0.000000e+00 -3.408010e-03 0.000000000 -3.358384e-03
#> carb 0.000000e+00 0.000000e+00 7.249656e-04 0.000000000 1.140652e-03
#> Model10
#> (Intercept) 4.213154e-02
#> cyl 0.000000e+00
#> disp 0.000000e+00
#> hp 6.607566e-05
#> drat 1.326266e-03
#> wt 9.761600e-03
#> qsec -1.298089e-03
#> vs 0.000000e+00
#> am 0.000000e+00
#> gear -3.029601e-03
#> carb 0.000000e+00
# Finding all models with an AIC within 2 of the best model
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma",
link = "inverse", type = "branch and bound",
showprogress = FALSE, metric = "AIC",
cutoff = 2)
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 16 models within 2 AIC of the best AIC(141.9)
#> Number of models fit: 93
#> Variables that were kept in each model: (Intercept)
## Plotting results
plot(formulaVS, type = "b")
keep
will ensure that those
variables are kept through the selection process.# Example of using keep
keepVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma",
link = "inverse", type = "branch and bound",
keep = c("hp", "cyl"), metric = "AIC",
showprogress = FALSE, bestmodels = 10)
keepVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (143.17, 145.24)
#> Number of models fit: 45
#> Variables that were kept in each model: (Intercept), hp, cyl
## Getting summary and plotting results
plot(keepVS, type = "b")
## Getting coefficients for top 10 models
coef(keepVS, which = "all")
#> Model1 Model2 Model3 Model4 Model5
#> (Intercept) 6.463732e-02 7.472237e-03 0.0256340835 0.068289331 1.716490e-02
#> cyl -1.412185e-03 1.087312e-03 0.0004708918 -0.001632036 5.216391e-04
#> disp 0.000000e+00 0.000000e+00 0.0000000000 0.000000000 0.000000e+00
#> hp 7.523221e-05 7.196928e-05 0.0000530224 0.000071603 8.984711e-05
#> drat 0.000000e+00 0.000000e+00 0.0000000000 0.000000000 0.000000e+00
#> wt 1.037319e-02 8.958479e-03 0.0105114095 0.009728921 8.209926e-03
#> qsec -1.815606e-03 0.000000e+00 -0.0009269894 -0.001707464 0.000000e+00
#> vs 0.000000e+00 0.000000e+00 0.0000000000 0.000000000 0.000000e+00
#> am 0.000000e+00 0.000000e+00 0.0000000000 0.000000000 0.000000e+00
#> gear -3.860851e-03 0.000000e+00 0.0000000000 -0.004973355 -1.732000e-03
#> carb 0.000000e+00 0.000000e+00 0.0000000000 0.000886622 0.000000e+00
#> Model6 Model7 Model8 Model9
#> (Intercept) 5.958530e-02 6.575356e-02 6.604896e-02 0.0648442871
#> cyl -1.320709e-03 -1.277298e-03 -1.444133e-03 -0.0013699334
#> disp 0.000000e+00 -8.059797e-06 0.000000e+00 0.0000000000
#> hp 7.707172e-05 7.872558e-05 7.501653e-05 0.0000745678
#> drat 1.089999e-03 0.000000e+00 0.000000e+00 0.0000000000
#> wt 1.054618e-02 1.074571e-02 1.025472e-02 0.0104217585
#> qsec -1.782401e-03 -1.870329e-03 -1.867512e-03 -0.0018571564
#> vs 0.000000e+00 0.000000e+00 0.000000e+00 0.0002736861
#> am 0.000000e+00 0.000000e+00 -4.739251e-04 0.0000000000
#> gear -4.088866e-03 -4.086160e-03 -3.775475e-03 -0.0038351284
#> carb 0.000000e+00 0.000000e+00 0.000000e+00 0.0000000000
#> Model10
#> (Intercept) 0.0092037639
#> cyl 0.0008526030
#> disp 0.0000000000
#> hp 0.0000703095
#> drat 0.0000000000
#> wt 0.0090861507
#> qsec 0.0000000000
#> vs -0.0010182533
#> am 0.0000000000
#> gear 0.0000000000
#> carb 0.0000000000
VariableSelection()
# Variable selection with grouped beta parameters for species
Data <- iris
VS <- VariableSelection(Sepal.Length ~ ., data = Data, family = "gaussian",
link = "identity", metric = "AIC", bestmodels = 10,
showprogress = FALSE)
VS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using switch branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (79.12, 183.94)
#> Number of models fit: 16
#> Variables that were kept in each model: (Intercept)
## Plotting results
plot(VS, cex.names = 0.75, type = "b")
# Treating categorical variable beta parameters separately
## This function automatically groups together parameters from a categorical variable
## to avoid this, you need to create the indicator variables yourself
x <- model.matrix(Sepal.Length ~ ., data = iris)
Sepal.Length <- iris$Sepal.Length
Data <- cbind.data.frame(Sepal.Length, x[, -1])
VSCat <- VariableSelection(Sepal.Length ~ ., data = Data, family = "gaussian",
link = "identity", metric = "AIC", bestmodels = 10,
showprogress = FALSE)
VSCat
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using switch branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (79.12, 108.23)
#> Number of models fit: 21
#> Variables that were kept in each model: (Intercept)
## Plotting results
plot(VSCat, cex.names = 0.75, type = "b")