## ----echo = FALSE, message = FALSE, warning=FALSE----------------------------- library(dendextend) library(knitr) knitr::opts_chunk$set( cache = TRUE, dpi = 75, fig.width = 6, fig.height = 6, # dev = "svg", # comment = "#>", tidy = FALSE) # https://stackoverflow.com/questions/24091735/why-pandoc-does-not-retrieve-the-image-file # < ! -- rmarkdown v1 --> ## ----------------------------------------------------------------------------- iris <- datasets::iris iris2 <- iris[,-5] species_labels <- iris[,5] library(colorspace) # get nice colors species_col <- rev(rainbow_hcl(3))[as.numeric(species_labels)] ## ----fig.width=9, fig.height=9, fig.show='hold'------------------------------- # Plot a SPLOM: pairs(iris2, col = species_col, lower.panel = NULL, cex.labels=2, pch=19, cex = 1.2) # Add a legend par(xpd = TRUE) legend(x = 0.05, y = 0.4, cex = 2, legend = as.character(levels(species_labels)), fill = unique(species_col)) par(xpd = NA) ## ----fig.height=3------------------------------------------------------------- # http://blog.safaribooksonline.com/2014/03/31/mastering-parallel-coordinate-charts-r/ par(las = 1, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8) MASS::parcoord(iris2, col = species_col, var.label = TRUE, lwd = 2) # Add Title title("Parallel coordinates plot of the Iris data") # Add a legend par(xpd = TRUE) legend(x = 1.75, y = -.25, cex = 1, legend = as.character(levels(species_labels)), fill = unique(species_col), horiz = TRUE) par(xpd = NA) ## ----fig.height = 10, fig.width=7--------------------------------------------- d_iris <- dist(iris2) # method="man" # is a bit better hc_iris <- hclust(d_iris, method = "complete") iris_species <- rev(levels(iris[,5])) library(dendextend) dend <- as.dendrogram(hc_iris) # order it the closest we can to the order of the observations: dend <- rotate(dend, 1:150) # Color the branches based on the clusters: dend <- color_branches(dend, k=3) #, groupLabels=iris_species) # Manually match the labels, as much as possible, to the real classification of the flowers: labels_colors(dend) <- rainbow_hcl(3)[sort_levels_values( as.numeric(iris[,5])[order.dendrogram(dend)] )] # We shall add the flower type to the labels: labels(dend) <- paste(as.character(iris[,5])[order.dendrogram(dend)], "(",labels(dend),")", sep = "") # We hang the dendrogram a bit: dend <- hang.dendrogram(dend,hang_height=0.1) # reduce the size of the labels: # dend <- assign_values_to_leaves_nodePar(dend, 0.5, "lab.cex") dend <- set(dend, "labels_cex", 0.5) # And plot: par(mar = c(3,3,3,7)) plot(dend, main = "Clustered Iris data set (the labels give the true flower species)", horiz = TRUE, nodePar = list(cex = .007)) legend("topleft", legend = iris_species, fill = rainbow_hcl(3)) #### BTW, notice that: # labels(hc_iris) # no labels, because "iris" has no row names # is.integer(labels(dend)) # this could cause problems... # is.character(labels(dend)) # labels are no longer "integer" ## ----fig.width=7, fig.height=7------------------------------------------------ # Requires that the circlize package will be installed par(mar = rep(0,4)) circlize_dendrogram(dend) ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # # some_col_func <- function(n, top_color = "red4") { # # seq_cols <- c("#F7FCFD", "#E0ECF4", "#BFD3E6", "#9EBCDA", "#8C96C6", "#8C6BB1", # # "#88419D", "#810F7C") # # c(colorRampPalette(seq_cols, bias =1)(n-1), top_color) # # } # ## ----fig.width=9, fig.height=9------------------------------------------------ some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5))) # scaled_iris2 <- iris2 %>% as.matrix %>% scale # library(gplots) gplots::heatmap.2(as.matrix(iris2), main = "Heatmap for the Iris data set", srtCol = 20, dendrogram = "row", Rowv = dend, Colv = "NA", # this to make sure the columns are not ordered trace="none", margins =c(5,0.1), key.xlab = "Cm", denscol = "grey", density.info = "density", RowSideColors = rev(labels_colors(dend)), # to add nice colored strips col = some_col_func ) ## ----cache = FALSE, eval = FALSE---------------------------------------------- # heatmaply::heatmaply(as.matrix(iris2), # dendrogram = "row", # Rowv = dend) ## ----------------------------------------------------------------------------- hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", "median", "centroid", "ward.D2") iris_dendlist <- dendlist() for(i in seq_along(hclust_methods)) { hc_iris <- hclust(d_iris, method = hclust_methods[i]) iris_dendlist <- dendlist(iris_dendlist, as.dendrogram(hc_iris)) } names(iris_dendlist) <- hclust_methods iris_dendlist ## ----fig.width=8, fig.height=8------------------------------------------------ iris_dendlist_cor <- cor.dendlist(iris_dendlist) iris_dendlist_cor corrplot::corrplot(iris_dendlist_cor, "pie", "lower") ## ----fig.width=8, fig.height=8------------------------------------------------ iris_dendlist_cor_spearman <- cor.dendlist(iris_dendlist, method_coef = "spearman") corrplot::corrplot(iris_dendlist_cor_spearman, "pie", "lower") ## ----fig.height=5------------------------------------------------------------- # The `which` parameter allows us to pick the elements in the list to compare iris_dendlist %>% dendlist(which = c(1,8)) %>% ladderize %>% set("branches_k_color", k=3) %>% # untangle(method = "step1side", k_seq = 3:20) %>% # set("clear_branches") %>% #otherwise the single lines are not black, since they retain the previous color from the branches_k_color. tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE) ## ----fig.height=5------------------------------------------------------------- # The `which` parameter allows us to pick the elements in the list to compare iris_dendlist %>% dendlist(which = c(1,4)) %>% ladderize %>% set("branches_k_color", k=2) %>% # untangle(method = "step1side", k_seq = 3:20) %>% tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE) ## ----fig.height=5------------------------------------------------------------- # The `which` parameter allows us to pick the elements in the list to compare iris_dendlist %>% dendlist(which = c(1,4)) %>% ladderize %>% # untangle(method = "step1side", k_seq = 3:20) %>% set("rank_branches") %>% tanglegram(common_subtrees_color_branches = TRUE) ## ----------------------------------------------------------------------------- length(unique(common_subtrees_clusters(iris_dendlist[[1]], iris_dendlist[[4]]))[-1]) # -1 at the end is because we are ignoring the "0" subtree, which indicates leaves that are singletons. ## ----fig.height=5------------------------------------------------------------- iris_dendlist %>% dendlist(which = c(3,4)) %>% ladderize %>% untangle(method = "step1side", k_seq = 2:6) %>% set("branches_k_color", k=2) %>% tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE) ## ----fig.height=15------------------------------------------------------------ par(mfrow = c(4,2)) for(i in 1:8) { iris_dendlist[[i]] %>% set("branches_k_color", k=2) %>% plot(axes = FALSE, horiz = TRUE) title(names(iris_dendlist)[i]) } ## ----------------------------------------------------------------------------- iris_dendlist_cor2 <- cor.dendlist(iris_dendlist, method = "common") iris_dendlist_cor2 ## ----fig.width=5, fig.height=5------------------------------------------------ # corrplot::corrplot(iris_dendlist_cor2, "pie", "lower") ## ----------------------------------------------------------------------------- get_ordered_3_clusters <- function(dend) { cutree(dend, k = 3)[order.dendrogram(dend)] } dend_3_clusters <- lapply(iris_dendlist, get_ordered_3_clusters) compare_clusters_to_iris <- function(clus) {FM_index(clus, rep(1:3, each = 50), assume_sorted_vectors = TRUE)} clusters_performance <- sapply(dend_3_clusters, compare_clusters_to_iris) dotchart(sort(clusters_performance), xlim = c(0.7,1), xlab = "Fowlkes-Mallows Index (from 0 to 1)", main = "Perormance of clustering algorithms \n in detecting the 3 species", pch = 19) ## ----------------------------------------------------------------------------- train <- dendextend::khan$train test <- dendextend::khan$test ## ----------------------------------------------------------------------------- d_train <- train %>% dist %>% hclust %>% as.dendrogram d_test <- test %>% dist %>% hclust %>% as.dendrogram d_train_test <- dendlist(train = d_train, test = d_test) ## ----------------------------------------------------------------------------- d_train_test %>% cor.dendlist ## ----------------------------------------------------------------------------- d_train_test %>% cor.dendlist(method_coef = "spearman") ## ----------------------------------------------------------------------------- Bk_plot(d_train, d_test, k = 2:30, xlim = c(2,30)) ## ----fig.width=8, fig.height=5------------------------------------------------ pre_tang_d_train_test <- d_train_test %>% ladderize %>% # untangle %>% set("branches_k_color", k = 7) train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_test$train) pre_tang_d_train_test %>% tanglegram(fast = TRUE, color_lines = train_branches_colors) ## ----echo = FALSE------------------------------------------------------------- # dput(d_train_test_common) d_train_test_common <- structure(list(train = structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 7.1369942952198), structure(list(structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.55936539399907), structure(list(structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 5.17910461856101), structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 4.27052507661529), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.37572388944875), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.31301518861595), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.71864544948399)), members = 4, midpoint = 1.5, height = 6.35097701381449)), members = 6, midpoint = 2, height = 8.7097033164167)), members = 8, midpoint = 2.25, height = 9.23807936424017)), members = 10, midpoint = 2.375, height = 11.6573350998416)), members = 12, midpoint = 2.4375, height = 17.5620766260713)), members = 14, midpoint = 2.46875, height = 30.2363452779928, class = "dendrogram"), test = structure(list(structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.96666017450449), structure(list(structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.31497882927685), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.75475236429532), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.34617375921535)), members = 4, midpoint = 1.5, height = 2.76465021476497)), members = 6, midpoint = 2, height = 4.52927251774499), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 0.714433271901582), structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.71895552589356)), members = 4, midpoint = 1.5, height = 6.44143803354499)), members = 10, midpoint = 4.75, height = 7.736516720075)), members = 12, midpoint = 3.625, height = 11.0066972375913), structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.6486307417989)), members = 14, midpoint = 8.0625, height = 18.2331742971431, class = "dendrogram")), class = "dendlist", .Names = c("train", "test")) ## ----------------------------------------------------------------------------- # This was calculated before # d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist # d_train_test_common d_train_test_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE) ## ----------------------------------------------------------------------------- d_train_test %>% nleaves d_train_test_common %>% nleaves ## ----------------------------------------------------------------------------- votes.repub <- cluster::votes.repub ## ----fig.height=5------------------------------------------------------------- years <- as.numeric(gsub("X", "", colnames(votes.repub))) par(las = 2, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8) # MASS::parcoord(votes.repub, var.label = FALSE, lwd = 1) matplot(1L:ncol(votes.repub), t(votes.repub), type = "l", col = 1, lty = 1, axes = F, xlab = "", ylab = "") axis(1, at = seq_along(years), labels = years) axis(2) # Add Title title("Votes for Republican Candidate\n in Presidential Elections \n (each line is a country - over the years)") ## ----fig.width=9, fig.height=9------------------------------------------------ arcsin_transformation <- function(x) asin(x/100) dend_NA <- votes.repub %>% is.na %>% dist %>% hclust %>% as.dendrogram %>% ladderize dend <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = "com") %>% as.dendrogram %>% rotate(labels(dend_NA)) %>% color_branches(k=3) # some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5))) some_col_func <- colorspace::diverge_hcl # par(mar = c(3,3,3,3)) # library(gplots) gplots::heatmap.2(as.matrix(votes.repub), main = "Votes for\n Republican Presidential Candidate\n (clustered using complete)", srtCol = 60, dendrogram = "row", Rowv = dend, Colv = "NA", # this to make sure the columns are not ordered trace="none", margins =c(3,6), key.xlab = "% Votes for Republican\n Presidential Candidate", labCol = years, denscol = "grey", density.info = "density", col = some_col_func ) # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips ## ----------------------------------------------------------------------------- hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", "median", "centroid", "ward.D2") votes.repub_dendlist <- dendlist() for(i in seq_along(hclust_methods)) { tmp_dend <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = hclust_methods[i]) %>% as.dendrogram votes.repub_dendlist <- dendlist(votes.repub_dendlist, tmp_dend) } names(votes.repub_dendlist) <- hclust_methods # votes.repub_dendlist ## ----fig.width=8, fig.height=8------------------------------------------------ corrplot::corrplot(cor.dendlist(votes.repub_dendlist), "pie", "lower") ## ----echo=FALSE, fig.width=9, fig.height=9------------------------------------ arcsin_transformation <- function(x) asin(x/100) dend_NA <- votes.repub %>% is.na %>% dist %>% hclust %>% as.dendrogram %>% ladderize dend <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = "ave") %>% as.dendrogram %>% rotate(labels(dend_NA)) %>% color_branches(k=3) # some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5))) some_col_func <- colorspace::diverge_hcl # par(mar = c(3,3,3,3)) # library(gplots) gplots::heatmap.2(as.matrix(votes.repub), main = "Votes for\n Republican Presidential Candidate\n (clustered using average)", srtCol = 60, dendrogram = "row", Rowv = dend, Colv = "NA", # this to make sure the columns are not ordered trace="none", margins =c(3,6), key.xlab = "% Votes for Republican\n Presidential Candidate", labCol = years, denscol = "grey", density.info = "density", col = some_col_func ) # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips ## ----echo=FALSE--------------------------------------------------------------- ord1 <- c("North Carolina", "Virginia", "Tennessee", "Kentucky", "Maryland", "Delaware", "Oklahoma", "Missouri", "New Mexico", "Oregon", "Washington", "California", "West Virginia", "Hawaii", "Nevada", "Arizona", "Montana", "Idaho", "Wyoming", "Utah", "Colorado", "Alaska", "Illinois", "New York", "Indiana", "Ohio", "Connecticut", "New Hampshire", "New Jersey", "Pennsylvania", "Iowa", "South Dakota", "North Dakota", "Wisconsin", "Minnesota", "Nebraska", "Kansas", "Maine", "Michigan", "Massachusetts", "Rhode Island", "Vermont", "Alabama", "Georgia", "Louisiana", "Arkansas", "Florida", "Texas", "South Carolina", "Mississippi") ord2 <- c("North Carolina", "Virginia", "Tennessee", "Oklahoma", "Kentucky", "Maryland", "Delaware", "Missouri", "New Mexico", "West Virginia", "Oregon", "Washington", "California", "Nevada", "Arizona", "Montana", "Colorado", "Alaska", "Idaho", "Wyoming", "Utah", "Hawaii", "Maine", "Illinois", "New York", "New Jersey", "Indiana", "Ohio", "Connecticut", "New Hampshire", "Pennsylvania", "Michigan", "Iowa", "South Dakota", "North Dakota", "Wisconsin", "Minnesota", "Massachusetts", "Rhode Island", "Nebraska", "Kansas", "Vermont", "Alabama", "Georgia", "Louisiana", "Arkansas", "Florida", "Texas", "South Carolina", "Mississippi" ) # dput(lapply(dends, labels)[[2]]) ## ----------------------------------------------------------------------------- dend_com <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = "com") %>% as.dendrogram %>% rotate(labels(dend_NA)) %>% color_branches(k=3) # %>% ladderize dend_ave <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = "ave") %>% as.dendrogram %>% rotate(labels(dend_NA)) %>% color_branches(k=3) # %>% ladderize # The orders were predefined after using untangle("step2side") # They are omitted here to save running time. dend_com <- rotate(dend_com, ord1) dend_ave <- rotate(dend_ave, ord2) dends <- dendlist(complete = dend_com, average = dend_ave) # %>% untangle("step2side") dends %>% tanglegram(margin_inner = 7) ## ----------------------------------------------------------------------------- animals <- cluster::animals colnames(animals) <- c("warm-blooded", "can fly", "vertebrate", "endangered", "live in groups", "have hair") ## ----fig.width=9, fig.height=9------------------------------------------------ dend_r <- animals %>% dist(method = "man") %>% hclust(method = "ward.D") %>% as.dendrogram %>% ladderize %>% color_branches(k=4) dend_c <- t(animals) %>% dist(method = "man") %>% hclust(method = "com") %>% as.dendrogram %>% ladderize%>% color_branches(k=3) # some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5))) # some_col_func <- colorspace::diverge_hcl # some_col_func <- colorspace::sequential_hcl some_col_func <- function(n) (colorspace::diverge_hcl(n, h = c(246, 40), c = 96, l = c(65, 90))) # par(mar = c(3,3,3,3)) # library(gplots) gplots::heatmap.2(as.matrix(animals-1), main = "Attributes of Animals", srtCol = 35, Rowv = dend_r, Colv = dend_c, trace="row", hline = NA, tracecol = "darkgrey", margins =c(6,3), key.xlab = "no / yes", denscol = "grey", density.info = "density", col = some_col_func ) ## ----------------------------------------------------------------------------- hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", "median", "centroid", "ward.D2") animals_dendlist <- dendlist() for(i in seq_along(hclust_methods)) { tmp_dend <- animals %>% dist(method = "man") %>% hclust(method = hclust_methods[i]) %>% as.dendrogram animals_dendlist <- dendlist(animals_dendlist, tmp_dend) } names(animals_dendlist) <- hclust_methods # votes.repub_dendlist ## ----fig.width=8, fig.height=8------------------------------------------------ cophenetic_cors <- cor.dendlist(animals_dendlist) corrplot::corrplot(cophenetic_cors, "pie", "lower") ## ----------------------------------------------------------------------------- remove_median <- dendlist(animals_dendlist, which = c(1:8)[-6] ) FM_cors <- cor.dendlist(remove_median, method = "FM_index", k = 4) corrplot::corrplot(FM_cors, "pie", "lower")