Title: | A Shiny App for Visual Exploration of Hierarchical Clustering |
---|---|
Description: | A Shiny application and functions for visual exploration of hierarchical clustering with numeric datasets. Allows users to iteratively set hyperparameters, select features and evaluate results through various plots and computation of evaluation criteria. |
Authors: | Rafael Henkin [aut, cre] |
Maintainer: | Rafael Henkin <[email protected]> |
License: | GPL-3 |
Version: | 1.1.0.9000 |
Built: | 2025-01-28 04:10:09 UTC |
Source: | https://github.com/rhenkin/visxhclust |
Annotate data frame with clusters
annotate_clusters(df, cluster_labels, long = TRUE, selected_clusters = NULL)
annotate_clusters(df, cluster_labels, long = TRUE, selected_clusters = NULL)
df |
a data frame |
cluster_labels |
list of cluster labels, automatically converted to factor. |
long |
if `TRUE`, the returned data frame is converted to long format |
selected_clusters |
optional cluster labels to filter |
Long data frame will have columns: `Cluster`, `Measurement` and `Value`.
a wide or long data frame
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) res <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(res, 2) annotated_data <- annotate_clusters(iris[, c("Petal.Length", "Sepal.Length")], cluster_labels) head(annotated_data)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) res <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(res, 2) annotated_data <- annotate_clusters(iris[, c("Petal.Length", "Sepal.Length")], cluster_labels) head(annotated_data)
Simulated binary data
bin_df
bin_df
A data frame with 200 rows and 10 variables:
variable a
variable b
variable c
variable d
variable e
variable f
variable g
variable h
variable i
variable j
package author
This is a convenience wrapper function for `facet_boxplot()`. Combined with `annotate_clusters()`, it doesn't require specifying axes in `facet_boxplot()`.
cluster_boxplots(annotated_data, ...)
cluster_boxplots(annotated_data, ...)
annotated_data |
data frame returned by `annotate_clusters()` |
... |
arguments passed to `facet_boxplot()` |
boxplots faceted by clusters
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) annotated_data <- annotate_clusters(iris[, c("Petal.Length", "Sepal.Length")], cluster_labels) cluster_boxplots(annotated_data, boxplot_colors = visxhclust::cluster_colors)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) annotated_data <- annotate_clusters(iris[, c("Petal.Length", "Sepal.Length")], cluster_labels) cluster_boxplots(annotated_data, boxplot_colors = visxhclust::cluster_colors)
List of colors used in the Shiny app for clusters
cluster_colors
cluster_colors
An object of class `character` of length 39.
Plot heatmap with cluster results and dendrogram
cluster_heatmaps( scaled_selected_data, clusters, k, cluster_colors, scaled_unselected_data = NULL, annotation = NULL )
cluster_heatmaps( scaled_selected_data, clusters, k, cluster_colors, scaled_unselected_data = NULL, annotation = NULL )
scaled_selected_data |
scaled matrix or data frame with variables used for clustering |
clusters |
hierarchical cluster results produced by `compute_clusters()` or `fastcluster::hclust()` |
k |
targeted number of clusters |
cluster_colors |
list of cluster colors to match with boxplots |
scaled_unselected_data |
(optional) scaled matrix or data frame with variables not used for clustering |
annotation |
(optional) ComplexHeatmap::columnAnnotation object |
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") species_annotation <- create_annotations(iris, "Species") cluster_heatmaps(scale(iris[c("Petal.Length", "Sepal.Length")]), clusters, 3, visxhclust::cluster_colors, annotation = species_annotation)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") species_annotation <- create_annotations(iris, "Species") cluster_heatmaps(scale(iris[c("Petal.Length", "Sepal.Length")]), clusters, 3, visxhclust::cluster_colors, annotation = species_annotation)
Compute clusters hierarchically from distance matrix
compute_clusters(dmat, linkage_method)
compute_clusters(dmat, linkage_method)
dmat |
a distance matrix |
linkage_method |
a linkage method supported by `fastcluster::hclust()` |
clusters computed by fastcluster::hclust()
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) res <- compute_clusters(dmat, "complete")
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) res <- compute_clusters(dmat, "complete")
This function applies scaling to the columns of a data frame and computes and returns a distance matrix from a chosen distance measure.
compute_dmat( x, dist_method = "euclidean", apply_scaling = FALSE, subset_cols = NULL )
compute_dmat( x, dist_method = "euclidean", apply_scaling = FALSE, subset_cols = NULL )
x |
a numeric data frame or matrix |
dist_method |
a distance measure to apply to the scaled data. Must be those supported by |
apply_scaling |
use `TRUE` to apply scaling to the columns before computing distances. Default is `FALSE` |
subset_cols |
(optional) a list of columns to subset the data |
an object of class "dist" (see `stats::dist()`)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) print(class(dmat))
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) print(class(dmat))
Compute Gap statistic for clustered data
compute_gapstat(df, clusters, gap_B = 50, max_k = 14)
compute_gapstat(df, clusters, gap_B = 50, max_k = 14)
df |
the data used to compute clusters |
clusters |
output of `compute_clusters()` |
gap_B |
number of bootstrap samples for `cluster::clusGap()`. Default is 50. |
max_k |
maximum number of clusters to compute the statistic. Default is 14. |
a data frame with the `Tab` component of `cluster::clusGap()` results
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") gap_results <- compute_gapstat(scale(data_to_cluster), clusters) head(gap_results)
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") gap_results <- compute_gapstat(scale(data_to_cluster), clusters) head(gap_results)
Metric will be computed from 2 to max_k clusters. Note that the row number in results will be different from k.
compute_metric(dmat, clusters, metric_name, max_k = 14)
compute_metric(dmat, clusters, metric_name, max_k = 14)
dmat |
distance matrix output of `compute_dmat()` |
clusters |
output of `compute_clusters()` |
metric_name |
"silhouette" or "dunn" |
max_k |
maximum number of clusters to cut using |
a data frame with columns `k` and `score`
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") compute_metric(dmat, clusters, "dunn")
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") compute_metric(dmat, clusters, "dunn")
Computes pairwise Pearson correlation; if there are fewer than 15 columns, prints the value of the correlation coefficient inside each tile.
correlation_heatmap(df)
correlation_heatmap(df)
df |
numeric data frame to compute correlations |
This function will create a ComplexHeatmap::columnAnnotation object with rows for each variable passed as argument. Character columns will be coerced into factors. For factors, the ColorBrewer palette `Set3` will be used. For non-negative numeric columns, the `PuBu` palette will be used, and for columns with negative values, the reversed `RdBu` palette will be used.
create_annotations(df, selected_variables)
create_annotations(df, selected_variables)
df |
a data frame. It can be an original unscaled data, or a scaled one |
selected_variables |
list of columns in the data frame to create annotations for |
a ComplexHeatmap::columnAnnotation object
Cut a hierarchical tree targeting k clusters
cut_clusters(clusters, k)
cut_clusters(clusters, k)
clusters |
cluster results, produced by e.g. `compute_clusters()` |
k |
target number of clusters |
cluster labels
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) head(cluster_labels)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) head(cluster_labels)
Plot a 2D MDS projection of a distance matrix
dmat_projection(dmat, point_colors = NULL, point_palette = NULL)
dmat_projection(dmat, point_colors = NULL, point_palette = NULL)
dmat |
distance matrix |
point_colors |
optional list of labels to color points (will be coerced to factor) |
point_palette |
optional palette used with |
a ggplot object
dmat <- dist(iris[, c("Sepal.Width", "Sepal.Length")]) dmat_projection(dmat)
dmat <- dist(iris[, c("Sepal.Width", "Sepal.Length")]) dmat_projection(dmat)
Faceted boxplots with points or violin plots
facet_boxplot( df, x, y, facet_var = NULL, boxplot_colors = NULL, shape = c("boxplot", "violin"), plot_points = TRUE )
facet_boxplot( df, x, y, facet_var = NULL, boxplot_colors = NULL, shape = c("boxplot", "violin"), plot_points = TRUE )
df |
a data frame containing all the variables matching the remaining arguments |
x |
categorical variable |
y |
continuous variable |
facet_var |
optional variable to facet data |
boxplot_colors |
list of colors to use as fill for boxplots |
shape |
either "boxplot" or "violin" |
plot_points |
boolean variable to overlay jittered points or not. Default is `TRUE` |
a ggplot2::ggplot object
facet_boxplot(iris, x = "Species", y = "Sepal.Length", facet_var = "Species")
facet_boxplot(iris, x = "Species", y = "Sepal.Length", facet_var = "Species")
A custom line plot with optional vertical line
line_plot(df, x, y, xintercept = NULL)
line_plot(df, x, y, xintercept = NULL)
df |
data source |
x |
variable for horizontal axis |
y |
variable for vertical axis |
xintercept |
optional value in horizontal axis to highlight |
a ggplot2::ggplot object
Simulated logscaled data
logscaled_df
logscaled_df
A data frame with 200 rows and 10 variables:
variable a
variable b
variable c
variable d
variable e
variable f
variable g
variable h
variable i
variable j
package author
Simulated normal data with annotations
normal_annotated
normal_annotated
A data frame with 200 rows, 10 variables and an annotation column:
variable a
variable b
variable c
variable d
variable e
variable f
variable g
variable h
variable i
variable j
annotation column
package author
Simulated normal data
normal_df
normal_df
A data frame with 200 rows and 10 variables:
variable a
variable b
variable c
variable d
variable e
variable f
variable g
variable h
variable i
variable j
package author
Simulated normal data with missing values
normal_missing
normal_missing
A data frame with 200 rows and 10 variables:
variable a
variable b
variable c
variable d
variable e
variable f
variable g
variable h
variable i
variable with randomly missing values
package author
This function is meant to be used with `compute_metric()`. For Gap statistic, use `cluster::maxSE()`.
optimal_score(x, method = c("firstmax", "globalmax", "firstmin", "globalmin"))
optimal_score(x, method = c("firstmax", "globalmax", "firstmin", "globalmin"))
x |
a numeric vector |
method |
one of "firstmax", "globalmax", "firstmin" or "globalmin" |
the index (not k) of the identified maximum or minimum score
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") res <- compute_metric(dmat, clusters, "dunn") optimal_score(res$score, method = "firstmax")
data_to_cluster <- iris[c("Petal.Length", "Sepal.Length")] dmat <- compute_dmat(data_to_cluster, "euclidean", TRUE) clusters <- compute_clusters(dmat, "complete") res <- compute_metric(dmat, clusters, "dunn") optimal_score(res$score, method = "firstmax")
Plot distribution of annotation data across clusters
plot_annotation_dist(annotations_df, cluster_labels, selected_clusters = NULL)
plot_annotation_dist(annotations_df, cluster_labels, selected_clusters = NULL)
annotations_df |
data frame with variables not used in clustering |
cluster_labels |
output from `cut_clusters()` |
selected_clusters |
optional vector of cluster labels to include in plots |
a `patchwork` object
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) plot_annotation_dist(iris["Species"], cluster_labels)
dmat <- compute_dmat(iris, "euclidean", TRUE, c("Petal.Length", "Sepal.Length")) clusters <- compute_clusters(dmat, "complete") cluster_labels <- cut_clusters(clusters, 2) plot_annotation_dist(iris["Species"], cluster_labels)
Runs the Shiny app
run_app()
run_app()
No return value, runs the app by passing it to print
## Only run this example in interactive R sessions if (interactive()) { library(visxhclust) run_app() }
## Only run this example in interactive R sessions if (interactive()) { library(visxhclust) run_app() }