From boxplots to interactive volcano plots — build every figure a bioinformatics paper needs, with the grammar of graphics and modern R packages.
ggplot2 is built on Leland Wilkinson's "Grammar of Graphics" — the idea that every statistical graphic is composed of layers. Understanding these layers gives you total control over your figures:
aes) — mappings from data columns to visual properties (x, y, color, size, shape)geom_*) — the visual marks (points, lines, bars, boxplots)
tidyr::pivot_longer() to reshape wide data before plotting. If your data isn't tidy, your ggplot code will fight you every step of the way.
install.packages(c("tidyverse", "pheatmap", "RColorBrewer",
"plotly", "ggrepel", "patchwork"))
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("EnhancedVolcano", "ComplexHeatmap"))
library(tidyverse) # includes ggplot2, dplyr, tidyr, readr
library(pheatmap)
library(RColorBrewer)
library(plotly)
library(ggrepel)
library(patchwork) # combine multiple ggplots
library(EnhancedVolcano)
Boxplots summarize distribution with median, quartiles, and outliers. Violin plots add a kernel density estimate, showing the full distribution shape. Combining both gives the most informative view.
# Simulate some gene expression data across organisms and gene families
set.seed(42)
expr_data <- expand.grid(
organism = c("Human", "Mouse", "Zebrafish"),
family = paste0("GeneFamily_", LETTERS[1:4]),
rep = 1:20
) %>%
mutate(
expression = rnorm(n(), mean = 500, sd = 150) +
as.numeric(factor(organism)) * 50 +
as.numeric(factor(family)) * 30
)
# Simple boxplot
ggplot(expr_data, aes(x = organism, y = expression)) +
geom_boxplot() +
theme_minimal(base_size = 14) +
labs(title = "Gene Expression by Organism",
x = "Organism", y = "Expression (counts)")
ggplot(expr_data, aes(x = organism, y = expression, fill = organism)) +
geom_boxplot(outlier.shape = 21, outlier.size = 2, alpha = 0.8) +
facet_wrap(~ family, scales = "free_y") +
scale_fill_brewer(palette = "Set2") +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
) +
labs(title = "Expression Distribution by Gene Family",
x = NULL, y = "Expression")
# The most informative single-plot approach
ggplot(expr_data, aes(x = organism, y = expression, fill = organism)) +
geom_violin(alpha = 0.4, width = 0.9) + # distribution shape
geom_boxplot(width = 0.15, outlier.shape = NA) + # summary stats
geom_jitter(width = 0.05, alpha = 0.3, size = 1) + # individual points
scale_fill_brewer(palette = "Pastel1") +
theme_minimal(base_size = 14) +
theme(legend.position = "none") +
labs(title = "Violin + Boxplot + Jitter",
subtitle = "Shows distribution, summary, and individual observations",
x = NULL, y = "Expression")
ggbeeswarm) — like jitter but non-overlapping; elegant for small nPCA (Principal Component Analysis) reduces high-dimensional expression data to 2D for visualization. It reveals global patterns: do samples cluster by condition? Is there a batch effect?
# Assuming you have a DESeq2 vsd object from the RNA-Seq tutorial
# vsd <- vst(dds, blind = FALSE)
# Method 1: DESeq2 built-in (quick)
plotPCA(vsd, intgroup = "condition") +
theme_minimal(base_size = 14)
# Method 2: Custom PCA with more control
pca_data <- prcomp(t(assay(vsd)), center = TRUE, scale. = TRUE)
pct_var <- round(100 * summary(pca_data)$importance[2, 1:2], 1)
pca_df <- data.frame(
PC1 = pca_data$x[, 1],
PC2 = pca_data$x[, 2],
condition = colData(vsd)$condition,
sample = colnames(vsd)
)
ggplot(pca_df, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 4, alpha = 0.9) +
geom_text_repel(aes(label = sample), size = 3, max.overlaps = 10) +
stat_ellipse(type = "norm", linetype = 2, level = 0.95) +
scale_color_brewer(palette = "Set1") +
theme_minimal(base_size = 14) +
labs(
title = "PCA of RNA-Seq Samples",
x = paste0("PC1 (", pct_var[1], "% variance)"),
y = paste0("PC2 (", pct_var[2], "% variance)")
)
The volcano plot is the signature visualization for differential expression results. X-axis = effect size (log2FC), Y-axis = significance (-log10 padj).
# Assuming res_df from DESeq2 tutorial (gene, log2FoldChange, padj columns)
res_df <- res_df %>%
mutate(
significance = case_when(
padj < 0.05 & log2FoldChange > 1 ~ "Up",
padj < 0.05 & log2FoldChange < -1 ~ "Down",
TRUE ~ "NS"
)
)
# Count genes in each category for the subtitle
up_n <- sum(res_df$significance == "Up", na.rm = TRUE)
down_n <- sum(res_df$significance == "Down", na.rm = TRUE)
ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = significance)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_manual(values = c("Up" = "#e74c3c", "Down" = "#3498db", "NS" = "grey70")) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey40") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey40") +
geom_text_repel(
data = res_df %>% filter(padj < 1e-10, abs(log2FoldChange) > 2) %>% head(15),
aes(label = gene), size = 3, max.overlaps = 15, color = "black"
) +
theme_minimal(base_size = 14) +
labs(
title = "Volcano Plot: Treated vs Control",
subtitle = paste0(up_n, " upregulated, ", down_n, " downregulated (padj<0.05, |LFC|>1)"),
x = expression(log[2]~"Fold Change"),
y = expression(-log[10]~"adjusted p-value"),
color = "Status"
)
library(EnhancedVolcano) EnhancedVolcano(res_df, lab = res_df$gene, x = 'log2FoldChange', y = 'padj', pCutoff = 0.05, FCcutoff = 1.0, title = 'Treated vs Control', subtitle = 'Shrunken log2FC, BH-adjusted p-values', pointSize = 2.5, labSize = 3.5, drawConnectors = TRUE)
MA plots show log2FC (M) vs. mean expression (A). They reveal whether fold changes are biased by expression level.
ggplot(res_df, aes(x = log10(baseMean + 1), y = log2FoldChange, color = significance)) +
geom_point(alpha = 0.4, size = 1) +
scale_color_manual(values = c("Up" = "#e74c3c", "Down" = "#3498db", "NS" = "grey75")) +
geom_hline(yintercept = 0, color = "black", linewidth = 0.5) +
theme_minimal(base_size = 14) +
labs(
title = "MA Plot",
x = expression(log[10]~"Mean Expression"),
y = expression(log[2]~"Fold Change"),
color = NULL
)
Heatmaps are essential for showing patterns across many genes and samples simultaneously. We cover three tools: base R heatmap(), pheatmap, and ComplexHeatmap.
library(pheatmap)
# Top 50 DE genes, variance-stabilized expression, row-centered
top_genes <- res_df %>% filter(padj < 0.05) %>% arrange(padj) %>% head(50) %>% pull(gene)
mat <- assay(vsd)[top_genes, ]
mat <- mat - rowMeans(mat) # Center each gene to mean 0
# Annotation bar for columns
anno_col <- data.frame(
row.names = colnames(mat),
Condition = colData(vsd)$condition
)
pheatmap(
mat,
scale = "none", # already centered
cluster_rows = TRUE,
cluster_cols = TRUE,
clustering_method = "ward.D2", # compact clusters
clustering_distance_rows = "correlation",
show_rownames = TRUE,
show_colnames = TRUE,
annotation_col = anno_col,
color = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100),
border_color = NA,
fontsize_row = 7,
main = "Top 50 DE Genes (Row-Centered)"
)
| Parameter | Options & When to Use |
|---|---|
scale | "none" if pre-centered. "row" to z-score normalize per gene. "column" rarely used. |
clustering_method | "ward.D2" (tight clusters), "complete" (default), "average" (UPGMA — used in phylogenetics) |
clustering_distance_rows | "correlation" for expression (groups co-expressed genes). "euclidean" for magnitude-based clustering. |
color | Diverging palette (blue-white-red) for centered data. Sequential palette for raw values. Never use rainbow. |
cutree_rows | Integer — cuts the row dendrogram into N groups and adds gap lines. Great for showing module structure. |
library(ComplexHeatmap)
library(circlize)
col_fun <- colorRamp2(c(-2, 0, 2), c("#2166AC", "white", "#B2182B"))
ha <- HeatmapAnnotation(
Condition = colData(vsd)$condition,
col = list(Condition = c("Control" = "#4DAF4A", "Treated" = "#E41A1C"))
)
Heatmap(
mat,
name = "Z-score",
col = col_fun,
top_annotation = ha,
show_row_names = TRUE,
show_column_names = TRUE,
row_names_gp = gpar(fontsize = 6),
clustering_method_rows = "ward.D2",
clustering_distance_rows = "pearson",
column_title = "Top 50 Differentially Expressed Genes"
)
# Euclidean distance between samples
dist_mat <- as.matrix(dist(t(assay(vsd))))
pheatmap(
dist_mat,
clustering_distance_rows = as.dist(dist_mat),
clustering_distance_cols = as.dist(dist_mat),
color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
annotation_col = anno_col,
main = "Sample-to-Sample Distance"
)
Static figures are great for papers, but interactive charts let you hover over data points — perfect for exploring results, presentations, and supplementary material.
library(plotly)
# Turn any ggplot into an interactive plot with one line
p <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj),
color = significance, text = gene)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_manual(values = c("Up" = "#e74c3c", "Down" = "#3498db", "NS" = "grey70")) +
theme_minimal() +
labs(title = "Interactive Volcano Plot")
ggplotly(p, tooltip = c("text", "x", "y"))
# Save as self-contained HTML
htmlwidgets::saveWidget(ggplotly(p), "volcano_interactive.html")
ggplotly() trick works for PCA plots — hover to see sample names, zoom into clusters, and share the HTML file with collaborators who don't use R.
# Recommended theme stack for publications
theme_pub <- theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.title = element_text(face = "bold"),
legend.position = "bottom",
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold")
)
# Apply to any ggplot
my_plot + theme_pub
| Use Case | Palette | Code |
|---|---|---|
| Categorical (up to 8 groups) | Brewer "Set2" or "Dark2" | scale_color_brewer(palette = "Set2") |
| Diverging (heatmaps, fold change) | Blue-White-Red | scale_fill_gradient2(low="blue", mid="white", high="red") |
| Sequential (expression level) | Viridis | scale_fill_viridis_c() |
| Colorblind-safe | Okabe-Ito or viridis | scale_color_manual(values = c(...)) |
# Vector formats for publication (scalable, small file size)
ggsave("figure1.pdf", plot = my_plot, width = 8, height = 6, dpi = 300)
ggsave("figure1.svg", plot = my_plot, width = 8, height = 6)
# Raster format for supplementary / web
ggsave("figure1.png", plot = my_plot, width = 8, height = 6, dpi = 300)
# Combine multiple plots with patchwork
library(patchwork)
combined <- (pca_plot | volcano_plot) / heatmap_plot +
plot_annotation(tag_levels = "A") # adds A, B, C labels
ggsave("figure_combined.pdf", combined, width = 14, height = 12)
cairo_pdf() if you need embedded fonts.