Single-Cell Multiome: Joint ATAC + RNA Analysis

Simultaneously profile chromatin accessibility (ATAC-seq) and gene expression (RNA-seq) in the same single cell. Covers Cell Ranger ARC output, Signac + Seurat WNN integration, peak calling, motif enrichment, and gene regulatory network inference linking peaks to genes.

~60 min Advanced R / Seurat / Signac Multi-omics
Byte

Why Joint ATAC + RNA?

Single-cell ATAC-seq alone tells you which genomic regions are open (accessible) in each cell — a proxy for regulatory activity. RNA-seq alone tells you gene expression. Combining them in the same cell with 10x Multiome (or SHARE-seq, PAIRED-seq) lets you:

  • Directly link cis-regulatory elements (enhancers) to the genes they regulate
  • Identify transcription factor (TF) binding sites driving cell-type-specific expression
  • Study chromatin remodelling during differentiation trajectories
  • Infer gene regulatory networks with chromatin evidence
  • Better resolve ambiguous cell types using both modalities via WNN (Weighted Nearest Neighbours)
Byte
WNN integration (Hao et al., Cell 2021) weights each modality per cell based on its information content — if RNA is noisy in a cell, it up-weights ATAC. This gives more accurate clustering than either modality alone.

Installation

R
# In R:
install.packages(c("Seurat", "Signac", "ggplot2", "dplyr", "patchwork"))
BiocManager::install(c("GenomicRanges", "BSgenome.Hsapiens.UCSC.hg38",
                        "EnsDb.Hsapiens.v86", "chromVAR", "motifmatchr",
                        "TFBSTools", "JASPAR2020"))

# Verify
library(Seurat)
library(Signac)
packageVersion("Signac")  # should be >= 1.12
Bash
# MACS2 for peak calling (Python)
pip install macs2
macs2 --version

1Load Cell Ranger ARC Output

R
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(GenomicRanges)

# Cell Ranger ARC output directory structure:
# outs/
# ├── filtered_feature_bc_matrix/    (RNA counts)
# ├── atac_fragments.tsv.gz          (ATAC fragments)
# ├── atac_peaks.bed                 (called peaks)
# └── per_barcode_metrics.csv        (QC per cell)

# Read RNA counts
rna_counts <- Read10X_h5("outs/filtered_feature_bc_matrix.h5")
rna_mat <- rna_counts$`Gene Expression`

# Read ATAC peak counts
atac_counts <- rna_counts$Peaks

# Get genome annotations for hg38
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotations) <- "UCSC"
genome(annotations) <- "hg38"

# Create ChromatinAssay for ATAC data
chrom_assay <- CreateChromatinAssay(
  counts    = atac_counts,
  sep       = c(":", "-"),
  genome    = "hg38",
  fragments = "outs/atac_fragments.tsv.gz",
  min.cells = 10,
  annotation = annotations
)

# Create Seurat object with both modalities
pbmc <- CreateSeuratObject(
  counts    = rna_mat,
  assay     = "RNA",
  min.cells = 10
)
pbmc[["ATAC"]] <- chrom_assay

cat("Cells:", ncol(pbmc), "\n")
cat("RNA features:", nrow(pbmc[["RNA"]]), "\n")
cat("ATAC peaks:", nrow(pbmc[["ATAC"]]), "\n")

2RNA QC & Processing

R
library(dplyr)

# QC metrics
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Filter cells
pbmc <- subset(
  pbmc,
  subset = nFeature_RNA > 200 &
           nFeature_RNA < 5000 &
           percent.mt < 20
)

# Normalise and find variable features
DefaultAssay(pbmc) <- "RNA"
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, nfeatures = 3000)
pbmc <- ScaleData(pbmc)
pbmc <- RunPCA(pbmc, npcs = 50)

cat("After QC:", ncol(pbmc), "cells\n")

3ATAC QC & Processing

R
DefaultAssay(pbmc) <- "ATAC"

# ATAC-specific QC metrics
pbmc <- NucleosomeSignal(pbmc)
pbmc <- TSSEnrichment(pbmc, fast = FALSE)

# Visualise ATAC QC
VlnPlot(
  pbmc,
  features = c("nCount_ATAC", "TSS.enrichment", "nucleosome_signal"),
  ncol = 3,
  pt.size = 0
)

# Filter cells based on ATAC QC
pbmc <- subset(
  pbmc,
  subset = nCount_ATAC > 1000 &
           nCount_ATAC < 100000 &
           nucleosome_signal < 2 &
           TSS.enrichment > 1
)

# Term-frequency inverse-document-frequency (TF-IDF) normalisation
pbmc <- RunTFIDF(pbmc)

# Feature selection — top accessible peaks
pbmc <- FindTopFeatures(pbmc, min.cutoff = "q5")

# SVD (LSI) dimensionality reduction
pbmc <- RunSVD(pbmc)

# Check if first LSI component correlates with depth (should be excluded)
DepthCor(pbmc)
# If component 1 correlates strongly with depth, exclude it in downstream steps

4WNN Joint Clustering

Weighted Nearest Neighbours (WNN) combines RNA PCA and ATAC LSI embeddings, weighting each modality adaptively per cell.

R
library(patchwork)

# Build WNN graph using both modalities
pbmc <- FindMultiModalNeighbors(
  pbmc,
  reduction.list = list("pca", "lsi"),
  dims.list      = list(1:50, 2:50),   # exclude LSI dim 1 (depth correlated)
  modality.weight.name = "RNA.weight"
)

# UMAP from WNN
pbmc <- RunUMAP(
  pbmc,
  nn.name  = "weighted.nn",
  reduction.name = "wnn.umap",
  reduction.key  = "wnnUMAP_"
)

# Also make RNA-only and ATAC-only UMAPs for comparison
pbmc <- RunUMAP(pbmc, reduction="pca",  dims=1:50,
                reduction.name="rna.umap", reduction.key="rnaUMAP_")
pbmc <- RunUMAP(pbmc, reduction="lsi",  dims=2:50,
                reduction.name="atac.umap", reduction.key="atacUMAP_")

# Cluster on WNN graph
pbmc <- FindClusters(pbmc, graph.name = "wsnn",
                     algorithm = 3, resolution = 0.8)

# Compare UMAPs side by side
p1 <- DimPlot(pbmc, reduction="rna.umap",  group.by="seurat_clusters",
              label=TRUE) + ggtitle("RNA only") + NoLegend()
p2 <- DimPlot(pbmc, reduction="atac.umap", group.by="seurat_clusters",
              label=TRUE) + ggtitle("ATAC only") + NoLegend()
p3 <- DimPlot(pbmc, reduction="wnn.umap",  group.by="seurat_clusters",
              label=TRUE) + ggtitle("WNN (joint)") + NoLegend()

p1 | p2 | p3
ggsave("wnn_comparison_umap.pdf", width=18, height=6)

5Peak Calling per Cell Type with MACS2

Cell Ranger ARC calls peaks globally. Re-calling peaks per cell type gives higher-resolution, cluster-specific accessibility profiles.

R
# Assign cell type labels first (from RNA markers)
pbmc <- RenameIdents(
  pbmc,
  "0"  = "CD14 Mono",
  "1"  = "CD4 T",
  "2"  = "CD8 T",
  "3"  = "B cell",
  "4"  = "NK",
  "5"  = "CD16 Mono",
  "6"  = "DC",
  "7"  = "Platelet"
)
pbmc$cell_type <- Idents(pbmc)

# Call peaks per cell type using MACS2
peaks <- CallPeaks(
  pbmc,
  group.by = "cell_type",
  macs2.path = "/path/to/macs2"
)

# Keep only standard chromosomes
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")

# Quantify fragments in new peaks
macs2_counts <- FeatureMatrix(
  fragments = Fragments(pbmc),
  features  = peaks,
  cells     = colnames(pbmc)
)

# Create new ATAC assay with MACS2 peaks
pbmc[["MACS2"]] <- CreateChromatinAssay(
  counts     = macs2_counts,
  fragments  = Fragments(pbmc),
  annotation = Annotation(pbmc)
)

cat("MACS2 peaks called:", nrow(peaks), "\n")
cat("Original peaks:", nrow(pbmc[["ATAC"]]), "\n")

6Transcription Factor Motif Enrichment

R
library(JASPAR2020)
library(TFBSTools)
library(chromVAR)
library(motifmatchr)
library(BSgenome.Hsapiens.UCSC.hg38)

DefaultAssay(pbmc) <- "MACS2"

# Get JASPAR 2020 motif database (vertebrates)
pfm <- getMatrixSet(
  JASPAR2020,
  opts = list(collection = "CORE", tax_group = "vertebrates",
              all_versions = FALSE)
)

# Scan peaks for TF motif matches
pbmc <- AddMotifs(pbmc, genome = BSgenome.Hsapiens.UCSC.hg38, pfm = pfm)

# Run chromVAR to compute per-cell TF activity scores
pbmc <- RunChromVAR(
  object = pbmc,
  genome = BSgenome.Hsapiens.UCSC.hg38
)

# Differential TF activity between cell types
DefaultAssay(pbmc) <- "chromvar"
da_tf <- FindMarkers(
  pbmc,
  ident.1  = "CD8 T",
  ident.2  = "CD4 T",
  only.pos = TRUE,
  mean.fxn = rowMeans,
  fc.name  = "avg_diff"
)

# Map motif IDs to TF names
motif_names <- GetMotifData(pbmc, slot = "motif.names")
da_tf$tf_name <- motif_names[rownames(da_tf)]

cat("Top TFs enriched in CD8 T vs CD4 T:\n")
print(head(da_tf[, c("tf_name","avg_diff","p_val_adj")], 15))

# Visualise TF activity
DefaultAssay(pbmc) <- "chromvar"
FeaturePlot(
  pbmc,
  features = c("MA0080.5", "MA0105.4"),   # IRF4, RUNX1
  reduction = "wnn.umap",
  min.cutoff = "q10",
  max.cutoff = "q90",
  ncol = 2
) &
scale_color_gradient2(low="blue", mid="lightgrey", high="red") &
ggtitle(c("IRF4 motif activity", "RUNX1 motif activity"))
ggsave("tf_motif_activity.pdf", width=12, height=5)

7Peak-to-Gene Linkage

Linking open chromatin peaks to the genes they regulate is the core multiome analysis — enabled by having ATAC and RNA from the same cell.

R
DefaultAssay(pbmc) <- "MACS2"

# Link peaks to genes using co-variation across cells
# (peaks that open/close together with gene expression → likely regulatory)
pbmc <- LinkPeaks(
  object     = pbmc,
  peak.assay = "MACS2",
  expression.assay = "RNA",
  genes.use  = VariableFeatures(pbmc)
)

# View links
links <- Links(pbmc)
cat("Total peak-gene links:", length(links), "\n")

# Filter to strong, significant links
strong_links <- links[links$score > 0.1 & links$pvalue < 0.05]
cat("Strong links (score>0.1, p<0.05):", length(strong_links), "\n")

# Visualise coverage plot for a specific gene
CoveragePlot(
  object    = pbmc,
  region    = "CD8A",
  features  = "CD8A",
  expression.assay = "RNA",
  idents    = c("CD8 T", "CD4 T", "CD14 Mono"),
  extend.upstream   = 5000,
  extend.downstream = 5000
)
ggsave("CD8A_coverage_plot.pdf", width=10, height=8)

8Differential Accessibility Analysis

R
DefaultAssay(pbmc) <- "MACS2"

# Find differentially accessible peaks between cell types
da_peaks <- FindMarkers(
  pbmc,
  ident.1  = "CD8 T",
  ident.2  = "CD4 T",
  only.pos = TRUE,
  test.use = "LR",          # logistic regression (recommended for ATAC)
  latent.vars = "nCount_ATAC"   # control for sequencing depth
)

# Annotate peaks (nearest gene, peak type: promoter/enhancer/intron)
da_peaks_annotated <- ClosestFeature(pbmc, regions = rownames(da_peaks))
da_peaks <- cbind(da_peaks, da_peaks_annotated)

cat("Differentially accessible peaks (CD8 > CD4):", nrow(da_peaks), "\n")

# Promoter vs enhancer breakdown
table(da_peaks$type)

# Volcano plot
library(ggplot2)
library(ggrepel)

da_peaks$neg_log10_padj <- -log10(da_peaks$p_val_adj + 1e-300)
da_peaks$label <- ifelse(da_peaks$p_val_adj < 1e-10 & abs(da_peaks$avg_log2FC) > 1,
                          da_peaks$gene_name, "")

ggplot(da_peaks, aes(x=avg_log2FC, y=neg_log10_padj)) +
  geom_point(aes(colour=type), alpha=0.6, size=1.2) +
  geom_text_repel(aes(label=label), size=3, max.overlaps=20) +
  geom_vline(xintercept=c(-1,1), linetype="dashed", colour="grey50") +
  geom_hline(yintercept=-log10(0.05), linetype="dashed", colour="grey50") +
  scale_colour_manual(values=c("Promoter"="#4CAF50","Distal Intergenic"="#F44336","Intron"="#FF9800")) +
  labs(x="Average log2 fold change", y="-log10(adjusted p-value)",
       title="Differentially accessible peaks: CD8 T vs CD4 T",
       colour="Peak type") +
  theme_classic(base_size=13)
ggsave("da_peaks_volcano.pdf", width=10, height=7, dpi=300)

Summary

Byte
Full Multiome pipeline:
  1. Load Cell Ranger ARC output → create Seurat object with RNA + ATAC assays
  2. Process RNA (NormalizeData → PCA) and ATAC (TF-IDF → LSI) independently
  3. Join with WNN for better clustering than either modality alone
  4. Re-call peaks per cell type with MACS2 for higher resolution
  5. TF motif enrichment with chromVAR + JASPAR2020
  6. LinkPeaks to identify cis-regulatory elements driving gene expression
  7. Differential accessibility with logistic regression controlling for depth
Related Tutorials