#!/usr/bin/env Rscript # ============================================================================= # BioF3 Module 11 (教程 10): scATAC-seq 真实示例 — Signac + PBMC 10k # ============================================================================= # # 这个脚本在单细胞实践教程 10 章(scATAC-seq 分析)里用到。它: # 1. 下载 10x PBMC 10k scATAC 的 filtered peak matrix h5 + singlecell.csv # (约 200 MB 总计,不需要 fragment 文件) # 2. 用 Signac 创建 ChromatinAssay → QC(基于 metadata 里的 TSS/frag 信息) # → TF-IDF + SVD (LSI) → 聚类 → UMAP # 3. 估计基因活跃度、画 marker 基因活跃度、找差异 peak # 4. 输出 6 张真实 PNG 到 static/img/tutorial/single-cell/module11/ # # 数据来源: # https://cf.10xgenomics.com/samples/cell-atac/2.1.0/ # 10k_pbmc_ATACv2_nextgem_Chromium_Controller/ # # 用法: # Rscript scripts/module11_scatac_sci.R # # 可选环境变量: # BIOF3_DATA_DIR=/path/to/data # BIOF3_OUTPUT_DIR=/path/to/static/img/tutorial/single-cell/module11 # # 依赖: # Signac, Seurat, ggplot2, dplyr, patchwork, GenomicRanges, # EnsDb.Hsapiens.v86 # ============================================================================= options(stringsAsFactors = FALSE) set.seed(42) # ---- 路径 ---------------------------------------------------------------- args <- commandArgs(trailingOnly = FALSE) file_arg <- grep("^--file=", args, value = TRUE) script_path <- if (length(file_arg) > 0) { normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE) } else { normalizePath("scripts/single-cell/sc11_scatac_sci.R", mustWork = FALSE) } script_dir <- dirname(script_path) project_root <- if (basename(script_dir) == "scripts" && basename(dirname(script_dir)) == "static") { dirname(dirname(script_dir)) } else { dirname(script_dir) } data_root <- Sys.getenv("BIOF3_DATA_DIR", file.path(path.expand("~"), "biof3-data")) atac_dir <- file.path(data_root, "pbmc10k-scatac") output_dir <- Sys.getenv( "BIOF3_OUTPUT_DIR", file.path(project_root, "static", "img", "tutorial", "modules", "module11") ) dir.create(atac_dir, recursive = TRUE, showWarnings = FALSE) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) # ---- 下载 ---------------------------------------------------------------- base_url <- paste0( "https://cf.10xgenomics.com/samples/cell-atac/2.1.0/", "10k_pbmc_ATACv2_nextgem_Chromium_Controller/", "10k_pbmc_ATACv2_nextgem_Chromium_Controller_" ) files_needed <- list( h5 = "filtered_peak_bc_matrix.h5", meta = "singlecell.csv" ) for (key in names(files_needed)) { local_path <- file.path(atac_dir, files_needed[[key]]) if (!file.exists(local_path)) { url <- paste0(base_url, files_needed[[key]]) message("下载 ", files_needed[[key]], " ...") options(timeout = 600) download.file(url, destfile = local_path, mode = "wb", quiet = FALSE) } } # ---- 依赖 ---------------------------------------------------------------- suppressPackageStartupMessages({ library(Signac) library(Seurat) library(ggplot2) library(dplyr) library(patchwork) library(GenomicRanges) library(EnsDb.Hsapiens.v86) }) biof3_colors <- c( green = "#0f766e", mint = "#43d1ae", blue = "#2563eb", amber = "#f59e0b", red = "#dc2626", slate = "#475569", violet = "#7c3aed", pink = "#ec4899" ) theme_biof3 <- function(base_size = 12) { theme_classic(base_size = base_size) + theme( axis.text = element_text(color = "#111827"), axis.title = element_text(color = "#111827", face = "bold"), plot.title = element_text(color = "#12342f", face = "bold", hjust = 0), plot.subtitle = element_text(color = "#526760", hjust = 0), legend.title = element_text(face = "bold"), legend.position = "right", strip.background = element_blank(), strip.text = element_text(color = "#12342f", face = "bold") ) } save_plot <- function(p, name, width = 8, height = 5) { ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white") } # ---- 创建对象 ------------------------------------------------------------ message("读取 peak matrix ...") counts <- Read10X_h5(file.path(atac_dir, files_needed$h5)) metadata <- read.csv(file.path(atac_dir, files_needed$meta), row.names = 1) # 只保留 peak matrix 里的 barcode common_bc <- intersect(colnames(counts), rownames(metadata)) counts <- counts[, common_bc] metadata <- metadata[common_bc, , drop = FALSE] chrom_assay <- CreateChromatinAssay( counts = counts, sep = c(":", "-"), min.cells = 10, min.features = 200 ) pbmc <- CreateSeuratObject( counts = chrom_assay, assay = "peaks", meta.data = metadata ) message("初始细胞数:", ncol(pbmc), " peak 数:", nrow(pbmc)) # 添加基因注释 annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) seqlevelsStyle(annotations) <- "UCSC" Annotation(pbmc) <- annotations # ---- QC ------------------------------------------------------------------ # singlecell.csv 里已经有 Cell Ranger 算好的 TSS enrichment 和 fragment 数 # 列名可能是 TSS_fragments / passed_filters / peak_region_fragments 等 message("QC: 使用 singlecell.csv 里的预计算指标") message("metadata 列:", paste(colnames(pbmc@meta.data)[1:20], collapse = ", ")) # 用 passed_filters (总 fragment) 和 peak_region_fragments 做 QC if ("passed_filters" %in% colnames(pbmc@meta.data)) { pbmc$total_fragments <- pbmc@meta.data$passed_filters } else { pbmc$total_fragments <- pbmc$nCount_peaks } if ("frac_peak" %in% colnames(pbmc@meta.data)) { # 已有 } else if ("peak_region_fragments" %in% colnames(pbmc@meta.data) && "passed_filters" %in% colnames(pbmc@meta.data)) { pbmc$frac_peak <- pbmc@meta.data$peak_region_fragments / pbmc@meta.data$passed_filters } else { pbmc$frac_peak <- 1 } # 过滤 pbmc <- subset(pbmc, subset = nCount_peaks > 1000 & nCount_peaks < 100000 ) message("QC 后保留 ", ncol(pbmc), " 个细胞") # ---- 图 1:QC 散点图 --------------------------------------------------- message("图 1:QC 散点图") p1a <- ggplot(pbmc@meta.data, aes(x = log10(nCount_peaks), y = frac_peak)) + geom_point(size = 0.2, alpha = 0.3, color = biof3_colors["blue"]) + labs(x = "log10(Peak fragments)", y = "Fraction in peaks", title = "Fragments vs fraction in peaks") + theme_biof3() p1b <- ggplot(pbmc@meta.data, aes(x = nCount_peaks)) + geom_histogram(bins = 80, fill = biof3_colors["green"], color = "white", linewidth = 0.2) + labs(x = "Peak fragments", y = "Cells", title = "Fragment count distribution") + theme_biof3() p1 <- (p1a | p1b) + plot_annotation( title = "scATAC QC: sequencing depth and signal quality", subtitle = "Higher fraction in peaks = signal concentrated in meaningful open regions", theme = theme(plot.title = element_text(face = "bold", color = "#12342f"), plot.subtitle = element_text(color = "#526760")) ) save_plot(p1, "01-qc-scatter.png", width = 12, height = 5) # ---- TF-IDF + LSI + UMAP + 聚类 ---------------------------------------- message("TF-IDF + SVD (LSI) ...") pbmc <- RunTFIDF(pbmc) pbmc <- FindTopFeatures(pbmc, min.cutoff = "q0") pbmc <- RunSVD(pbmc) # 检查 LSI 第一维和深度的相关性 dc_plot <- DepthCor(pbmc, n = 30) cor_vals <- dc_plot$data$nCount_peaks names(cor_vals) <- dc_plot$data$Component message("LSI dim 1 与深度相关性:", round(cor_vals[1], 3)) message("UMAP + 聚类 ...") pbmc <- RunUMAP(pbmc, reduction = "lsi", dims = 2:30) pbmc <- FindNeighbors(pbmc, reduction = "lsi", dims = 2:30) pbmc <- FindClusters(pbmc, algorithm = 3, verbose = FALSE, resolution = 0.5) message("聚类数:", length(levels(pbmc$seurat_clusters))) # ---- 图 2:UMAP 聚类 --------------------------------------------------- message("图 2:UMAP 聚类") p2 <- DimPlot(pbmc, label = TRUE, repel = TRUE) + labs(title = "scATAC UMAP: TF-IDF + LSI clustering", subtitle = paste0(length(levels(pbmc$seurat_clusters)), " clusters, dims 2:30 (skipping dim 1)")) + theme_biof3() save_plot(p2, "02-umap-clusters.png", width = 9, height = 6) # ---- 图 3:LSI 深度相关性 ----------------------------------------------- message("图 3:LSI 各维度与测序深度的相关性") df_cor <- data.frame(dim = seq_along(cor_vals), cor = cor_vals) p3 <- ggplot(df_cor, aes(x = dim, y = cor)) + geom_col(fill = ifelse(abs(df_cor$cor) > 0.5, biof3_colors["red"], biof3_colors["slate"])) + geom_hline(yintercept = c(-0.5, 0.5), linetype = "dashed", color = "grey50") + labs(title = "LSI component correlation with sequencing depth", subtitle = "Dim 1 is typically depth-correlated (red); downstream analysis starts from dim 2", x = "LSI component", y = "Pearson r") + theme_biof3() save_plot(p3, "03-lsi-depth-cor.png", width = 9, height = 5) # ---- 图 4:差异 peak dotplot ------------------------------------------- message("图 4:差异 peak") DefaultAssay(pbmc) <- "peaks" da_peaks <- FindAllMarkers( pbmc, only.pos = TRUE, min.pct = 0.1, test.use = "wilcox", verbose = FALSE ) message("差异 peak 总数:", nrow(da_peaks)) top_peaks <- da_peaks %>% group_by(cluster) %>% slice_max(order_by = avg_log2FC, n = 2) %>% ungroup() p4 <- DotPlot(pbmc, features = unique(top_peaks$gene), group.by = "seurat_clusters") + RotatedAxis() + scale_color_gradient(low = "lightgrey", high = biof3_colors["violet"]) + labs(title = "Top differential peaks per cluster", subtitle = "Top 2 peaks by avg_log2FC for each cluster", x = NULL, y = "Cluster") + theme_biof3() + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) save_plot(p4, "04-da-peaks-dotplot.png", width = 14, height = 6.5) # ---- 图 5:top 差异 peak 在 UMAP 上的表达 ------------------------------ message("图 5:top 差异 peak 在 UMAP 上") # 取前 6 个 cluster 各自最强的 1 个 peak top6 <- da_peaks %>% group_by(cluster) %>% slice_max(order_by = avg_log2FC, n = 1) %>% ungroup() %>% head(6) p5 <- FeaturePlot(pbmc, features = top6$gene, ncol = 3, cols = c("lightgrey", biof3_colors["violet"]), reduction = "umap", pt.size = 0.1) & theme_biof3() & theme(legend.position = "right") save_plot(p5, "05-top-peaks-umap.png", width = 13, height = 7) # ---- 图 6:每个 cluster 的 peak 数量和深度 ---------------------------- message("图 6:各 cluster 的测序深度分布") p6 <- VlnPlot(pbmc, features = "nCount_peaks", group.by = "seurat_clusters", pt.size = 0) + scale_fill_manual(values = rep(unname(biof3_colors), length.out = 20), guide = "none") + labs(title = "Peak fragment distribution per cluster", subtitle = "Confirms clustering is not driven by sequencing depth", x = "Cluster", y = "nCount_peaks") + theme_biof3() save_plot(p6, "06-depth-per-cluster.png", width = 10, height = 5.5) # ---- 输出清单 ----------------------------------------------------------- message("=== 已生成 module11 scATAC 图 ===") print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE)) message("输出目录:", normalizePath(output_dir)) message("完成。") sessionInfo()