#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Module 11 (教程 10): scATAC-seq 真实示例 — Signac + PBMC 10k
# =============================================================================
#
# 这个脚本在单细胞实践教程 10 章（scATAC-seq 分析）里用到。它：
#   1. 下载 10x PBMC 10k scATAC 的 filtered peak matrix h5 + singlecell.csv
#      （约 200 MB 总计，不需要 fragment 文件）
#   2. 用 Signac 创建 ChromatinAssay → QC（基于 metadata 里的 TSS/frag 信息）
#      → TF-IDF + SVD (LSI) → 聚类 → UMAP
#   3. 估计基因活跃度、画 marker 基因活跃度、找差异 peak
#   4. 输出 6 张真实 PNG 到 static/img/tutorial/single-cell/module11/
#
# 数据来源：
#   https://cf.10xgenomics.com/samples/cell-atac/2.1.0/
#     10k_pbmc_ATACv2_nextgem_Chromium_Controller/
#
# 用法：
#   Rscript scripts/module11_scatac_sci.R
#
# 可选环境变量：
#   BIOF3_DATA_DIR=/path/to/data
#   BIOF3_OUTPUT_DIR=/path/to/static/img/tutorial/single-cell/module11
#
# 依赖：
#   Signac, Seurat, ggplot2, dplyr, patchwork, GenomicRanges,
#   EnsDb.Hsapiens.v86
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

# ---- 路径 ----------------------------------------------------------------
args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/single-cell/sc11_scatac_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- if (basename(script_dir) == "scripts" && basename(dirname(script_dir)) == "static") {
  dirname(dirname(script_dir))
} else {
  dirname(script_dir)
}

data_root <- Sys.getenv("BIOF3_DATA_DIR", file.path(path.expand("~"), "biof3-data"))
atac_dir  <- file.path(data_root, "pbmc10k-scatac")
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "modules", "module11")
)
dir.create(atac_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# ---- 下载 ----------------------------------------------------------------
base_url <- paste0(
  "https://cf.10xgenomics.com/samples/cell-atac/2.1.0/",
  "10k_pbmc_ATACv2_nextgem_Chromium_Controller/",
  "10k_pbmc_ATACv2_nextgem_Chromium_Controller_"
)

files_needed <- list(
  h5   = "filtered_peak_bc_matrix.h5",
  meta = "singlecell.csv"
)

for (key in names(files_needed)) {
  local_path <- file.path(atac_dir, files_needed[[key]])
  if (!file.exists(local_path)) {
    url <- paste0(base_url, files_needed[[key]])
    message("下载 ", files_needed[[key]], " ...")
    options(timeout = 600)
    download.file(url, destfile = local_path, mode = "wb", quiet = FALSE)
  }
}

# ---- 依赖 ----------------------------------------------------------------
suppressPackageStartupMessages({
  library(Signac)
  library(Seurat)
  library(ggplot2)
  library(dplyr)
  library(patchwork)
  library(GenomicRanges)
  library(EnsDb.Hsapiens.v86)
})

biof3_colors <- c(
  green = "#0f766e", mint  = "#43d1ae", blue  = "#2563eb",
  amber = "#f59e0b", red   = "#dc2626", slate = "#475569",
  violet = "#7c3aed", pink = "#ec4899"
)

theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(
      axis.text  = element_text(color = "#111827"),
      axis.title = element_text(color = "#111827", face = "bold"),
      plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
      plot.subtitle = element_text(color = "#526760", hjust = 0),
      legend.title = element_text(face = "bold"),
      legend.position = "right",
      strip.background = element_blank(),
      strip.text = element_text(color = "#12342f", face = "bold")
    )
}

save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- 创建对象 ------------------------------------------------------------
message("读取 peak matrix ...")
counts <- Read10X_h5(file.path(atac_dir, files_needed$h5))
metadata <- read.csv(file.path(atac_dir, files_needed$meta), row.names = 1)

# 只保留 peak matrix 里的 barcode
common_bc <- intersect(colnames(counts), rownames(metadata))
counts <- counts[, common_bc]
metadata <- metadata[common_bc, , drop = FALSE]

chrom_assay <- CreateChromatinAssay(
  counts    = counts,
  sep       = c(":", "-"),
  min.cells = 10,
  min.features = 200
)

pbmc <- CreateSeuratObject(
  counts    = chrom_assay,
  assay     = "peaks",
  meta.data = metadata
)
message("初始细胞数：", ncol(pbmc), "  peak 数：", nrow(pbmc))

# 添加基因注释
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotations) <- "UCSC"
Annotation(pbmc) <- annotations

# ---- QC ------------------------------------------------------------------
# singlecell.csv 里已经有 Cell Ranger 算好的 TSS enrichment 和 fragment 数
# 列名可能是 TSS_fragments / passed_filters / peak_region_fragments 等
message("QC: 使用 singlecell.csv 里的预计算指标")
message("metadata 列：", paste(colnames(pbmc@meta.data)[1:20], collapse = ", "))

# 用 passed_filters (总 fragment) 和 peak_region_fragments 做 QC
if ("passed_filters" %in% colnames(pbmc@meta.data)) {
  pbmc$total_fragments <- pbmc@meta.data$passed_filters
} else {
  pbmc$total_fragments <- pbmc$nCount_peaks
}

if ("frac_peak" %in% colnames(pbmc@meta.data)) {
  # 已有
} else if ("peak_region_fragments" %in% colnames(pbmc@meta.data) &&
           "passed_filters" %in% colnames(pbmc@meta.data)) {
  pbmc$frac_peak <- pbmc@meta.data$peak_region_fragments / pbmc@meta.data$passed_filters
} else {
  pbmc$frac_peak <- 1
}

# 过滤
pbmc <- subset(pbmc,
  subset = nCount_peaks > 1000 &
           nCount_peaks < 100000
)
message("QC 后保留 ", ncol(pbmc), " 个细胞")

# ---- 图 1：QC 散点图 ---------------------------------------------------
message("图 1：QC 散点图")
p1a <- ggplot(pbmc@meta.data, aes(x = log10(nCount_peaks), y = frac_peak)) +
  geom_point(size = 0.2, alpha = 0.3, color = biof3_colors["blue"]) +
  labs(x = "log10(Peak fragments)", y = "Fraction in peaks",
       title = "Fragments vs fraction in peaks") +
  theme_biof3()
p1b <- ggplot(pbmc@meta.data, aes(x = nCount_peaks)) +
  geom_histogram(bins = 80, fill = biof3_colors["green"], color = "white", linewidth = 0.2) +
  labs(x = "Peak fragments", y = "Cells", title = "Fragment count distribution") +
  theme_biof3()
p1 <- (p1a | p1b) + plot_annotation(
  title = "scATAC QC: sequencing depth and signal quality",
  subtitle = "Higher fraction in peaks = signal concentrated in meaningful open regions",
  theme = theme(plot.title = element_text(face = "bold", color = "#12342f"),
                plot.subtitle = element_text(color = "#526760"))
)
save_plot(p1, "01-qc-scatter.png", width = 12, height = 5)

# ---- TF-IDF + LSI + UMAP + 聚类 ----------------------------------------
message("TF-IDF + SVD (LSI) ...")
pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = "q0")
pbmc <- RunSVD(pbmc)

# 检查 LSI 第一维和深度的相关性
dc_plot <- DepthCor(pbmc, n = 30)
cor_vals <- dc_plot$data$nCount_peaks
names(cor_vals) <- dc_plot$data$Component
message("LSI dim 1 与深度相关性：", round(cor_vals[1], 3))

message("UMAP + 聚类 ...")
pbmc <- RunUMAP(pbmc, reduction = "lsi", dims = 2:30)
pbmc <- FindNeighbors(pbmc, reduction = "lsi", dims = 2:30)
pbmc <- FindClusters(pbmc, algorithm = 3, verbose = FALSE, resolution = 0.5)
message("聚类数：", length(levels(pbmc$seurat_clusters)))

# ---- 图 2：UMAP 聚类 ---------------------------------------------------
message("图 2：UMAP 聚类")
p2 <- DimPlot(pbmc, label = TRUE, repel = TRUE) +
  labs(title = "scATAC UMAP: TF-IDF + LSI clustering",
       subtitle = paste0(length(levels(pbmc$seurat_clusters)), " clusters, dims 2:30 (skipping dim 1)")) +
  theme_biof3()
save_plot(p2, "02-umap-clusters.png", width = 9, height = 6)

# ---- 图 3：LSI 深度相关性 -----------------------------------------------
message("图 3：LSI 各维度与测序深度的相关性")
df_cor <- data.frame(dim = seq_along(cor_vals), cor = cor_vals)
p3 <- ggplot(df_cor, aes(x = dim, y = cor)) +
  geom_col(fill = ifelse(abs(df_cor$cor) > 0.5, biof3_colors["red"], biof3_colors["slate"])) +
  geom_hline(yintercept = c(-0.5, 0.5), linetype = "dashed", color = "grey50") +
  labs(title = "LSI component correlation with sequencing depth",
       subtitle = "Dim 1 is typically depth-correlated (red); downstream analysis starts from dim 2",
       x = "LSI component", y = "Pearson r") +
  theme_biof3()
save_plot(p3, "03-lsi-depth-cor.png", width = 9, height = 5)

# ---- 图 4：差异 peak dotplot -------------------------------------------
message("图 4：差异 peak")
DefaultAssay(pbmc) <- "peaks"
da_peaks <- FindAllMarkers(
  pbmc,
  only.pos = TRUE,
  min.pct  = 0.1,
  test.use = "wilcox",
  verbose = FALSE
)
message("差异 peak 总数：", nrow(da_peaks))

top_peaks <- da_peaks %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 2) %>%
  ungroup()

p4 <- DotPlot(pbmc, features = unique(top_peaks$gene), group.by = "seurat_clusters") +
  RotatedAxis() +
  scale_color_gradient(low = "lightgrey", high = biof3_colors["violet"]) +
  labs(title = "Top differential peaks per cluster",
       subtitle = "Top 2 peaks by avg_log2FC for each cluster",
       x = NULL, y = "Cluster") +
  theme_biof3() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5))
save_plot(p4, "04-da-peaks-dotplot.png", width = 14, height = 6.5)

# ---- 图 5：top 差异 peak 在 UMAP 上的表达 ------------------------------
message("图 5：top 差异 peak 在 UMAP 上")
# 取前 6 个 cluster 各自最强的 1 个 peak
top6 <- da_peaks %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 1) %>%
  ungroup() %>%
  head(6)

p5 <- FeaturePlot(pbmc, features = top6$gene, ncol = 3,
                  cols = c("lightgrey", biof3_colors["violet"]),
                  reduction = "umap", pt.size = 0.1) &
  theme_biof3() & theme(legend.position = "right")
save_plot(p5, "05-top-peaks-umap.png", width = 13, height = 7)

# ---- 图 6：每个 cluster 的 peak 数量和深度 ----------------------------
message("图 6：各 cluster 的测序深度分布")
p6 <- VlnPlot(pbmc, features = "nCount_peaks", group.by = "seurat_clusters", pt.size = 0) +
  scale_fill_manual(values = rep(unname(biof3_colors), length.out = 20), guide = "none") +
  labs(title = "Peak fragment distribution per cluster",
       subtitle = "Confirms clustering is not driven by sequencing depth",
       x = "Cluster", y = "nCount_peaks") +
  theme_biof3()
save_plot(p6, "06-depth-per-cluster.png", width = 10, height = 5.5)

# ---- 输出清单 -----------------------------------------------------------
message("=== 已生成 module11 scATAC 图 ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("输出目录：", normalizePath(output_dir))
message("完成。")

sessionInfo()