#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Module 08 (教程 07): 多模态真实示例 — 5k PBMC CITE-seq + WNN
# =============================================================================
#
# 这个脚本在单细胞实践教程 07 章（多模态数据分析）里用到。它：
#   1. 下载 10x Genomics 5k PBMC CITE-seq filtered matrix（约 37 MB）
#   2. 分别对 RNA 和 ADT（抗体捕获）做归一化、降维
#   3. 用 Seurat 的 WNN（Weighted Nearest Neighbor）做联合聚类
#   4. 输出 6 张真实 PNG 到 static/img/tutorial/single-cell/module08/
#
# 数据来源：
#   https://cf.10xgenomics.com/samples/cell-exp/3.1.0/
#     5k_pbmc_protein_v3_nextgem/
#     5k_pbmc_protein_v3_nextgem_filtered_feature_bc_matrix.tar.gz
#
# 用法：
#   Rscript scripts/module08_cite_seq_sci.R
#
# 可选环境变量：
#   BIOF3_DATA_DIR=/path/to/data
#   BIOF3_OUTPUT_DIR=/path/to/static/img/tutorial/single-cell/module08
#
# 依赖：
#   Matrix, Seurat, ggplot2, dplyr, patchwork, RColorBrewer
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

# ---- 路径 ----------------------------------------------------------------
args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/single-cell/sc08_citeseq_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- if (basename(script_dir) == "scripts" && basename(dirname(script_dir)) == "static") {
  dirname(dirname(script_dir))
} else {
  dirname(script_dir)
}

data_root <- Sys.getenv("BIOF3_DATA_DIR", file.path(path.expand("~"), "biof3-data"))
cite_dir  <- file.path(data_root, "pbmc5k-citeseq")
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "modules", "module08")
)

dir.create(cite_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

cite_url <- paste0(
  "https://cf.10xgenomics.com/samples/cell-exp/3.1.0/",
  "5k_pbmc_protein_v3_nextgem/",
  "5k_pbmc_protein_v3_nextgem_filtered_feature_bc_matrix.tar.gz"
)
cite_tar <- file.path(cite_dir, "5k_pbmc_filtered.tar.gz")
matrix_dir <- file.path(cite_dir, "filtered_feature_bc_matrix")

# ---- 依赖 ---------------------------------------------------------------
suppressPackageStartupMessages({
  library(Matrix)
  library(Seurat)
  library(ggplot2)
  library(dplyr)
  library(patchwork)
  library(RColorBrewer)
})

biof3_colors <- c(
  green = "#0f766e", mint  = "#43d1ae", blue  = "#2563eb",
  amber = "#f59e0b", red   = "#dc2626", slate = "#475569",
  violet = "#7c3aed", pink = "#ec4899"
)

theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(
      axis.text  = element_text(color = "#111827"),
      axis.title = element_text(color = "#111827", face = "bold"),
      plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
      plot.subtitle = element_text(color = "#526760", hjust = 0),
      legend.title = element_text(face = "bold"),
      legend.position = "right",
      strip.background = element_blank(),
      strip.text = element_text(color = "#12342f", face = "bold")
    )
}

save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- 下载数据 ------------------------------------------------------------
if (!file.exists(cite_tar)) {
  message("下载 5k PBMC CITE-seq 数据（~37 MB）...")
  download.file(cite_url, destfile = cite_tar, mode = "wb", quiet = FALSE)
}
if (!file.exists(file.path(matrix_dir, "matrix.mtx.gz"))) {
  message("解压...")
  utils::untar(cite_tar, exdir = cite_dir)
}

# ---- 读入数据 ------------------------------------------------------------
message("读取 filtered_feature_bc_matrix...")
counts <- Seurat::Read10X(data.dir = matrix_dir)

# 10x 的 feature-barcode matrix 是一个 list，分别是 Gene Expression 和
# Antibody Capture 两层。Read10X 会自动拆开。
message("RNA 矩阵：", nrow(counts$`Gene Expression`), " 基因 × ", ncol(counts$`Gene Expression`), " 细胞")
message("ADT 矩阵：", nrow(counts$`Antibody Capture`), " 抗体 × ", ncol(counts$`Antibody Capture`), " 细胞")

pbmc <- Seurat::CreateSeuratObject(counts = counts$`Gene Expression`, project = "PBMC5K_CITE")
pbmc[["ADT"]] <- SeuratObject::CreateAssay5Object(counts = counts$`Antibody Capture`)

# 基础 QC
pbmc[["percent.mt"]] <- Seurat::PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 15)
message("QC 后保留 ", ncol(pbmc), " 个细胞")

# ---- RNA 分析 -----------------------------------------------------------
message("RNA: 归一化 + 高变基因 + PCA...")
DefaultAssay(pbmc) <- "RNA"
pbmc <- NormalizeData(pbmc, verbose = FALSE)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
pbmc <- ScaleData(pbmc, verbose = FALSE)
pbmc <- RunPCA(pbmc, npcs = 30, verbose = FALSE, reduction.name = "pca")

# ---- ADT 分析 -----------------------------------------------------------
message("ADT: CLR 归一化 + PCA...")
DefaultAssay(pbmc) <- "ADT"
VariableFeatures(pbmc) <- rownames(pbmc[["ADT"]])
pbmc <- NormalizeData(pbmc, normalization.method = "CLR", margin = 2, verbose = FALSE)
pbmc <- ScaleData(pbmc, verbose = FALSE)
# ADT 维度一般远小于基因数（几十个抗体），PC 数也要小
n_adt_feats <- nrow(pbmc[["ADT"]])
n_adt_pcs <- min(18, n_adt_feats - 1)
pbmc <- RunPCA(pbmc, npcs = n_adt_pcs, reduction.name = "apca", verbose = FALSE)

# ---- WNN --------------------------------------------------------------
message("WNN: 联合 RNA + ADT 邻居图...")
pbmc <- FindMultiModalNeighbors(
  pbmc,
  reduction.list = list("pca", "apca"),
  dims.list      = list(1:30, 1:n_adt_pcs),
  modality.weight.name = "RNA.weight"
)

pbmc <- RunUMAP(pbmc, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")
pbmc <- FindClusters(pbmc, graph.name = "wsnn", algorithm = 3, resolution = 0.5, verbose = FALSE)

# 同时分别做 RNA-only 和 ADT-only UMAP 作对比
DefaultAssay(pbmc) <- "RNA"
pbmc <- RunUMAP(pbmc, reduction = "pca", dims = 1:30, reduction.name = "rna.umap", reduction.key = "rnaUMAP_")
DefaultAssay(pbmc) <- "ADT"
pbmc <- RunUMAP(pbmc, reduction = "apca", dims = 1:n_adt_pcs, reduction.name = "adt.umap", reduction.key = "adtUMAP_")

# ---- 图 1：三种 UMAP 对比 ----------------------------------------------
message("图 1：三种 UMAP 对比")
p_rna <- DimPlot(pbmc, reduction = "rna.umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
  labs(title = "RNA only") + theme_biof3() + theme(legend.position = "none")
p_adt <- DimPlot(pbmc, reduction = "adt.umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
  labs(title = "ADT only") + theme_biof3() + theme(legend.position = "none")
p_wnn <- DimPlot(pbmc, reduction = "wnn.umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) +
  labs(title = "WNN joint") + theme_biof3() + theme(legend.position = "none")
p1 <- (p_rna | p_adt | p_wnn) + plot_annotation(
  title = "Clustering across three UMAP embeddings",
  subtitle = "Same WNN cluster labels projected onto RNA-only, ADT-only, and WNN joint UMAP",
  theme = theme(plot.title = element_text(face = "bold", color = "#12342f"),
                plot.subtitle = element_text(color = "#526760"))
)
save_plot(p1, "01-umap-rna-adt-wnn.png", width = 14, height = 5)

# ---- 图 2：WNN modality weight -----------------------------------------
message("图 2：每个细胞的 RNA 权重")
pbmc$rna_weight <- pbmc@meta.data[["RNA.weight"]]
p2 <- FeaturePlot(pbmc, features = "rna_weight", reduction = "wnn.umap") +
  scale_color_gradientn(colors = rev(brewer.pal(11, "RdBu"))) +
  labs(title = "RNA modality weight in WNN",
       subtitle = "Near 1 = cell identity driven by RNA; near 0 = driven by ADT") +
  theme_biof3()
save_plot(p2, "02-wnn-rna-weight.png", width = 8.5, height = 5.5)

# ---- 图 3：几个经典 ADT marker 在 WNN UMAP 上 ---------------------------
message("图 3：ADT marker 在 WNN UMAP")
DefaultAssay(pbmc) <- "ADT"
# 5k PBMC 标配 panel 里一定有的 marker：CD3、CD4、CD8、CD19、CD14、CD56
# 实际 feature 名取决于 10x panel，有些是 "CD3", 有些是 "CD3_TotalSeqB"
ab_all <- rownames(pbmc[["ADT"]])
# Seurat 创建 Assay 时会把下划线换成连字符，所以这里匹配时两种都兼容
find_first <- function(patterns) {
  for (p in patterns) {
    hit <- grep(p, ab_all, value = TRUE, ignore.case = TRUE)
    if (length(hit) > 0) return(hit[1])
  }
  return(NA)
}
# 统一写法：名字[_-]? 既能匹配 "CD3"、"CD3_TotalSeqB"、也能匹配 "CD3-TotalSeqB"
adt_markers <- c(
  CD3  = find_first(c("^CD3[-_]", "^CD3$", "^CD3e")),
  CD4  = find_first(c("^CD4[-_]", "^CD4$")),
  CD8  = find_first(c("^CD8a[-_]", "^CD8A[-_]", "^CD8[-_]", "^CD8a$", "^CD8$")),
  CD19 = find_first(c("^CD19[-_]", "^CD19$")),
  CD14 = find_first(c("^CD14[-_]", "^CD14$")),
  CD56 = find_first(c("^CD56[-_]", "^CD56$"))
)
adt_markers <- adt_markers[!is.na(adt_markers)]
message("使用的 ADT marker: ", paste(names(adt_markers), "=", adt_markers, collapse = "; "))

if (length(adt_markers) > 0) {
  p3 <- FeaturePlot(pbmc, features = unname(adt_markers), reduction = "wnn.umap",
                    cols = c("lightgrey", biof3_colors["red"]), ncol = 3) &
    theme_biof3() &
    theme(legend.position = "right")
  save_plot(p3, "03-adt-markers-wnn.png", width = 13, height = 7)
}

# ---- 图 4：同一 marker 在 RNA vs ADT 层的对比 --------------------------
message("图 4：RNA vs ADT 同一 marker 对比（CD3）")
# 选 CD3D (RNA) 和 CD3 (ADT)
rna_cd3 <- intersect(c("CD3D", "CD3E", "CD3G"), rownames(pbmc[["RNA"]]))[1]
adt_cd3 <- adt_markers["CD3"]
if (!is.na(rna_cd3) && !is.na(adt_cd3)) {
  DefaultAssay(pbmc) <- "RNA"
  p_r <- FeaturePlot(pbmc, features = rna_cd3, reduction = "wnn.umap",
                     cols = c("lightgrey", biof3_colors["blue"])) +
    labs(title = paste0("RNA: ", rna_cd3)) + theme_biof3()
  DefaultAssay(pbmc) <- "ADT"
  p_a <- FeaturePlot(pbmc, features = adt_cd3, reduction = "wnn.umap",
                     cols = c("lightgrey", biof3_colors["red"])) +
    labs(title = paste0("ADT: ", adt_cd3)) + theme_biof3()
  p4 <- (p_r | p_a) + plot_annotation(
    title = "CD3: RNA vs protein expression",
    subtitle = "ADT signal is sharper and more digital, making cell-type boundaries cleaner",
    theme = theme(plot.title = element_text(face = "bold", color = "#12342f"),
                  plot.subtitle = element_text(color = "#526760"))
  )
  save_plot(p4, "04-cd3-rna-vs-adt.png", width = 12, height = 5)
}

# ---- 图 5：ADT marker 在各 cluster 的热图 ------------------------------
message("图 5：ADT marker 在各 cluster 的平均表达")
DefaultAssay(pbmc) <- "ADT"
ab_top <- head(rownames(pbmc[["ADT"]]), 20)  # 取 ADT panel 前 20 个抗体
p5 <- DotPlot(pbmc, features = ab_top, group.by = "seurat_clusters") +
  RotatedAxis() +
  scale_color_gradient(low = "lightgrey", high = biof3_colors["red"]) +
  labs(title = "ADT panel expression per cluster",
       subtitle = "Antibody signatures can directly annotate cluster identity",
       x = NULL, y = "Cluster") +
  theme_biof3() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
save_plot(p5, "05-adt-dotplot.png", width = 12, height = 6.5)

# ---- 图 6：WNN-only 结果的更大 UMAP -----------------------------------
message("图 6：WNN UMAP 完整视图")
p6 <- DimPlot(pbmc, reduction = "wnn.umap", group.by = "seurat_clusters", label = TRUE, label.size = 4, repel = TRUE) +
  labs(title = "5k PBMC CITE-seq WNN clustering",
       subtitle = "Leiden clustering on the joint RNA + ADT neighbor graph") +
  theme_biof3() +
  theme(legend.position = "right")
save_plot(p6, "06-wnn-clusters-labeled.png", width = 9.5, height = 6.5)

# ---- 输出清单 -----------------------------------------------------------
message("=== 已生成 module08 CITE-seq WNN 图 ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("输出目录：", normalizePath(output_dir))
message("完成。")

sessionInfo()
