#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Module 06 (教程 05): 轨迹推断真实示例 — PBMC 3k + Slingshot
# =============================================================================
#
# 这个脚本在单细胞实践教程 05 章（轨迹推断）里用到。它：
#   1. 读取 PBMC 3k 真实矩阵（和 module01 / 03 / 04 用的是同一份）
#   2. 做 QC、归一化、降维、聚类（和 module03 流程一致）
#   3. 用 Slingshot 在 PCA 空间构建主曲线，拿到拟时序
#   4. 把 marker 基因表达投到拟时序上
#   5. 输出 6 张真实 PNG 到 static/img/tutorial/single-cell/module06/
#
# PBMC 3k 不是严格的分化数据集，但 B、T、NK、monocyte 这些免疫细胞
# 谱系之间的关系让 Slingshot 仍然能拟出有意义的主曲线。这一章拿它
# 做演示，读者可以在自己的分化数据（例如 HSC、胚胎发育、CD8 记忆-
# 效应）上复用相同的流程。
#
# 用法：
#   Rscript scripts/module06_trajectory_sci.R
#
# 可选环境变量：
#   BIOF3_DATA_DIR=/path/to/data
#   BIOF3_OUTPUT_DIR=/path/to/static/img/tutorial/single-cell/module06
#
# 依赖：
#   Matrix, Seurat, SingleCellExperiment, slingshot,
#   ggplot2, dplyr, scales, RColorBrewer
# =============================================================================

options(stringsAsFactors = FALSE)

# ---- 路径 ----------------------------------------------------------------
args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/single-cell/sc06_trajectory_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- if (basename(script_dir) == "scripts" && basename(dirname(script_dir)) == "static") {
  dirname(dirname(script_dir))
} else {
  dirname(script_dir)
}

data_root <- Sys.getenv("BIOF3_DATA_DIR", file.path(path.expand("~"), "biof3-data"))
pbmc_dir  <- file.path(data_root, "pbmc3k")
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "modules", "module06")
)

dir.create(pbmc_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

pbmc_url <- paste0(
  "https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/",
  "pbmc3k_filtered_gene_bc_matrices.tar.gz"
)
pbmc_tar <- file.path(pbmc_dir, "pbmc3k_filtered_gene_bc_matrices.tar.gz")
matrix_dir <- file.path(pbmc_dir, "filtered_gene_bc_matrices", "hg19")

# ---- 依赖检查 ------------------------------------------------------------
required_cran <- c("Matrix", "Seurat", "ggplot2", "dplyr", "scales", "RColorBrewer")
required_bioc <- c("SingleCellExperiment", "slingshot")

for (pkg in required_cran) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org/")
  }
}
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org/")
}
for (pkg in required_bioc) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg, update = FALSE, ask = FALSE)
  }
}

suppressPackageStartupMessages({
  library(Matrix)
  library(Seurat)
  library(SingleCellExperiment)
  library(slingshot)
  library(ggplot2)
  library(dplyr)
  library(scales)
  library(RColorBrewer)
})

set.seed(42)

biof3_colors <- c(
  green = "#0f766e", mint  = "#43d1ae", blue  = "#2563eb",
  amber = "#f59e0b", red   = "#dc2626", slate = "#475569",
  violet = "#7c3aed", pink = "#ec4899"
)

theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(
      axis.text  = element_text(color = "#111827"),
      axis.title = element_text(color = "#111827", face = "bold"),
      plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
      plot.subtitle = element_text(color = "#526760", hjust = 0),
      legend.title = element_text(face = "bold"),
      legend.position = "right",
      strip.background = element_blank(),
      strip.text = element_text(color = "#12342f", face = "bold")
    )
}

# ---- 下载 PBMC 3k -------------------------------------------------------
if (!file.exists(pbmc_tar)) {
  message("下载 PBMC 3k...")
  download.file(pbmc_url, destfile = pbmc_tar, mode = "wb", quiet = FALSE)
}
if (!file.exists(file.path(matrix_dir, "matrix.mtx"))) {
  message("解压 PBMC 3k...")
  utils::untar(pbmc_tar, exdir = pbmc_dir)
}

# ---- 读入矩阵 ------------------------------------------------------------
message("读取 PBMC 3k...")
counts <- Seurat::Read10X(data.dir = matrix_dir)
pbmc <- Seurat::CreateSeuratObject(counts, project = "PBMC3K",
                                   min.cells = 3, min.features = 200)
pbmc[["percent.mt"]] <- Seurat::PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

message("保留 ", ncol(pbmc), " 个细胞 × ", nrow(pbmc), " 个基因")

# 标准 Seurat 流程
pbmc <- NormalizeData(pbmc, verbose = FALSE)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
pbmc <- ScaleData(pbmc, verbose = FALSE)
pbmc <- RunPCA(pbmc, npcs = 30, verbose = FALSE)
pbmc <- FindNeighbors(pbmc, dims = 1:15, verbose = FALSE)
pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE)
pbmc <- RunUMAP(pbmc, dims = 1:15, verbose = FALSE)

# 粗略注释：用少量经典 marker 把 cluster 映射成大类
marker_signature <- list(
  "B"          = c("MS4A1", "CD79A"),
  "NK"         = c("GNLY", "NKG7"),
  "Monocyte"   = c("CD14", "LYZ"),
  "T CD4"      = c("IL7R", "CCR7", "CD4"),
  "T CD8"      = c("CD8A", "CD8B"),
  "DC"         = c("FCER1A", "CST3"),
  "Platelet"   = c("PPBP")
)
for (ct in names(marker_signature)) {
  pbmc <- AddModuleScore(pbmc, features = list(marker_signature[[ct]]),
                          name = paste0("score_", ct), nbin = 12)
}
score_mat <- pbmc@meta.data[, paste0("score_", names(marker_signature), "1"), drop = FALSE]
colnames(score_mat) <- names(marker_signature)
pbmc$celltype <- colnames(score_mat)[apply(score_mat, 1, which.max)]

# ---- 转成 SCE + Slingshot -----------------------------------------------
message("运行 Slingshot...")
sce <- as.SingleCellExperiment(pbmc)
# 用 Monocyte 作为起点（免疫系统里最接近髓系起源的代理）
sce <- slingshot(sce, clusterLabels = "celltype",
                 reducedDim = "PCA", start.clus = "Monocyte")

pt_mat <- slingPseudotime(sce)
pbmc$pseudotime <- pt_mat[, 1]  # 取主分支作为单一拟时序值
curves <- slingCurves(sce)

# ---- 作图辅助 -----------------------------------------------------------
umap_df <- as.data.frame(Embeddings(pbmc, "umap"))
colnames(umap_df) <- c("UMAP_1", "UMAP_2")
umap_df$cluster  <- pbmc$seurat_clusters
umap_df$celltype <- pbmc$celltype
umap_df$pt       <- pbmc$pseudotime

pca_df <- as.data.frame(Embeddings(pbmc, "pca")[, 1:2])
colnames(pca_df) <- c("PC_1", "PC_2")
pca_df$celltype <- pbmc$celltype
pca_df$pt       <- pbmc$pseudotime

save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- 图 1：UMAP 聚类 ----------------------------------------------------
message("图 1：PBMC 3k UMAP 按细胞类型")
p1 <- ggplot(umap_df, aes(UMAP_1, UMAP_2, color = celltype)) +
  geom_point(size = 0.5, alpha = 0.85) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "PBMC 3k UMAP by celltype",
       subtitle = "Cell type assignment based on marker gene module scores",
       color = NULL) +
  theme_biof3() +
  theme(legend.position = "right")
save_plot(p1, "01-umap-celltype.png", width = 8.5, height = 5.5)

# ---- 图 2：PCA 空间上 Slingshot 主曲线 -----------------------------------
message("图 2：PCA 空间的 Slingshot 曲线")
# slingCurves 给的是 PCA 空间的平滑曲线坐标
curve_dfs <- lapply(seq_along(curves), function(i) {
  data.frame(PC_1 = curves[[i]]$s[, 1], PC_2 = curves[[i]]$s[, 2],
             lineage = paste0("Lineage ", i), ord = curves[[i]]$ord)
})
curve_df <- do.call(rbind, curve_dfs)

p2 <- ggplot(pca_df, aes(PC_1, PC_2)) +
  geom_point(aes(color = celltype), size = 0.5, alpha = 0.7) +
  geom_path(data = curve_df,
            aes(group = lineage), color = biof3_colors["slate"],
            linewidth = 1.1) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "Slingshot principal curve in PCA space",
       subtitle = "Curve fit in PCA (black line), then projected back onto UMAP",
       color = NULL) +
  theme_biof3()
save_plot(p2, "02-slingshot-curves-pca.png", width = 8.5, height = 5.5)

# ---- 图 3：UMAP 拟时序梯度 ---------------------------------------------
message("图 3：UMAP 拟时序梯度")
p3 <- ggplot(umap_df, aes(UMAP_1, UMAP_2, color = pt)) +
  geom_point(size = 0.5, alpha = 0.9) +
  scale_color_gradientn(
    colors  = rev(brewer.pal(11, "Spectral")[-6]),
    na.value = "grey80"
  ) +
  labs(title = "Slingshot pseudotime (Lineage 1) on UMAP",
       subtitle = "Darker color = later pseudotime; grey points are not on this lineage",
       color = "pseudotime") +
  theme_biof3()
save_plot(p3, "03-umap-pseudotime.png", width = 8.5, height = 5.5)

# ---- 图 4：拟时序直方图（按 celltype）-----------------------------------
message("图 4：不同细胞类型的拟时序分布")
p4 <- umap_df %>%
  filter(!is.na(pt)) %>%
  ggplot(aes(x = pt, fill = celltype)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  scale_fill_brewer(palette = "Set2") +
  facet_wrap(~ celltype, scales = "free_y", ncol = 3) +
  labs(title = "Cell type distribution along Lineage 1",
       subtitle = "A well-fit curve groups cells of the same type into a narrow pseudotime window",
       x = "pseudotime", y = "Cells",
       fill = NULL) +
  theme_biof3() +
  theme(legend.position = "none")
save_plot(p4, "04-pseudotime-by-celltype.png", width = 10, height = 5.5)

# ---- 图 5：选定 marker 沿拟时序变化 -------------------------------------
message("图 5：marker 基因沿拟时序")
trace_genes <- c("CD14", "LYZ", "CD3D", "IL7R", "GNLY", "MS4A1")
trace_genes <- trace_genes[trace_genes %in% rownames(pbmc)]

expr <- Seurat::FetchData(pbmc, vars = trace_genes)
expr$pt <- pbmc$pseudotime
expr$celltype <- pbmc$celltype
expr <- expr[!is.na(expr$pt), ]

expr_long <- tidyr::pivot_longer(
  expr, cols = all_of(trace_genes), names_to = "gene", values_to = "expr"
)
expr_long$gene <- factor(expr_long$gene, levels = trace_genes)

p5 <- ggplot(expr_long, aes(x = pt, y = expr, color = gene)) +
  geom_point(alpha = 0.12, size = 0.3) +
  geom_smooth(method = "loess", span = 0.6, se = FALSE, linewidth = 1) +
  facet_wrap(~ gene, scales = "free_y", ncol = 3) +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Marker gene expression along Slingshot Lineage 1",
       subtitle = "Points are per-cell expression; curves are LOESS smooths",
       x = "pseudotime", y = "log-normalized expression",
       color = NULL) +
  theme_biof3() +
  theme(legend.position = "none")
save_plot(p5, "05-gene-trends-pseudotime.png", width = 10, height = 6)

# ---- 图 6：拟时序 vs 某个标记的单图（代表性 gene trend）-----------------
message("图 6：单基因 CD14 拟时序变化")
expr_cd14 <- expr[, c("pt", "celltype", intersect("CD14", trace_genes))]
names(expr_cd14)[3] <- "expr"
p6 <- ggplot(expr_cd14, aes(pt, expr)) +
  geom_point(aes(color = celltype), alpha = 0.4, size = 0.6) +
  geom_smooth(method = "loess", span = 0.5, se = TRUE, color = biof3_colors["slate"]) +
  scale_color_brewer(palette = "Set2") +
  labs(title = "CD14 expression along pseudotime",
       subtitle = "Starting at monocytes: CD14 peaks early and decays along T/NK lineages",
       x = "pseudotime", y = "CD14 log-normalized expression",
       color = NULL) +
  theme_biof3()
save_plot(p6, "06-cd14-along-pseudotime.png", width = 9, height = 5.5)

# ---- 输出清单 -----------------------------------------------------------
message("=== 已生成 module06 轨迹图 ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("输出目录：", normalizePath(output_dir))
message("PBMC 3k 数据目录：", normalizePath(pbmc_dir))

message("完成。")
sessionInfo()
