#!/usr/bin/env Rscript
# ============================================================================
# BioF3 Module 03: Cell Ranger Output QC with Real PBMC 3k Data
# ============================================================================

options(stringsAsFactors = FALSE)

args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/single-cell/sc03_qc_cluster_sci.R", mustWork = FALSE)
}

script_dir <- dirname(script_path)
project_root <- if (basename(script_dir) == "scripts" && basename(dirname(script_dir)) == "static") {
  dirname(dirname(script_dir))
} else {
  dirname(script_dir)
}

data_root <- Sys.getenv("BIOF3_DATA_DIR", file.path(path.expand("~"), "biof3-data"))
pbmc_dir <- file.path(data_root, "pbmc3k")
output_dir <- Sys.getenv("BIOF3_OUTPUT_DIR", file.path(project_root, "static", "img", "tutorial", "modules", "module03"))

dir.create(pbmc_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

required_packages <- c("Matrix", "ggplot2", "dplyr", "tidyr", "patchwork", "scales")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org/")
  }
}

library(Matrix)
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork)
library(scales)

theme_sci <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(
      axis.text = element_text(color = "black"),
      axis.title = element_text(face = "bold", color = "black"),
      plot.title = element_text(face = "bold", hjust = 0.5, color = "black"),
      plot.subtitle = element_text(color = "gray30", hjust = 0.5),
      legend.title = element_text(face = "bold"),
      plot.background = element_rect(fill = "white", color = NA),
      strip.background = element_blank(),
      strip.text = element_text(face = "bold", color = "black")
    )
}

pbmc_url <- paste0(
  "https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/",
  "pbmc3k_filtered_gene_bc_matrices.tar.gz"
)
pbmc_tar <- file.path(pbmc_dir, "pbmc3k_filtered_gene_bc_matrices.tar.gz")
matrix_dir <- file.path(pbmc_dir, "filtered_gene_bc_matrices", "hg19")

download_pbmc3k <- function() {
  if (!file.exists(pbmc_tar)) {
    message("Downloading PBMC 3k data from 10x Genomics...")
    download.file(pbmc_url, destfile = pbmc_tar, mode = "wb", quiet = FALSE)
  }
  if (!file.exists(file.path(matrix_dir, "matrix.mtx"))) {
    message("Extracting PBMC 3k archive...")
    utils::untar(pbmc_tar, exdir = pbmc_dir)
  }
}

read_pbmc3k <- function() {
  download_pbmc3k()
  counts <- Matrix::readMM(file.path(matrix_dir, "matrix.mtx"))
  genes <- read.delim(file.path(matrix_dir, "genes.tsv"), header = FALSE, sep = "\t")
  barcodes <- read.delim(file.path(matrix_dir, "barcodes.tsv"), header = FALSE, sep = "\t")
  rownames(counts) <- make.unique(genes[[2]])
  colnames(counts) <- barcodes[[1]]
  as(counts, "CsparseMatrix")
}

message("=== Loading real PBMC 3k matrix ===")
counts <- read_pbmc3k()

mt_genes <- grepl("^MT-", rownames(counts))
n_counts <- Matrix::colSums(counts)
n_features <- Matrix::colSums(counts > 0)
percent_mt <- Matrix::colSums(counts[mt_genes, , drop = FALSE]) / n_counts * 100

qc <- data.frame(
  cell = colnames(counts),
  nCount_RNA = as.numeric(n_counts),
  nFeature_RNA = as.numeric(n_features),
  percent_mt = as.numeric(percent_mt)
) %>%
  arrange(cell) %>%
  mutate(
    pass_qc = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent_mt < 5,
    barcode_group = ntile(row_number(), 4),
    barcode_group = factor(paste("Barcode subset", barcode_group), levels = paste("Barcode subset", 1:4))
  )

gene_detected <- Matrix::rowSums(counts > 0)
gene_summary <- data.frame(
  gene = rownames(counts),
  total_counts = as.numeric(Matrix::rowSums(counts)),
  detected_cells = as.numeric(gene_detected)
)

message("=== Generating real-data figures ===")

summary_metrics <- data.frame(
  metric = c("Filtered cells", "Genes in matrix", "Detected genes", "Median UMIs", "Median genes", "Median mito %"),
  value = c(
    ncol(counts),
    nrow(counts),
    sum(gene_detected > 0),
    median(qc$nCount_RNA),
    median(qc$nFeature_RNA),
    median(qc$percent_mt)
  )
) %>%
  mutate(metric = factor(metric, levels = rev(metric)))

p1 <- ggplot(summary_metrics, aes(x = value, y = metric, fill = metric)) +
  geom_col(width = 0.7, color = "black", linewidth = 0.25) +
  geom_text(aes(label = comma(round(value, 1))), hjust = -0.08, size = 3.5, fontface = "bold") +
  scale_x_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
  scale_fill_manual(values = rep(c("#00A087", "#3C5488", "#4DBBD5", "#E64B35", "#8491B4", "#F39B7F"), length.out = nrow(summary_metrics))) +
  labs(title = "PBMC 3k filtered matrix summary", x = "Value", y = NULL) +
  theme_sci() +
  theme(legend.position = "none")
ggsave(file.path(output_dir, "01-workflow-efficiency.png"), p1, width = 10, height = 6, dpi = 300, bg = "white")

fastq_structure <- data.frame(
  component = factor(c("Cell barcode", "UMI", "Poly-T", "cDNA"), levels = c("Cell barcode", "UMI", "Poly-T", "cDNA")),
  length = c(16, 10, 10, 90),
  read = c("Read 1", "Read 1", "Read 1", "Read 2")
)

p2 <- ggplot(fastq_structure, aes(x = component, y = length, fill = read)) +
  geom_col(width = 0.6, color = "black", linewidth = 0.25) +
  geom_text(aes(label = paste0(length, " bp")), vjust = -0.45, size = 4, fontface = "bold") +
  scale_fill_manual(values = c("Read 1" = "#E64B35", "Read 2" = "#4DBBD5")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
  labs(title = "10x 3' gene expression read structure", x = "Component", y = "Length (bp)", fill = "Read") +
  theme_sci()
ggsave(file.path(output_dir, "02-fastq-structure.png"), p2, width = 8, height = 6, dpi = 300, bg = "white")

pipeline_data <- data.frame(
  stage = factor(
    c("Reference genes", "Detected genes", "Filtered cells", "QC-retained cells"),
    levels = c("Reference genes", "Detected genes", "Filtered cells", "QC-retained cells")
  ),
  value = c(nrow(counts), sum(gene_detected > 0), ncol(counts), sum(qc$pass_qc))
)

p3 <- ggplot(pipeline_data, aes(x = stage, y = value, group = 1)) +
  geom_line(color = "#00A087", linewidth = 1.3) +
  geom_point(size = 4, color = "#00A087", fill = "white", shape = 21, stroke = 1.5) +
  geom_text(aes(label = comma(value)), vjust = -1.1, size = 3.5, fontface = "bold") +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0.04, 0.18))) +
  labs(title = "Cell Ranger matrix output checkpoints", x = "Checkpoint", y = "Count") +
  theme_sci() +
  theme(axis.text.x = element_text(angle = 25, hjust = 1))
ggsave(file.path(output_dir, "03-pipeline-stages.png"), p3, width = 9, height = 6, dpi = 300, bg = "white")

p4a <- ggplot(qc, aes(x = nFeature_RNA)) +
  geom_histogram(bins = 45, fill = "#3C5488", color = "white") +
  geom_vline(xintercept = c(200, 2500), linetype = "dashed", color = "red", linewidth = 0.8) +
  labs(x = "Detected genes", y = "Cells") +
  theme_sci()

p4b <- ggplot(qc, aes(x = nCount_RNA)) +
  geom_histogram(bins = 45, fill = "#00A087", color = "white") +
  geom_vline(xintercept = median(qc$nCount_RNA), linetype = "dashed", color = "blue", linewidth = 0.8) +
  scale_x_continuous(labels = comma) +
  labs(x = "UMI counts", y = "Cells") +
  theme_sci()

p4c <- ggplot(qc, aes(x = percent_mt)) +
  geom_histogram(bins = 45, fill = "#E64B35", color = "white") +
  geom_vline(xintercept = 5, linetype = "dashed", color = "red", linewidth = 0.8) +
  labs(x = "Mitochondrial %", y = "Cells") +
  theme_sci()

p4 <- (p4a | p4b | p4c) + plot_annotation(title = "PBMC 3k quality control metric distributions")
ggsave(file.path(output_dir, "04-qc-metrics.png"), p4, width = 14, height = 4.8, dpi = 300, bg = "white")

cell_steps <- unique(round(seq(50, ncol(counts), length.out = 20)))
cumulative <- data.frame(
  cells = cell_steps,
  genes_detected = vapply(cell_steps, function(n) sum(Matrix::rowSums(counts[, seq_len(n), drop = FALSE] > 0) > 0), numeric(1))
)

p5 <- ggplot(cumulative, aes(x = cells, y = genes_detected)) +
  geom_line(color = "#4DBBD5", linewidth = 1.3) +
  geom_point(size = 2.8, color = "#4DBBD5") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(title = "Cumulative genes detected from real PBMC 3k cells", x = "Cells included", y = "Genes detected") +
  theme_sci()
ggsave(file.path(output_dir, "05-saturation-curve.png"), p5, width = 9, height = 6, dpi = 300, bg = "white")

detection_summary <- data.frame(
  category = c("Detected in >= 10 cells", "Detected in 1-9 cells", "Not detected"),
  genes = c(sum(gene_detected >= 10), sum(gene_detected > 0 & gene_detected < 10), sum(gene_detected == 0))
)

p6 <- ggplot(detection_summary, aes(x = "", y = genes, fill = category)) +
  geom_col(width = 1, color = "white", linewidth = 1) +
  coord_polar(theta = "y") +
  geom_text(aes(label = comma(genes)), position = position_stack(vjust = 0.5), color = "white", fontface = "bold") +
  scale_fill_manual(values = c("#00A087", "#3C5488", "#E64B35")) +
  labs(title = "PBMC 3k gene detection summary", fill = "Category") +
  theme_void() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5), legend.title = element_text(face = "bold"))
ggsave(file.path(output_dir, "06-alignment-stats.png"), p6, width = 8, height = 6, dpi = 300, bg = "white")

p7 <- ggplot(cumulative, aes(x = cells, y = genes_detected)) +
  geom_line(color = "#F39B7F", linewidth = 1.3) +
  geom_point(size = 3.2, color = "#F39B7F", fill = "white", shape = 21, stroke = 1.4) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(title = "Genes detected vs number of real cells", x = "Number of cells", y = "Total genes detected") +
  theme_sci()
ggsave(file.path(output_dir, "07-cells-vs-genes.png"), p7, width = 9, height = 6, dpi = 300, bg = "white")

p8 <- ggplot(qc, aes(x = nCount_RNA)) +
  geom_histogram(aes(y = after_stat(density)), bins = 60, fill = "#8491B4", color = "white") +
  geom_density(color = "#E64B35", linewidth = 1.2) +
  geom_vline(xintercept = median(qc$nCount_RNA), linetype = "dashed", color = "blue", linewidth = 0.9) +
  annotate("text", x = median(qc$nCount_RNA) * 1.35, y = max(density(qc$nCount_RNA)$y) * 0.8, label = paste0("Median: ", comma(round(median(qc$nCount_RNA)))), color = "blue", fontface = "bold") +
  scale_x_continuous(labels = comma) +
  labs(title = "PBMC 3k UMI count distribution", x = "UMI counts per cell", y = "Density") +
  theme_sci()
ggsave(file.path(output_dir, "08-umi-distribution.png"), p8, width = 9, height = 6, dpi = 300, bg = "white")

group_summary <- qc %>%
  group_by(barcode_group) %>%
  summarise(
    Cells = n(),
    `Median genes` = median(nFeature_RNA),
    `Median UMIs` = median(nCount_RNA),
    .groups = "drop"
  ) %>%
  pivot_longer(-barcode_group, names_to = "metric", values_to = "value") %>%
  group_by(metric) %>%
  mutate(normalized = value / value[barcode_group == "Barcode subset 1"] * 100) %>%
  ungroup()

p9 <- ggplot(group_summary, aes(x = barcode_group, y = normalized, fill = metric)) +
  geom_col(position = "dodge", color = "black", linewidth = 0.25, width = 0.7) +
  geom_hline(yintercept = 100, linetype = "dashed", color = "gray40") +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  scale_fill_manual(values = c("#00A087", "#3C5488", "#F39B7F")) +
  labs(title = "PBMC 3k barcode subset QC comparison", x = "Real barcode subset", y = "Normalized value", fill = "Metric") +
  theme_sci() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1), legend.position = "top")
ggsave(file.path(output_dir, "09-sample-comparison.png"), p9, width = 10, height = 6, dpi = 300, bg = "white")

dashboard_metrics <- data.frame(
  metric = c("Cells", "QC-retained cells", "Median genes", "Median UMIs", "Detected genes"),
  value = c(ncol(counts), sum(qc$pass_qc), median(qc$nFeature_RNA), median(qc$nCount_RNA), sum(gene_detected > 0)),
  target = c(2500, 2000, 500, 1000, 12000)
) %>%
  mutate(status = value >= target)

p10a <- ggplot(dashboard_metrics, aes(x = metric, y = value, fill = status)) +
  geom_col(color = "black", linewidth = 0.25, width = 0.65) +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = c("FALSE" = "#E64B35", "TRUE" = "#00A087")) +
  labs(x = NULL, y = "Value", fill = "Pass target") +
  theme_sci() +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

p10b <- ggplot(qc, aes(x = nCount_RNA, y = nFeature_RNA, color = percent_mt)) +
  geom_point(size = 1.2, alpha = 0.65) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  scale_color_gradient(low = "#00A087", high = "#E64B35") +
  labs(x = "UMI counts", y = "Detected genes", color = "Mito %") +
  theme_sci()

p10 <- (p10a | p10b) + plot_annotation(title = "PBMC 3k quality control dashboard")
ggsave(file.path(output_dir, "10-quality-dashboard.png"), p10, width = 14, height = 6, dpi = 300, bg = "white")

message("=== Generated Module 03 real-data figures ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("Output directory: ", normalizePath(output_dir))
sessionInfo()
