#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Genomics Module 04: maftools tumor mutation analysis (WES)
# =============================================================================
#
# Uses maftools built-in TCGA LAML (acute myeloid leukemia) MAF data
# (193 samples) to demonstrate:
#   1. MAF summary plot (variant classification, type, SNV class)
#   2. Oncoplot (top mutated genes x samples)
#   3. Lollipop plot for a specific gene (DNMT3A)
#   4. Somatic interactions (co-occurrence / mutual exclusivity)
#   5. Tumor mutation burden (TMB) distribution
#   6. Rainfall plot (inter-mutation distance for one sample)
#
# Data: maftools built-in tcga_laml.maf.gz — no download needed
#
# Usage:
#   Rscript scripts/genomics/genome04_maftools_sci.R
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/genomics/genome04_maftools_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- dirname(dirname(script_dir))
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "genomics", "genome04")
)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

suppressPackageStartupMessages({
  library(maftools)
})

# ---- load data -----------------------------------------------------------
message("Loading TCGA LAML MAF ...")
laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
laml.clin <- system.file("extdata", "tcga_laml_annot.tsv", package = "maftools")
laml <- read.maf(maf = laml.maf, clinicalData = laml.clin, verbose = FALSE)
message("Samples: ", nrow(getSampleSummary(laml)))
message("Genes mutated: ", nrow(getGeneSummary(laml)))

# ---- figure 1: MAF summary plot -----------------------------------------
message("Figure 1: MAF summary")
png(file.path(output_dir, "01-maf-summary.png"),
    width = 12, height = 8, units = "in", res = 300, bg = "white")
plotmafSummary(laml, rmOutlier = TRUE, addStat = "median", dashboard = TRUE,
              titvRaw = FALSE)
dev.off()

# ---- figure 2: oncoplot (top 10 genes) ----------------------------------
message("Figure 2: oncoplot")
png(file.path(output_dir, "02-oncoplot.png"),
    width = 12, height = 7, units = "in", res = 300, bg = "white")
oncoplot(laml, top = 10, fontSize = 0.7)
dev.off()

# ---- figure 3: lollipop plot (DNMT3A) -----------------------------------
message("Figure 3: lollipop plot (DNMT3A)")
png(file.path(output_dir, "03-lollipop-dnmt3a.png"),
    width = 11, height = 5, units = "in", res = 300, bg = "white")
lollipopPlot(laml, gene = "DNMT3A", AACol = "Protein_Change",
             showMutationRate = TRUE, labelPos = "all")
dev.off()

# ---- figure 4: somatic interactions -------------------------------------
message("Figure 4: somatic interactions")
png(file.path(output_dir, "04-interactions.png"),
    width = 9, height = 8, units = "in", res = 300, bg = "white")
somaticInteractions(laml, top = 15, pvalue = c(0.05, 0.1))
dev.off()

# ---- figure 5: TMB distribution -----------------------------------------
message("Figure 5: TMB distribution")
# calculate variants per sample
sample_summary <- getSampleSummary(laml)
tmb_df <- data.frame(
  sample = sample_summary$Tumor_Sample_Barcode,
  total_variants = sample_summary$total
)

library(ggplot2)
biof3_colors <- c(green="#0f766e", blue="#2563eb", red="#dc2626", slate="#475569")
theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(axis.text = element_text(color = "#111827"),
          axis.title = element_text(color = "#111827", face = "bold"),
          plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
          plot.subtitle = element_text(color = "#526760", hjust = 0),
          strip.background = element_blank(),
          strip.text = element_text(color = "#12342f", face = "bold"))
}

p5 <- ggplot(tmb_df, aes(x = total_variants)) +
  geom_histogram(bins = 30, fill = biof3_colors["blue"], color = "white", linewidth = 0.2) +
  geom_vline(xintercept = median(tmb_df$total_variants), linetype = "dashed",
             color = biof3_colors["red"], linewidth = 0.8) +
  labs(title = "Tumor mutation burden distribution (TCGA LAML)",
       subtitle = paste0("Median: ", median(tmb_df$total_variants),
                         " variants/sample; dashed line = median"),
       x = "Total somatic variants per sample", y = "Number of samples") +
  theme_biof3()
ggsave(file.path(output_dir, "05-tmb-distribution.png"), p5,
       width = 9, height = 5, dpi = 300, bg = "white")

# ---- figure 6: Ti/Tv plot ------------------------------------------------
message("Figure 6: Ti/Tv")
png(file.path(output_dir, "06-titv.png"),
    width = 10, height = 6, units = "in", res = 300, bg = "white")
laml.titv <- titv(laml, plot = FALSE, useSyn = TRUE)
plotTiTv(laml.titv)
dev.off()

message("=== genomics genome04 maftools figures ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("Done.")
sessionInfo()
