#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Epigenomics Module 02: ChIPseeker peak annotation + comparison
# =============================================================================
#
# Uses ChIPseeker built-in sample peak files (AR ChIP-seq at 3 doses +
# CBX6/CBX7 ChIP-seq) to demonstrate:
#   1. Peak annotation to genomic features (promoter/intron/intergenic)
#   2. TSS distance distribution
#   3. Multi-sample peak overlap (Venn)
#   4. Functional enrichment of peak-associated genes
#   5. Peak coverage across chromosomes
#   6. Comparison of genomic feature distribution across samples
#
# Data source:
#   ChIPseeker::getSampleFiles() — built-in, no download needed
#
# Usage:
#   Rscript scripts/epigenomics/epi02_chipseeker_sci.R
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/epigenomics/epi02_chipseeker_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- dirname(dirname(script_dir))
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "epigenomics", "epi02")
)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

suppressPackageStartupMessages({
  library(ChIPseeker)
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(ggplot2)
  library(dplyr)
  library(patchwork)
  library(GenomicRanges)
})

biof3_colors <- c(green="#0f766e", mint="#43d1ae", blue="#2563eb",
                  amber="#f59e0b", red="#dc2626", slate="#475569",
                  violet="#7c3aed", pink="#ec4899")
theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(axis.text = element_text(color = "#111827"),
          axis.title = element_text(color = "#111827", face = "bold"),
          plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
          plot.subtitle = element_text(color = "#526760", hjust = 0),
          legend.title = element_text(face = "bold"),
          strip.background = element_blank(),
          strip.text = element_text(color = "#12342f", face = "bold"))
}
save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- load peaks ----------------------------------------------------------
message("Loading sample peak files ...")
files <- getSampleFiles()
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

# Use AR 0M, 1nM, 100nM for dose-response comparison
peak_list <- lapply(files[1:3], readPeakFile)
names(peak_list) <- c("AR_0M", "AR_1nM", "AR_100nM")
message("Peak counts: ", paste(names(peak_list), "=", sapply(peak_list, length), collapse = "; "))

# ---- annotate all peaks --------------------------------------------------
message("Annotating peaks ...")
anno_list <- lapply(peak_list, annotatePeak, TxDb = txdb, tssRegion = c(-3000, 3000),
                    verbose = FALSE)

# ---- figure 1: genomic feature distribution (pie) for AR_100nM -----------
message("Figure 1: genomic feature pie (AR_100nM)")
png(file.path(output_dir, "01-anno-pie.png"), width = 9, height = 6, units = "in", res = 300, bg = "white")
plotAnnoPie(anno_list[["AR_100nM"]])
title(main = "Peak annotation: AR ChIP-seq 100nM", font.main = 2, col.main = "#12342f")
dev.off()

# ---- figure 2: TSS distance distribution --------------------------------
message("Figure 2: TSS distance distribution")
p2 <- plotDistToTSS(anno_list, title = "Distance to TSS across AR doses") +
  theme_biof3() +
  labs(subtitle = "Peaks closer to TSS are more likely promoter-bound")
save_plot(p2, "02-tss-distance.png", width = 10, height = 5.5)

# ---- figure 3: comparison of genomic feature across samples --------------
message("Figure 3: feature comparison bar")
p3 <- plotAnnoBar(anno_list) +
  labs(title = "Genomic feature distribution across AR doses",
       subtitle = "Higher dose = more peaks in promoter regions (AR is a TF)") +
  theme_biof3()
save_plot(p3, "03-anno-bar.png", width = 10, height = 5)

# ---- figure 4: peak overlap Venn ----------------------------------------
message("Figure 4: peak overlap Venn")
# Use GenomicRanges to find overlaps
genes_list <- lapply(anno_list, function(a) {
  df <- as.data.frame(a)
  unique(df$geneId[!is.na(df$geneId)])
})

png(file.path(output_dir, "04-gene-venn.png"), width = 8, height = 6, units = "in", res = 300, bg = "white")
venn_data <- list(
  AR_0M = genes_list[["AR_0M"]],
  AR_1nM = genes_list[["AR_1nM"]],
  AR_100nM = genes_list[["AR_100nM"]]
)
# simple upset-style bar
all_genes <- unique(unlist(venn_data))
in_mat <- sapply(venn_data, function(x) all_genes %in% x)
pattern <- apply(in_mat, 1, function(x) paste(names(venn_data)[x], collapse = " + "))
counts <- sort(table(pattern), decreasing = TRUE)
par(mar = c(5, 12, 3, 2))
barplot(counts, horiz = TRUE, las = 1, col = biof3_colors["green"],
        main = "Peak-associated gene overlap across AR doses",
        xlab = "Number of genes", cex.names = 0.8)
dev.off()

# ---- figure 5: GO enrichment of AR_100nM peak genes ---------------------
message("Figure 5: GO enrichment of peak-associated genes")
genes_100nM <- genes_list[["AR_100nM"]]
ego <- enrichGO(gene = genes_100nM, OrgDb = org.Hs.eg.db, keyType = "ENTREZID",
                ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.05, readable = TRUE)
if (!is.null(ego) && nrow(ego@result) > 0) {
  ego <- tryCatch(simplify(ego, cutoff = 0.7), error = function(e) ego)
  p5 <- dotplot(ego, showCategory = 12, label_format = 50) +
    scale_color_gradient(low = biof3_colors["red"], high = biof3_colors["blue"]) +
    labs(title = "GO BP enrichment: AR 100nM peak-associated genes",
         subtitle = "Genes nearest to AR binding peaks at highest dose") +
    theme_biof3()
  save_plot(p5, "05-go-enrichment.png", width = 10, height = 7)
}

# ---- figure 6: coverage plot (peak count per chromosome) -----------------
message("Figure 6: peak coverage by chromosome")
chr_df <- lapply(names(peak_list), function(nm) {
  gr <- peak_list[[nm]]
  tbl <- table(as.character(seqnames(gr)))
  data.frame(chr = names(tbl), n = as.integer(tbl), sample = nm)
}) |> bind_rows() |>
  filter(grepl("^chr[0-9]+$|^chrX$|^chrY$", chr)) |>
  mutate(chr = factor(chr, levels = paste0("chr", c(1:22, "X", "Y"))))

p6 <- ggplot(chr_df, aes(x = chr, y = n, fill = sample)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.65) +
  scale_fill_manual(values = c(AR_0M = biof3_colors[["slate"]],
                               AR_1nM = biof3_colors[["amber"]],
                               AR_100nM = biof3_colors[["red"]])) +
  labs(title = "Peak count per chromosome across AR doses",
       subtitle = "Higher dose = more peaks genome-wide; distribution roughly proportional to chr size",
       x = NULL, y = "Number of peaks", fill = "Sample") +
  theme_biof3() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))
save_plot(p6, "06-chr-coverage.png", width = 12, height = 5.5)

message("=== epigenomics epi02 ChIPseeker figures ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("Done.")
sessionInfo()
