#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Epigenomics Module 04: Peak visualization and advanced comparison
# =============================================================================
#
# Combines ChIPseeker and DiffBind outputs to produce publication-ready
# epigenomics figures:
#   1. Peak width distribution across samples
#   2. Genomic feature comparison (stacked bar, all 5 samples)
#   3. TSS heatmap-style profile (covplot)
#   4. Upset plot of peak overlaps
#   5. DB sites annotated to genomic features
#   6. Fold-change vs distance-to-TSS scatter for DB sites
#
# Usage:
#   Rscript scripts/epigenomics/epi04_visualization_sci.R
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/epigenomics/epi04_visualization_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- dirname(dirname(script_dir))
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "epigenomics", "epi04")
)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

suppressPackageStartupMessages({
  library(ChIPseeker)
  library(DiffBind)
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  library(org.Hs.eg.db)
  library(ggplot2)
  library(dplyr)
  library(patchwork)
  library(GenomicRanges)
})

biof3_colors <- c(green="#0f766e", mint="#43d1ae", blue="#2563eb",
                  amber="#f59e0b", red="#dc2626", slate="#475569",
                  violet="#7c3aed", pink="#ec4899")
theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(axis.text = element_text(color = "#111827"),
          axis.title = element_text(color = "#111827", face = "bold"),
          plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
          plot.subtitle = element_text(color = "#526760", hjust = 0),
          legend.title = element_text(face = "bold"),
          strip.background = element_blank(),
          strip.text = element_text(color = "#12342f", face = "bold"))
}
save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- load ChIPseeker peaks -----------------------------------------------
message("Loading ChIPseeker sample peaks ...")
files <- getSampleFiles()
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

peak_list <- lapply(files, readPeakFile)
names(peak_list) <- names(files)
message("Samples: ", paste(names(peak_list), "=", sapply(peak_list, length), collapse = "; "))

# ---- figure 1: peak width distribution -----------------------------------
message("Figure 1: peak width distribution")
width_df <- lapply(names(peak_list), function(nm) {
  data.frame(sample = nm, width = width(peak_list[[nm]]))
}) |> bind_rows()

p1 <- ggplot(width_df, aes(x = width, fill = sample)) +
  geom_histogram(bins = 60, alpha = 0.7, position = "identity", color = NA) +
  facet_wrap(~ sample, ncol = 1, scales = "free_y") +
  scale_fill_manual(values = unname(biof3_colors)[1:5], guide = "none") +
  scale_x_continuous(limits = c(0, 3000)) +
  labs(title = "Peak width distribution",
       subtitle = "Narrow peaks (TF binding) typically 100-500 bp; broader = histone marks",
       x = "Peak width (bp)", y = "Count") +
  theme_biof3()
save_plot(p1, "01-peak-width.png", width = 9, height = 10)

# ---- figure 2: genomic feature comparison (all 5 samples) ----------------
message("Figure 2: genomic feature comparison (all samples)")
anno_list <- lapply(peak_list, annotatePeak, TxDb = txdb, tssRegion = c(-3000, 3000),
                    verbose = FALSE)
p2 <- plotAnnoBar(anno_list) +
  labs(title = "Genomic feature distribution across all samples",
       subtitle = "AR samples (dose-response) vs CBX6/CBX7 (different TFs)") +
  theme_biof3()
save_plot(p2, "02-feature-all.png", width = 11, height = 5.5)

# ---- figure 3: TSS coverage profile (covplot) ----------------------------
message("Figure 3: TSS coverage profile")
p3 <- covplot(peak_list[["ARmo_100nM"]]) +
  labs(title = "Peak coverage across genome (AR 100nM)",
       subtitle = "Each vertical line = one peak; density shows clustering") +
  theme_biof3()
save_plot(p3, "03-covplot.png", width = 12, height = 5)

# ---- figure 4: peak overlap upset-style ----------------------------------
message("Figure 4: peak overlap across samples")
# find overlaps between AR doses using GenomicRanges
gr1 <- peak_list[["ARmo_0M"]]
gr2 <- peak_list[["ARmo_1nM"]]
gr3 <- peak_list[["ARmo_100nM"]]

# for each peak in each sample, check if it overlaps with peaks in other samples
in1 <- seq_along(gr1)
in2 <- seq_along(gr2)
in3 <- seq_along(gr3)

# merge all peaks into a consensus set
all_peaks <- reduce(c(gr1, gr2, gr3))
# count which samples overlap each consensus peak
ov1 <- overlapsAny(all_peaks, gr1)
ov2 <- overlapsAny(all_peaks, gr2)
ov3 <- overlapsAny(all_peaks, gr3)

pattern <- paste0(
  ifelse(ov1, "AR_0M", ""),
  ifelse(ov1 & (ov2 | ov3), " + ", ""),
  ifelse(ov2, "AR_1nM", ""),
  ifelse(ov2 & ov3, " + ", ""),
  ifelse(ov3, "AR_100nM", "")
)
# clean up patterns
pattern <- gsub("^ \\+ | \\+ $", "", pattern)
pattern <- gsub(" \\+  \\+ ", " + ", pattern)
counts <- sort(table(pattern), decreasing = TRUE)
vc_df <- data.frame(pattern = names(counts), Counts = as.integer(counts))
vc_df <- vc_df[vc_df$pattern != "", ]

p4 <- ggplot(vc_df, aes(x = reorder(pattern, Counts), y = Counts, fill = Counts)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = Counts), hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = biof3_colors[["slate"]], high = biof3_colors[["red"]], guide = "none") +
  labs(title = "Peak overlap across AR doses (consensus regions)",
       subtitle = "Shared peaks = consistent binding; dose-specific = dose-dependent",
       x = NULL, y = "Number of consensus regions") +
  theme_biof3()
save_plot(p4, "04-peak-overlap.png", width = 10, height = 5)

# ---- DiffBind for figures 5-6 -------------------------------------------
message("Loading DiffBind tamoxifen for DB site annotation ...")
data(tamoxifen_counts)
tamoxifen <- dba.contrast(tamoxifen, categories = DBA_CONDITION)
tamoxifen <- dba.analyze(tamoxifen, method = DBA_DESEQ2)
db_report <- dba.report(tamoxifen)
message("DB sites: ", length(db_report))

# ---- figure 5: annotate DB sites to genomic features ---------------------
message("Figure 5: DB sites genomic annotation")
db_anno <- annotatePeak(db_report, TxDb = txdb, tssRegion = c(-3000, 3000), verbose = FALSE)
png(file.path(output_dir, "05-db-anno-pie.png"), width = 9, height = 6, units = "in", res = 300, bg = "white")
plotAnnoPie(db_anno)
title(main = "Differential binding sites: genomic annotation", font.main = 2, col.main = "#12342f")
dev.off()

# ---- figure 6: fold-change vs distance to TSS ----------------------------
message("Figure 6: FC vs distance to TSS")
db_df <- as.data.frame(db_anno)
if ("Fold" %in% colnames(db_df) && "distanceToTSS" %in% colnames(db_df)) {
  p6 <- ggplot(db_df, aes(x = distanceToTSS / 1000, y = Fold,
                          color = ifelse(Fold > 0, "Gained", "Lost"))) +
    geom_point(size = 1.5, alpha = 0.7) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
    scale_color_manual(values = c(Gained = biof3_colors[["red"]], Lost = biof3_colors[["blue"]])) +
    labs(title = "Differential binding: fold change vs distance to TSS",
         subtitle = "Promoter-proximal DB sites (near 0) vs distal enhancer-like sites",
         x = "Distance to nearest TSS (kb)", y = "log2 fold change",
         color = "Direction") +
    theme_biof3()
  save_plot(p6, "06-fc-vs-tss.png", width = 10, height = 6)
}

message("=== epigenomics epi04 visualization figures ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("Done.")
sessionInfo()
