#!/usr/bin/env Rscript
# =============================================================================
# BioF3 Genomics Module 03: Variant annotation visualization
# =============================================================================
#
# Uses VariantAnnotation built-in chr22 VCF to demonstrate:
#   1. Variant type distribution (SNV/indel)
#   2. Genomic region annotation (coding/intronic/intergenic)
#   3. Transition/transversion ratio
#   4. Allele frequency spectrum
#   5. Variant density across chr22
#   6. Coding consequence summary
#
# Data: VariantAnnotation::system.file("extdata","chr22.vcf.gz") — built-in
#
# Usage:
#   Rscript scripts/genomics/genome03_variant_anno_sci.R
# =============================================================================

options(stringsAsFactors = FALSE)
set.seed(42)

args <- commandArgs(trailingOnly = FALSE)
file_arg <- grep("^--file=", args, value = TRUE)
script_path <- if (length(file_arg) > 0) {
  normalizePath(sub("^--file=", "", file_arg[[1]]), mustWork = TRUE)
} else {
  normalizePath("scripts/genomics/genome03_variant_anno_sci.R", mustWork = FALSE)
}
script_dir <- dirname(script_path)
project_root <- dirname(dirname(script_dir))
output_dir <- Sys.getenv(
  "BIOF3_OUTPUT_DIR",
  file.path(project_root, "static", "img", "tutorial", "genomics", "genome03")
)
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

suppressPackageStartupMessages({
  library(VariantAnnotation)
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  library(GenomicRanges)
  library(ggplot2)
  library(dplyr)
  library(patchwork)
})

biof3_colors <- c(green="#0f766e", mint="#43d1ae", blue="#2563eb",
                  amber="#f59e0b", red="#dc2626", slate="#475569",
                  violet="#7c3aed", pink="#ec4899")
theme_biof3 <- function(base_size = 12) {
  theme_classic(base_size = base_size) +
    theme(axis.text = element_text(color = "#111827"),
          axis.title = element_text(color = "#111827", face = "bold"),
          plot.title = element_text(color = "#12342f", face = "bold", hjust = 0),
          plot.subtitle = element_text(color = "#526760", hjust = 0),
          legend.title = element_text(face = "bold"),
          strip.background = element_blank(),
          strip.text = element_text(color = "#12342f", face = "bold"))
}
save_plot <- function(p, name, width = 8, height = 5) {
  ggsave(file.path(output_dir, name), p, width = width, height = height, dpi = 300, bg = "white")
}

# ---- load VCF ------------------------------------------------------------
message("Loading chr22 VCF ...")
fl <- system.file("extdata", "chr22.vcf.gz", package = "VariantAnnotation")
vcf <- readVcf(fl, "hg19")
message("Variants: ", nrow(vcf), "  Samples: ", ncol(vcf))

# basic info
gr <- rowRanges(vcf)
ref_alleles <- as.character(ref(vcf))
alt_alleles <- sapply(alt(vcf), function(x) as.character(x)[1])

# classify SNV vs indel
is_snv <- nchar(ref_alleles) == 1 & nchar(alt_alleles) == 1
var_type <- ifelse(is_snv, "SNV", "Indel")

# ---- figure 1: variant type distribution ---------------------------------
message("Figure 1: variant type distribution")
type_df <- data.frame(type = var_type) |> count(type)
p1 <- ggplot(type_df, aes(x = type, y = n, fill = type)) +
  geom_col(width = 0.5) +
  geom_text(aes(label = n), vjust = -0.3, size = 5) +
  scale_fill_manual(values = c(SNV = biof3_colors[["blue"]], Indel = biof3_colors[["amber"]]), guide = "none") +
  labs(title = "Variant type distribution (chr22)",
       subtitle = paste0("Total: ", nrow(vcf), " variants"),
       x = NULL, y = "Count") +
  theme_biof3()
save_plot(p1, "01-variant-type.png", width = 7, height = 5)

# ---- figure 2: genomic region annotation ---------------------------------
message("Figure 2: genomic region annotation")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
loc <- locateVariants(vcf, txdb, AllVariants())
loc_tbl <- as.data.frame(table(loc$LOCATION)) |>
  rename(region = Var1, count = Freq) |>
  arrange(desc(count))

p2 <- ggplot(loc_tbl, aes(x = reorder(region, count), y = count, fill = count)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = count), hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = biof3_colors[["slate"]], high = biof3_colors[["green"]], guide = "none") +
  labs(title = "Variant distribution by genomic region",
       subtitle = "Most variants fall in introns and intergenic regions",
       x = NULL, y = "Number of variants") +
  theme_biof3()
save_plot(p2, "02-region-annotation.png", width = 9, height = 5.5)

# ---- figure 3: Ti/Tv ratio -----------------------------------------------
message("Figure 3: transition / transversion")
snv_ref <- ref_alleles[is_snv]
snv_alt <- alt_alleles[is_snv]
transitions <- (snv_ref == "A" & snv_alt == "G") | (snv_ref == "G" & snv_alt == "A") |
               (snv_ref == "C" & snv_alt == "T") | (snv_ref == "T" & snv_alt == "C")
ti_count <- sum(transitions)
tv_count <- sum(!transitions)
titv_ratio <- round(ti_count / tv_count, 2)

titv_df <- data.frame(
  type = c("Transition (Ti)", "Transversion (Tv)"),
  count = c(ti_count, tv_count)
)
p3 <- ggplot(titv_df, aes(x = type, y = count, fill = type)) +
  geom_col(width = 0.5) +
  geom_text(aes(label = count), vjust = -0.3, size = 5) +
  scale_fill_manual(values = c(biof3_colors[["green"]], biof3_colors[["violet"]]), guide = "none") +
  labs(title = paste0("Transition / Transversion (Ti/Tv = ", titv_ratio, ")"),
       subtitle = "Expected Ti/Tv ~ 2.0-2.1 for WGS; lower in coding regions",
       x = NULL, y = "Count") +
  theme_biof3()
save_plot(p3, "03-titv.png", width = 8, height = 5)

# ---- figure 4: allele frequency spectrum ---------------------------------
message("Figure 4: allele frequency spectrum")
# use genotype to compute minor allele frequency
gt <- geno(vcf)$GT
# count alt alleles per variant
alt_count <- apply(gt, 1, function(x) {
  sum(grepl("1", x), na.rm = TRUE)
})
total_alleles <- 2 * ncol(gt)
af <- alt_count / total_alleles

af_df <- data.frame(af = af[af > 0 & af < 1])
p4 <- ggplot(af_df, aes(x = af)) +
  geom_histogram(bins = 30, fill = biof3_colors[["blue"]], color = "white", linewidth = 0.2) +
  labs(title = "Site frequency spectrum (chr22)",
       subtitle = "Classic L-shaped distribution: most variants are rare",
       x = "Alternative allele frequency", y = "Number of variants") +
  theme_biof3()
save_plot(p4, "04-af-spectrum.png", width = 9, height = 5)

# ---- figure 5: variant density along chr22 -------------------------------
message("Figure 5: variant density along chr22")
positions <- start(gr)
dens_df <- data.frame(pos = positions / 1e6)
p5 <- ggplot(dens_df, aes(x = pos)) +
  geom_histogram(bins = 50, fill = biof3_colors[["green"]], color = "white", linewidth = 0.2) +
  labs(title = "Variant density along chr22",
       subtitle = "Peaks may correspond to gene-dense regions or segmental duplications",
       x = "Position (Mb)", y = "Number of variants") +
  theme_biof3()
save_plot(p5, "05-density-chr22.png", width = 10, height = 5)

# ---- figure 6: coding consequence types ----------------------------------
message("Figure 6: coding consequence types")
coding_loc <- locateVariants(vcf, txdb, CodingVariants())
if (length(coding_loc) > 0) {
  # predict coding consequences
  coding_vcf <- vcf[unique(queryHits(findOverlaps(rowRanges(vcf), coding_loc)))]
  pred <- tryCatch(
    predictCoding(coding_vcf, txdb, seqSource = Hsapiens),
    error = function(e) NULL
  )
  if (!is.null(pred) && length(pred) > 0) {
    cons_tbl <- as.data.frame(table(pred$CONSEQUENCE)) |>
      rename(consequence = Var1, count = Freq) |>
      arrange(desc(count))
    p6 <- ggplot(cons_tbl, aes(x = reorder(consequence, count), y = count, fill = count)) +
      geom_col(width = 0.6) +
      geom_text(aes(label = count), hjust = -0.1, size = 4) +
      coord_flip() +
      scale_fill_gradient(low = biof3_colors[["slate"]], high = biof3_colors[["red"]], guide = "none") +
      labs(title = "Coding variant consequences",
           subtitle = "Synonymous vs missense vs nonsense distribution",
           x = NULL, y = "Count") +
      theme_biof3()
    save_plot(p6, "06-coding-consequence.png", width = 9, height = 5)
  } else {
    message("  predictCoding requires BSgenome; skipping figure 6")
    # fallback: just show coding variant count
    p6 <- ggplot(data.frame(x = "Coding variants", y = length(coding_loc)),
                 aes(x = x, y = y)) +
      geom_col(fill = biof3_colors[["red"]], width = 0.4) +
      geom_text(aes(label = y), vjust = -0.3, size = 5) +
      labs(title = "Coding variants identified",
           subtitle = "predictCoding requires BSgenome.Hsapiens.UCSC.hg19 for consequence prediction",
           x = NULL, y = "Count") +
      theme_biof3()
    save_plot(p6, "06-coding-consequence.png", width = 7, height = 5)
  }
} else {
  message("  No coding variants found; creating placeholder figure 6")
  p6 <- ggplot(data.frame(x = "No coding variants", y = 0), aes(x, y)) +
    geom_col() + labs(title = "No coding variants in this subset") + theme_biof3()
  save_plot(p6, "06-coding-consequence.png", width = 7, height = 5)
}

message("=== genomics genome03 variant annotation figures ===")
print(list.files(output_dir, pattern = "\\.png$", full.names = FALSE))
message("Done.")
sessionInfo()
