#!/usr/bin/env Rscript
# =============================================================================
# BioF3 one-shot data & package bootstrap
# =============================================================================
#
# Installs every R package referenced by the BioF3 companion scripts, then
# downloads the three public 10x datasets that cannot be shipped with a
# Bioconductor package into ~/biof3-data/ (or $BIOF3_DATA_DIR).
#
# Every step is idempotent:
#   - installed packages are skipped
#   - already-downloaded data are not re-fetched
#
# Usage:
#   Rscript scripts/biof3_prepare_data.R
#
# Optional env vars:
#   BIOF3_DATA_DIR=/path/to/data   default: ~/biof3-data
#
# =============================================================================

# Session-wide options for this script.
#   stringsAsFactors: keep character columns as character.
#   timeout: seconds allowed for download.file(); raised to at least 900 for
#            the larger datasets, but a longer timeout already configured by
#            the user (e.g. on a slow link) is never shortened.
options(
  stringsAsFactors = FALSE,
  timeout = max(900, getOption("timeout", default = 60))
)

# ---- logging helpers ----------------------------------------------------
# Print `text` framed above and below by a 70-character rule of "=",
# preceded by one blank line.
.print_banner <- function(text) {
  rule <- strrep("=", 70)
  cat("\n", rule, "\n", text, "\n", rule, "\n", sep = "")
}
# Shared emitter: write `prefix`, then `text`, then a newline, with no
# separators in between.
.log_line <- function(prefix, text) cat(prefix, text, "\n", sep = "")

# Progress line for a new step (preceded by a blank line).
.step <- function(text) .log_line("\n>>> ", text)
# Indented success line.
.ok   <- function(text) .log_line("    [ok] ", text)
# Indented warning line (plain console output, not an R warning condition).
.warn <- function(text) .log_line("    [!!] ", text)

# ---- R version check ----------------------------------------------------
.print_banner("BioF3 data & package bootstrap")
.step(paste0("R version: ", getRversion()))
# This is a hard gate (the script aborts), so the message says "required"
# rather than "recommended" and reports the version that was found.
if (getRversion() < "4.2") {
  stop(
    "R >= 4.2 is required (found ", as.character(getRversion()),
    "). Please upgrade.",
    call. = FALSE
  )
}

# ---- install helpers ----------------------------------------------------
# Install `pkg` unless its namespace can already be loaded.
#
# Args:
#   pkg:         package name (character scalar).
#   source:      where to install from: "cran", "bioc" or "github".
#   github_repo: "owner/repo" slug; required when source = "github".
#
# Returns (invisibly) TRUE when the package is loadable afterwards, FALSE
# otherwise. Installation errors are logged via .warn() and never propagate,
# so one failing package does not abort the whole bootstrap.
install_if_missing <- function(pkg, source = c("cran", "bioc", "github"),
                               github_repo = NULL) {
  source <- match.arg(source)
  if (requireNamespace(pkg, quietly = TRUE)) {
    .ok(paste0(pkg, " already installed"))
    # Invisible like the normal exit path, so a bare top-level call does not
    # auto-print "[1] TRUE" under Rscript.
    return(invisible(TRUE))
  }

  # A GitHub install without a repo slug can only fail inside devtools with an
  # obscure message; report the actual problem instead.
  if (source == "github" &&
      !(is.character(github_repo) && length(github_repo) == 1L &&
        !is.na(github_repo) && nzchar(github_repo))) {
    .warn(paste0(pkg, " skipped: source = \"github\" needs github_repo"))
    return(invisible(FALSE))
  }

  # Non-interactive sessions frequently have no CRAN mirror selected: the
  # "repos" option then holds the "@CRAN@" placeholder and install.packages()
  # fails. Substitute a concrete mirror for the duration of this call only;
  # mirrors the user configured explicitly are left untouched.
  cran_repos <- getOption("repos")
  if (length(cran_repos) == 0L) {
    cran_repos <- c(CRAN = "https://cloud.r-project.org")
  } else {
    cran_repos[cran_repos %in% "@CRAN@"] <- "https://cloud.r-project.org"
  }
  old_opts <- options(repos = cran_repos)
  on.exit(options(old_opts), add = TRUE)

  .step(paste0("installing ", pkg, " (", source, ")"))
  ok <- tryCatch({
    if (source == "cran") {
      install.packages(pkg)
    } else if (source == "bioc") {
      if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager")
      }
      BiocManager::install(pkg, update = FALSE, ask = FALSE)
    } else if (source == "github") {
      if (!requireNamespace("devtools", quietly = TRUE)) {
        install.packages("devtools")
      }
      devtools::install_github(github_repo, upgrade = "never")
    }
    # install.packages() signals failures as warnings, not errors, so the
    # only reliable success test is whether the namespace now loads.
    requireNamespace(pkg, quietly = TRUE)
  }, error = function(e) {
    .warn(paste0(pkg, " failed via primary source: ", conditionMessage(e)))
    FALSE
  })

  if (!isTRUE(ok) && source == "bioc") {
    # Bioconductor fallback with explicit repos (works when BiocManager
    # version validation keeps failing).
    # NOTE(review): the release is pinned to 3.22; confirm it matches the R
    # version in use, since Bioconductor releases are tied to R versions.
    .step(paste0("retrying ", pkg, " via explicit Bioconductor mirrors"))
    bioc_repos <- c(
      BioCsoft = "https://bioconductor.org/packages/3.22/bioc",
      BioCann  = "https://bioconductor.org/packages/3.22/data/annotation",
      BioCexp  = "https://bioconductor.org/packages/3.22/data/experiment",
      CRAN     = "https://mirrors.tuna.tsinghua.edu.cn/CRAN"
    )
    ok <- tryCatch({
      install.packages(pkg, repos = bioc_repos, dependencies = TRUE)
      requireNamespace(pkg, quietly = TRUE)
    }, error = function(e) {
      .warn(paste0(pkg, " failed on fallback: ", conditionMessage(e)))
      FALSE
    })
  }

  ok <- isTRUE(ok)
  if (ok) {
    .ok(paste0(pkg, " ok"))
  } else {
    .warn(paste0(pkg, " installation failed — please retry manually"))
  }
  invisible(ok)
}

# ---- CRAN packages ------------------------------------------------------
.print_banner("Installing CRAN packages")
# Packages are installed one at a time, in the order listed here.
cran_pkgs <- c(
  "Matrix",
  "dplyr", "tidyr", "readr", "stringr", "tibble",
  "ggplot2", "ggrepel", "patchwork", "pheatmap", "RColorBrewer", "viridis",
  "BiocManager", "devtools", "remotes",
  "Seurat", "SeuratObject",
  # CellChat runtime deps
  "sna", "ggnetwork", "collapse",
  # scRepertoire runtime dep
  "hash"
)
for (p in cran_pkgs) {
  install_if_missing(pkg = p, source = "cran")
}

# ---- Bioconductor packages ----------------------------------------------
.print_banner("Installing Bioconductor packages")
# Single-cell packages first, then bulk RNA-seq; each is installed in turn
# in the order listed.
bioc_pkgs <- c(
  # single-cell
  "SingleCellExperiment",
  "SummarizedExperiment",
  "GenomicRanges",
  "slingshot",
  "tradeSeq",
  "scRepertoire",
  "Signac",
  "EnsDb.Hsapiens.v86",
  # bulk RNA-seq
  "DESeq2",
  "edgeR",
  "limma",
  "sva",
  "clusterProfiler",
  "org.Hs.eg.db",
  "AnnotationDbi",
  "enrichplot",
  "DOSE",
  "fgsea",
  "airway",
  "fission",
  "bladderbatch",
  "pasilla",
  "apeglm",
  "RUVSeq"
)
for (p in bioc_pkgs) {
  install_if_missing(pkg = p, source = "bioc")
}

# ---- GitHub-only packages -----------------------------------------------
.print_banner("Installing GitHub-only packages")
# Package name -> GitHub "owner/repo" slug.
github_pkgs <- c(
  CellChat   = "jinworks/CellChat",
  SeuratData = "satijalab/seurat-data"
)
# Installed in a loop (like the CRAN and Bioconductor sections) rather than as
# bare top-level calls: a bare call whose result is visible gets auto-printed
# by Rscript, which put a stray "[1] TRUE" in the log for packages that were
# already installed.
for (p in names(github_pkgs)) {
  install_if_missing(p, source = "github", github_repo = github_pkgs[[p]])
}

# SeuratData companion data package: install the stxBrain dataset through
# SeuratData when that package is available. Any error is logged and the
# script carries on.
tryCatch({
  if (requireNamespace("SeuratData", quietly = TRUE)) {
    installed <- SeuratData::InstalledData()
    have_stx_brain <- "stxBrain" %in% installed$Dataset
    if (have_stx_brain) {
      .ok("stxBrain already installed")
    } else {
      .step("installing SeuratData::stxBrain (~136 MB)")
      SeuratData::InstallData("stxBrain")
      .ok("stxBrain installed")
    }
  }
}, error = function(e) {
  .warn(paste0("stxBrain install failed: ", conditionMessage(e)))
})

# ---- data directory layout ----------------------------------------------
.print_banner("Setting up data directory")
# Resolve the data root: $BIOF3_DATA_DIR when set to a non-empty value,
# otherwise ~/biof3-data. An exported-but-empty variable is treated as unset;
# Sys.getenv() would otherwise hand back "" and every path below would be
# built relative to an empty root.
data_root <- Sys.getenv("BIOF3_DATA_DIR", unset = "")
if (!nzchar(data_root)) {
  data_root <- file.path(path.expand("~"), "biof3-data")
}
if (!dir.exists(data_root)) {
  dir.create(data_root, recursive = TRUE, showWarnings = FALSE)
}
if (dir.exists(data_root)) {
  .ok(paste0("data root: ", normalizePath(data_root)))
} else {
  # Keep going (the summary will flag the datasets as MISSING) but say why.
  .warn(paste0("could not create data root: ", data_root))
}

# ---- dataset catalog ----------------------------------------------------
# Catalog of datasets to fetch. Each entry carries:
#   subdir    directory (under the data root) the dataset lives in
#   tarball   local file name for the downloaded archive, or NULL when the
#             dataset is a set of individual files
#   url       archive URL (tarball datasets only)
#   files     list of list(name, url) entries (single-file datasets only)
#   size_mb   approximate download size, used for log output
#   sentinel  path relative to `subdir` whose presence marks the dataset ready
datasets <- local({
  cdn <- "https://cf.10xgenomics.com/samples/"

  # Both ATAC files share one URL stem and differ only in the trailing name.
  atac_stem <- paste0(
    cdn, "cell-atac/2.1.0/",
    "10k_pbmc_ATACv2_nextgem_Chromium_Controller/",
    "10k_pbmc_ATACv2_nextgem_Chromium_Controller_"
  )
  atac_file <- function(name) {
    list(name = name, url = paste0(atac_stem, name))
  }

  list(
    pbmc3k = list(
      subdir = "pbmc3k",
      tarball = "pbmc3k.tar.gz",
      url = paste0(
        cdn, "cell/pbmc3k/",
        "pbmc3k_filtered_gene_bc_matrices.tar.gz"
      ),
      size_mb = 7,
      sentinel = "filtered_gene_bc_matrices/hg19/matrix.mtx"
    ),
    pbmc5k_cite = list(
      subdir = "pbmc5k-citeseq",
      tarball = "5k_pbmc_filtered.tar.gz",
      url = paste0(
        cdn, "cell-exp/3.1.0/",
        "5k_pbmc_protein_v3_nextgem/",
        "5k_pbmc_protein_v3_nextgem_filtered_feature_bc_matrix.tar.gz"
      ),
      size_mb = 37,
      sentinel = "filtered_feature_bc_matrix/matrix.mtx.gz"
    ),
    pbmc10k_atac = list(
      subdir = "pbmc10k-scatac",
      tarball = NULL,  # these are single files, not tarballs
      files = list(
        h5 = atac_file("filtered_peak_bc_matrix.h5"),
        meta = atac_file("singlecell.csv")
      ),
      size_mb = 200,
      sentinel = "filtered_peak_bc_matrix.h5"
    )
  )
})

# Download `url` to `dest` without ever leaving a partial file at `dest`.
# The transfer goes to "<dest>.part" and is renamed only after download.file()
# reports success, so an interrupted run cannot be mistaken for a finished
# one on the next invocation. Returns TRUE on success; logs and returns FALSE
# otherwise.
.download_atomic <- function(url, dest) {
  part <- paste0(dest, ".part")
  on.exit(unlink(part), add = TRUE)  # no-op once the rename has happened
  status <- tryCatch(
    download.file(url, destfile = part, mode = "wb"),
    error = function(e) {
      .warn(paste0("download failed: ", conditionMessage(e)))
      NA_integer_
    }
  )
  if (length(status) != 1L || is.na(status)) {
    return(FALSE)
  }
  if (status != 0L) {
    # Some download methods report failure through a non-zero status code
    # instead of an error.
    .warn(paste0("download failed: status ", status))
    return(FALSE)
  }
  file.rename(part, dest)
}

# Make one catalog entry available under the data root (global `data_root`).
#
# Args:
#   name: label used in log output.
#   info: catalog entry with `subdir`, `size_mb`, `sentinel` and either
#         `tarball` + `url` (archive) or `files` (list of list(name, url)).
#
# Returns TRUE when the dataset is ready afterwards, FALSE otherwise.
# Download and extraction problems are logged, never raised.
download_dataset <- function(name, info) {
  .step(paste0("preparing ", name, " (~", info$size_mb, " MB)"))
  target_dir <- file.path(data_root, info$subdir)
  dir.create(target_dir, recursive = TRUE, showWarnings = FALSE)

  # A dataset is ready when its sentinel exists and, for multi-file datasets,
  # every listed file exists too. Checking the sentinel alone would skip a
  # companion file whose download failed on an earlier run.
  required <- file.path(target_dir, info$sentinel)
  if (!is.null(info$files)) {
    file_names <- vapply(
      info$files, function(f) f$name, character(1), USE.NAMES = FALSE
    )
    required <- unique(c(required, file.path(target_dir, file_names)))
  }
  is_ready <- function() all(file.exists(required))

  if (is_ready()) {
    .ok(paste0(name, " already present"))
    return(TRUE)
  }

  if (!is.null(info$tarball)) {
    tar_path <- file.path(target_dir, info$tarball)
    if (!file.exists(tar_path)) {
      .step(paste0("downloading ", basename(info$url)))
      .download_atomic(info$url, tar_path)
    }
    if (file.exists(tar_path)) {
      .step("extracting tarball")
      tryCatch(
        utils::untar(tar_path, exdir = target_dir),
        error = function(e) {
          .warn(paste0("extraction failed: ", conditionMessage(e)))
        }
      )
      if (!is_ready()) {
        # An archive that does not yield the sentinel is presumed damaged
        # (for instance truncated by an earlier interrupted run). Remove it so
        # the next run downloads a fresh copy instead of re-extracting it.
        unlink(tar_path)
      }
    }
  } else if (!is.null(info$files)) {
    for (f in info$files) {
      file_path <- file.path(target_dir, f$name)
      if (!file.exists(file_path)) {
        .step(paste0("downloading ", f$name))
        .download_atomic(f$url, file_path)
      }
    }
  }

  if (is_ready()) {
    .ok(paste0(name, " ready"))
    TRUE
  } else {
    .warn(paste0(name, " did not complete (check network)"))
    FALSE
  }
}

.print_banner("Downloading 10x Genomics public datasets")
# Walk the catalog in declaration order, pairing each entry with its name.
# Return values are discarded; readiness is re-checked in the summary.
invisible(Map(download_dataset, names(datasets), datasets))

# ---- summary ------------------------------------------------------------
.print_banner("Summary")
cat("Installed packages (examples):\n")
# A representative subset of packages, not the full install list.
samples <- c("Seurat", "DESeq2", "edgeR", "clusterProfiler",
             "Signac", "scRepertoire", "airway", "fission")
for (p in samples) {
  flag <- if (requireNamespace(p, quietly = TRUE)) "ok" else "MISSING"
  cat(sprintf("  %-20s [%s]\n", p, flag))
}

cat("\nData directory:", data_root, "\n")
for (n in names(datasets)) {
  info <- datasets[[n]]
  dataset_dir <- file.path(data_root, info$subdir)
  # Report "ok" only when the sentinel and, for multi-file datasets, every
  # listed file are present; the sentinel alone would hide a missing
  # companion file such as singlecell.csv.
  required <- file.path(dataset_dir, info$sentinel)
  if (!is.null(info$files)) {
    file_names <- vapply(
      info$files, function(f) f$name, character(1), USE.NAMES = FALSE
    )
    required <- c(required, file.path(dataset_dir, file_names))
  }
  flag <- if (all(file.exists(required))) "ok" else "MISSING"
  cat(sprintf("  %-20s [%s]\n", info$subdir, flag))
}

cat("\nDone. Any items flagged MISSING can be retried by rerunning this script.\n")
