05 主线分析：跑通最重要那一条

这是整个专栏最核心的一章。前面 4 篇做的所有准备工作，都是为了让这一章的分析能顺利跑通、结果能站得住。

我们的分析问题是：在 TCGA-LIHC 中构建一个免疫相关基因 prognostic signature，验证其预后预测能力，并关联肿瘤免疫浸润。

主线 6 步，按顺序走。

Step 1：差异分析筛选候选基因

library(DESeq2)
library(dplyr)

se <- readRDS("data/tcga_lihc_rnaseq.rds")
clin <- read.csv("data/tcga_lihc_clinical.csv")

# 只取 tumor（01）和 normal（11）
sample_type <- ifelse(substr(colnames(se), 14, 15) == "01", "Tumor", "Normal")
se$sample_type <- factor(sample_type, levels = c("Normal", "Tumor"))
se_tn <- se[, se$sample_type %in% c("Tumor", "Normal")]

dds <- DESeqDataSet(se_tn, design = ~ sample_type)
keep <- rowSums(counts(dds) >= 10) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)

res <- results(dds, contrast = c("sample_type", "Tumor", "Normal"))
res_sig <- as.data.frame(res) %>%
  filter(padj < 0.01, abs(log2FoldChange) > 1) %>%
  arrange(padj)

cat("显著 DE 基因:", nrow(res_sig), "\n")

取交集：DE 基因 ∩ 免疫基因

# ImmPort 免疫相关基因列表（约 1800 个）
immune_genes <- readLines("data/immune_gene_list.txt")

# 把 Ensembl ID 转 Symbol
library(org.Hs.eg.db)
res_sig$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res_sig),
                         keytype = "ENSEMBL", column = "SYMBOL")

# 交集
candidates <- res_sig %>% filter(symbol %in% immune_genes)
cat("免疫 DE 候选基因:", nrow(candidates), "\n")
# 通常 100-300 个

Step 2：LASSO Cox 筛选 signature

LASSO Cox 从几百个候选里筛出一组最精简的预后基因组合。

library(glmnet)
library(survival)

# 只用 tumor 样本
tumor_idx <- substr(colnames(se), 14, 15) == "01"
se_tumor <- se[, tumor_idx]

# 准备表达矩阵（VST，只取候选基因）
vsd <- vst(DESeqDataSet(se_tumor, design = ~ 1), blind = TRUE)
expr_mat <- t(assay(vsd)[rownames(candidates), ])

# 对齐临床
patient_ids <- substr(rownames(expr_mat), 1, 12)
clin_matched <- clin[match(patient_ids, clin$patient_id), ]
valid <- !is.na(clin_matched$OS.time) & clin_matched$OS.time > 0
expr_mat <- expr_mat[valid, ]
clin_matched <- clin_matched[valid, ]

# 7:3 split
set.seed(42)
train_idx <- sample(nrow(expr_mat), round(0.7 * nrow(expr_mat)))
x_train <- expr_mat[train_idx, ]
x_test  <- expr_mat[-train_idx, ]
y_train <- Surv(clin_matched$OS.time[train_idx], clin_matched$OS[train_idx])
y_test  <- Surv(clin_matched$OS.time[-train_idx], clin_matched$OS[-train_idx])

# LASSO Cox（10-fold CV 选 lambda）
cv_fit <- cv.glmnet(x_train, y_train, family = "cox",
                    alpha = 1, nfolds = 10)

# 用 lambda.min 提取非零系数基因
coef_min <- coef(cv_fit, s = "lambda.min")
sig_genes <- rownames(coef_min)[coef_min[, 1] != 0]
cat("Signature 基因数:", length(sig_genes), "\n")
# 通常 5-15 个

计算 Risk Score

# Risk Score = Σ(coefficient × expression)
coefs <- coef_min[sig_genes, 1]

risk_train <- as.numeric(x_train[, sig_genes] %*% coefs)
risk_test  <- as.numeric(x_test[, sig_genes] %*% coefs)

# 按 train 的中位数分高低风险
cutoff <- median(risk_train)
group_train <- ifelse(risk_train > cutoff, "High", "Low")
group_test  <- ifelse(risk_test > cutoff, "High", "Low")

Step 3：KM 生存验证

library(survminer)

# Train set KM
surv_train <- data.frame(time = clin_matched$OS.time[train_idx],
                         event = clin_matched$OS[train_idx],
                         group = group_train)
fit_train <- survfit(Surv(time, event) ~ group, data = surv_train)
ggsurvplot(fit_train, data = surv_train, pval = TRUE,
           title = "Training set", risk.table = TRUE)

# Test set KM
surv_test <- data.frame(time = clin_matched$OS.time[-train_idx],
                        event = clin_matched$OS[-train_idx],
                        group = group_test)
fit_test <- survfit(Surv(time, event) ~ group, data = surv_test)
ggsurvplot(fit_test, data = surv_test, pval = TRUE,
           title = "Test set", risk.table = TRUE)

判断标准：两个 KM 曲线都 p < 0.05，且高风险组 HR > 1.5。如果 test set 不显著，说明过拟合，需要回头调 lambda 或减少候选基因。

Step 4：ROC 曲线

library(timeROC)

# 1年、3年、5年 ROC
roc_test <- timeROC(
  T = surv_test$time,
  delta = surv_test$event,
  marker = risk_test,
  cause = 1,
  times = c(365, 1095, 1825),
  iid = TRUE
)

plot(roc_test, time = 365, title = FALSE)
legend("bottomright",
       paste0(c("1-year", "3-year", "5-year"), " AUC = ",
              round(roc_test$AUC, 3)))

AUC > 0.65 可接受，> 0.7 良好，> 0.75 优秀。

Step 5：CIBERSORT 免疫浸润关联

# CIBERSORT 需要 TPM 矩阵（行=基因 symbol，列=样本）
# 这里用 TIMER2.0 预计算结果或 immunedeconv 包
library(immunedeconv)

# 用 TIMER 方法估计 6 种免疫细胞
immune_fractions <- deconvolute(expr_tpm, method = "timer",
                                indications = rep("lihc", ncol(expr_tpm)))

# 把 risk score 和免疫细胞比例做相关
cor_results <- sapply(rownames(immune_fractions), function(cell) {
  cor.test(risk_score_all, as.numeric(immune_fractions[cell, ]),
           method = "spearman")
})

预期结果：高风险组的 CD8 T 细胞浸润低、Macrophage M2 高 — 这是"免疫冷肿瘤"的典型特征，和预后差一致。

Step 6：多变量 Cox 确认独立性

# Signature 是否独立于分期、年龄、性别
multi_cox <- coxph(
  Surv(OS.time, OS) ~ risk_group + stage + age + gender,
  data = clin_full
)
summary(multi_cox)
# risk_group 的 p < 0.05 说明 signature 独立于临床变量

主线跑完后你手上有什么

产物	用途
N 个基因的 signature + 系数	论文核心结果
Train/Test KM 曲线	Figure 2
1/3/5 年 ROC	Figure 3
免疫浸润相关图	Figure 4
多变量 Cox forest plot	Figure 5
DE 基因火山图	Figure 1