EBV RNA-seq 전처리 #
#2024-12-31
0 #
분석 목적
- 제공받은 fastq를 human genome에 매핑해서 전처리, 분석 후 DE 결과 보냄
- DE 분석시에 EBV 유전자도 포함해달라는 요청
해야하는것
- fastq를 EBV genome에 매핑해서 전처리, EBV count 생성
- human count에 EBV count를 붙이기
- 통합 count로 DE 분석 재수행
#
1. Alignment #
Load package, Set Path
library(edgeR)
library(Rsubread)
library(org.Hs.eg.db)
setwd("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
getwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'
Build Index
# build index
ref <- "/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.fa"
output_basename <- "NC_007605.1_idx"
buildindex(basename = output_basename, reference = ref)
Feature Count
# feature.count
targets <- read.delim("targets.txt", header=TRUE)
output.files <- c(targets$FileName)
fc <- featureCounts(output.files, annot.inbuilt=FALSE, annot.ext = "/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.gtf", isGTFAnnotationFile = TRUE,
GTF.featureType = "exon", GTF.attrType = "transcript_id", GTF.attrType.extra = NULL, isPairedEnd=TRUE, countReadPairs=TRUE)# nthreads=30)
Save
colnames(fc$counts) <- c('33-1','33-2','33-3','150-1','150-2','150-3','con-1','con-2','con-3')
head(fc$counts)
group <- factor(targets$Status)
y <- DGEList(fc$counts, group=group)
fc$sort.counts <- fc$counts[order(rownames(fc$counts)), ]
sort.counts.df <- as.data.frame(fc$sort.counts)
sort.counts.df$trans_id <- rownames(fc$sort.counts)
sort.counts.df <- sort.counts.df[, c("trans_id", "33-1", "33-2", "33-3", "150-1", "150-2", "150-3", "con-1", "con-2", "con-3")]
write.table(sort.counts.df, file = "count.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
#
2. Annotation #
Load Package
import os
import pandas as pd
Set Path
os.chdir("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
os.getcwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'
Load GTF file
- annotation 파일이 없어서 gtf reference 파일을 사용해서 직접 만들어줫다.
gtf_df = pd.read_csv("/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.gtf", sep='\t', header=None, comment="#", names=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"])
gtf_df
Make annotation file
split_attributes = gtf_df['attribute'].str.split(';')
attribute_1 = []
attribute_2 = []
attribute_3 = []
for attributes in split_attributes:
attr_1 = ''
attr_2 = ''
attr_3 = ''
for attribute in attributes:
attribute = attribute.strip()
if 'transcript_id' in attribute:
attr_1 = attribute
elif 'gene_id' in attribute:
attr_2 = attribute
elif 'gene_name' in attribute:
attr_3 = attribute
attribute_1.append(attr_1)
attribute_2.append(attr_2)
attribute_3.append(attr_3)
gtf_df['attribute.1'] = attribute_1
gtf_df['attribute.2'] = attribute_2
gtf_df['attribute.3'] = attribute_3
annot_df = gtf_df.iloc[:, -4:].copy()
annot_df['transcript_id'] = annot_df['attribute.1'].str.split('"').str[1]
annot_df['gene_id'] = annot_df['attribute.2'].str.split('"').str[1]
annot_df['gene_name'] = annot_df['attribute.3'].str.split('"').str[1]
annot_df
annot_table = annot_df.iloc[:, -3:].copy()
annot_table_dedup = annot_table[~annot_table['transcript_id'].duplicated(keep='first')]
annot = annot_table_dedup.iloc[:, -3:].copy()
annot = annot.sort_values(by='transcript_id')
annot.head(50)
Save
#annot.to_csv("annot.tsv",sep='\t',index=False)
- 나중에 또 써야대니깐 저장.
Load Count
count = pd.read_csv("count.tsv",sep='\t',encoding='cp949')
count
- 위에서 Rsubread로 만든 EBV count 가져옴
Annotation
merged = count.merge(annot, left_on='trans_id', right_on='transcript_id', how='left')
res_df = merged.drop(columns=['trans_id'])
res_df = res_df[['transcript_id', 'gene_id', 'gene_name', '33-1', '33-2', '33-3', '150-1', '150-2', '150-3', 'con-1', 'con-2', 'con-3']]
res_df
res_df['gene_name'].value_counts()
- 중복인 transcript가 엄청많음
res_df['sum'] = res_df.iloc[:, 3:12].sum(axis=1)
res_df = res_df.sort_values(by='sum', ascending=False)
res_df = res_df.drop_duplicates(subset='gene_name', keep='first')
res_df = res_df.iloc[:, :-1]
res_df.reset_index(drop=True, inplace=True)
res_df = res_df.dropna(subset=['gene_name'])
print(res_df.shape)
res_df
- (63, 12)짜리에서 deduplication 결과 (15, 12)가 됏다.
res_df.to_csv("count.annot.tsv", sep='\t',index=False)
Merge
count_hs = pd.read_csv("/data/home/ysh980101/2311/RNA-seq/count.csv", encoding='cp949')
count_hs.columns = ["transcript_id", "gene_id", "gene_name", "33-1", "33-2", "33-3", "150-1", "150-2", "150-3", "con-1", "con-2", "con-3"]
combined_df = pd.concat([res_df, count_hs], axis=0, ignore_index=True)
combined_df
combined_drop_df = combined_df[~(combined_df.iloc[:, 4:13] == 0).all(axis=1)] #remove 0 in all samples
combined_drop_df
combined_drop_df.to_csv("count.annot.combined.drop.tsv",sep='\t',index=False)
#
3. DEG Analysis #
Load Packages
suppressMessages({
library("DESeq2")
library(pheatmap)
library(withr)
library(RColorBrewer)
library(gplots)
library(ggplot2)
library(dplyr)
})
Set Path
setwd("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
getwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'
Run DEG
S1 <- '33'
S2 <- '150'
countdata <- read.csv("count.annot.combined.drop.tsv", header=TRUE, sep='\t')
colnames(countdata) <- c('transcript_id','gene_id','gene_name','33-1','33-2','33-3','150-1','150-2','150-3','con-1','con-2','con-3')
countdata <- countdata[, c('transcript_id','gene_id','gene_name','150-1','150-2','150-3','33-1','33-2','33-3','con-1','con-2','con-3')]
countdata <- countdata[, paste(c('gene_name',paste0(S2,"-1"), paste0(S2,"-2"), paste0(S2,"-3"),paste0(S1,"-1"), paste0(S1,"-2"), paste0(S1,"-3")), sep="")]
dim(countdata)
16409 * 7
sample.names <- paste(c(paste0(S2,"-1"), paste0(S2,"-2"), paste0(S2,"-3"),paste0(S1,"-1"), paste0(S1,"-2"), paste0(S1,"-3")), sep="")
conditions <- factor(c(S2,S2,S2,S1,S1,S1))
metadata <- data.frame(Sample = sample.names, group = conditions)
N <- dim(countdata)[[2]]
cData = countdata[,2:N]
GeneID = countdata[,1]
rownames(cData) = GeneID
dds <- DESeqDataSetFromMatrix(countData = cData, colData = metadata, design = ~group)
dds$group <- relevel(dds$group, ref = S1)
dds <- DESeq(dds)
vsd <- vst(dds, blind = FALSE)
rld <- rlogTransformation(dds, blind = FALSE)
res <- results(dds, contrast = c("group", S2, S1))
res_tbl <- as.data.frame(res)
res_tbl$GeneID <- rownames(res_tbl)
res_tbl <- res_tbl[order(res_tbl$padj), ]
NM_no_NA <- na.omit(res)
res_cut <- NM_no_NA[NM_no_NA$padj < 0.05, ]
lfc <- 1.5
res_cut_2 <- res_cut[abs(res_cut$log2FoldChange) > lfc, ]
res_cut_2_pos <- res_cut_2[res_cut_2$log2FoldChange > 0, ]
res_cut_2_neg <- res_cut_2[res_cut_2$log2FoldChange < 0, ]
# Volcano Plot
par(mfrow = c(1, 1))
with(NM_no_NA, plot(log2FoldChange, -log10(pvalue), pch = 20, col = 'gray',
main = paste0('Volcano plot_between\n', S2, ' vs ', S1),
xlim = c(-3, 3)))
abline(h = -log10(0.05), v = c(-lfc, lfc), col = "gray", lty = 2)
with(res_cut_2_neg, points(log2FoldChange, -log10(pvalue), pch = 20, col = 'blue'))
with(res_cut_2_pos, points(log2FoldChange, -log10(pvalue), pch = 20, col = 'gold'))
legend("topright", legend = c(paste0("FC ≥ ", lfc, " & p < 0.05"),
paste0("FC ≤ -", lfc, " & p < 0.05")),
col = c("gold", "blue"), pch = 20)
# Bar Plot
lengths <- c(nrow(res_cut_2_neg), nrow(res_cut_2_pos))
bp <- barplot(lengths, space = 0, col = c("blue", "gold"),
horiz = TRUE, xlab = "Count of genes", ylab = paste0(S2, "/", S1),
xlim = c(0, sum(lengths)), width = 0.02, las = 1)
title(paste0("UP, DOWN regulated count (|FC| ≥ ", lfc, ")"))
legend("topright", legend = c("UP", "DOWN"), fill = c("gold", "blue"))
text(lengths, bp, labels = lengths, pos = 4, cex = 0.9)
# Save
cutoff <- 0.05
sig_res <- dplyr::filter(res_tbl, padj < cutoff) %>%
dplyr::arrange(padj) %>%
dplyr::select(GeneID, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj)
write.csv(sig_res, file = paste0('../DESeq2/res.', S2, 'vs', S1, '.csv'), row.names = FALSE)
Heatmap 시각화
# Heatmap
sig_idx <- res$padj < cutoff & !is.na(res$padj)
sig_dat <- assay(rld)[sig_idx, ]
annC <- data.frame(condition = conditions)
rownames(annC) <- colnames(sig_dat)
ann_colors <- list(condition = setNames(c("#F7819F", "#58D3F7"), c(S2, S1)))
pheatmap(sig_dat,
scale = "row",
fontsize_row = 9,
annotation_col = annC,
color = rev(brewer.pal(6, "RdYlGn")),
annotation_colors = ann_colors,
show_rownames = FALSE,
show_colnames = TRUE)