EBV RNA-seq 전처리

EBV RNA-seq 전처리 #

#2024-12-31


0 #

분석 목적

  • 제공받은 fastq를 human genome에 매핑해서 전처리, 분석 후 DE 결과 보냄
  • DE 분석시에 EBV 유전자도 포함해달라는 요청

해야하는것

  • fastq를 EBV genome에 매핑해서 전처리, EBV count 생성
  • human count에 EBV count를 붙이기
  • 통합 count로 DE 분석 재수행

#

1. Alignment #

Load package, Set Path

library(edgeR)
library(Rsubread)
library(org.Hs.eg.db)

setwd("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
getwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'

Build Index

# build index
ref <- "/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.fa"
output_basename <- "NC_007605.1_idx"
buildindex(basename = output_basename, reference = ref)

Feature Count

# feature.count
targets <- read.delim("targets.txt", header=TRUE)
output.files <- c(targets$FileName)

fc <- featureCounts(output.files, annot.inbuilt=FALSE, annot.ext = "/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.gtf", isGTFAnnotationFile = TRUE, 
                    GTF.featureType = "exon", GTF.attrType = "transcript_id", GTF.attrType.extra = NULL, isPairedEnd=TRUE, countReadPairs=TRUE)# nthreads=30)

Save

colnames(fc$counts) <- c('33-1','33-2','33-3','150-1','150-2','150-3','con-1','con-2','con-3')
head(fc$counts)

group <- factor(targets$Status)
y <- DGEList(fc$counts, group=group)
fc$sort.counts <- fc$counts[order(rownames(fc$counts)), ]

sort.counts.df <- as.data.frame(fc$sort.counts)
sort.counts.df$trans_id <- rownames(fc$sort.counts)
sort.counts.df <- sort.counts.df[, c("trans_id", "33-1", "33-2", "33-3", "150-1", "150-2", "150-3", "con-1", "con-2", "con-3")]

write.table(sort.counts.df, file = "count.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

#

2. Annotation #

Load Package

import os
import pandas as pd

Set Path

os.chdir("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
os.getcwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'

Load GTF file

  • annotation 파일이 없어서 gtf reference 파일을 사용해서 직접 만들어줫다.
gtf_df = pd.read_csv("/data3/PUBLIC_DATA/ref_genomes/Human_gammaherpesvirus_4_EBV/NC_007605.1.gtf", sep='\t', header=None, comment="#", names=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"])
gtf_df

image

Make annotation file

split_attributes = gtf_df['attribute'].str.split(';')

attribute_1 = []
attribute_2 = []
attribute_3 = []

for attributes in split_attributes:
    attr_1 = ''
    attr_2 = ''
    attr_3 = ''

    for attribute in attributes:
        attribute = attribute.strip()
        if 'transcript_id' in attribute:
            attr_1 = attribute
        elif 'gene_id' in attribute:
            attr_2 = attribute
        elif 'gene_name' in attribute:
            attr_3 = attribute

    attribute_1.append(attr_1)
    attribute_2.append(attr_2)
    attribute_3.append(attr_3)

gtf_df['attribute.1'] = attribute_1
gtf_df['attribute.2'] = attribute_2
gtf_df['attribute.3'] = attribute_3

annot_df = gtf_df.iloc[:, -4:].copy()
annot_df['transcript_id'] = annot_df['attribute.1'].str.split('"').str[1]
annot_df['gene_id'] = annot_df['attribute.2'].str.split('"').str[1]
annot_df['gene_name'] = annot_df['attribute.3'].str.split('"').str[1]

annot_df

image

annot_table = annot_df.iloc[:, -3:].copy()
annot_table_dedup = annot_table[~annot_table['transcript_id'].duplicated(keep='first')]
annot = annot_table_dedup.iloc[:, -3:].copy()
annot = annot.sort_values(by='transcript_id')
annot.head(50)

image

Save

#annot.to_csv("annot.tsv",sep='\t',index=False)
  • 나중에 또 써야대니깐 저장.

Load Count

count = pd.read_csv("count.tsv",sep='\t',encoding='cp949')
count

image

  • 위에서 Rsubread로 만든 EBV count 가져옴

Annotation

merged = count.merge(annot, left_on='trans_id', right_on='transcript_id', how='left')
res_df = merged.drop(columns=['trans_id'])
res_df = res_df[['transcript_id', 'gene_id', 'gene_name', '33-1', '33-2', '33-3', '150-1', '150-2', '150-3', 'con-1', 'con-2', 'con-3']]
res_df

image

res_df['gene_name'].value_counts()

image

  • 중복인 transcript가 엄청많음
res_df['sum'] = res_df.iloc[:, 3:12].sum(axis=1)
res_df = res_df.sort_values(by='sum', ascending=False)
res_df = res_df.drop_duplicates(subset='gene_name', keep='first')

res_df = res_df.iloc[:, :-1]
res_df.reset_index(drop=True, inplace=True)
res_df = res_df.dropna(subset=['gene_name'])
print(res_df.shape)
res_df

image

  • (63, 12)짜리에서 deduplication 결과 (15, 12)가 됏다.
res_df.to_csv("count.annot.tsv", sep='\t',index=False)

Merge

count_hs = pd.read_csv("/data/home/ysh980101/2311/RNA-seq/count.csv", encoding='cp949')
count_hs.columns = ["transcript_id", "gene_id", "gene_name", "33-1", "33-2", "33-3", "150-1", "150-2", "150-3", "con-1", "con-2", "con-3"]

combined_df = pd.concat([res_df, count_hs], axis=0, ignore_index=True)
combined_df
combined_drop_df = combined_df[~(combined_df.iloc[:, 4:13] == 0).all(axis=1)] #remove 0 in all samples 
combined_drop_df

image

image

combined_drop_df.to_csv("count.annot.combined.drop.tsv",sep='\t',index=False)

#

3. DEG Analysis #

Load Packages

suppressMessages({
    library("DESeq2")
    library(pheatmap)
    library(withr)
    library(RColorBrewer)
    library(gplots)
    library(ggplot2)
    library(dplyr)
    })

Set Path

setwd("/data/home/ysh980101/2311/RNA-seq_ebv/Rsubread")
getwd()
'/data1/home/ysh980101/2311/RNA-seq_ebv/Rsubread'

Run DEG

S1 <- '33'
S2 <- '150'

countdata <- read.csv("count.annot.combined.drop.tsv", header=TRUE, sep='\t')
colnames(countdata) <- c('transcript_id','gene_id','gene_name','33-1','33-2','33-3','150-1','150-2','150-3','con-1','con-2','con-3')
countdata <- countdata[, c('transcript_id','gene_id','gene_name','150-1','150-2','150-3','33-1','33-2','33-3','con-1','con-2','con-3')]
countdata <- countdata[, paste(c('gene_name',paste0(S2,"-1"), paste0(S2,"-2"), paste0(S2,"-3"),paste0(S1,"-1"), paste0(S1,"-2"), paste0(S1,"-3")), sep="")]
dim(countdata)
16409 * 7
sample.names <- paste(c(paste0(S2,"-1"), paste0(S2,"-2"), paste0(S2,"-3"),paste0(S1,"-1"), paste0(S1,"-2"), paste0(S1,"-3")), sep="")
conditions <- factor(c(S2,S2,S2,S1,S1,S1))
metadata <- data.frame(Sample = sample.names, group = conditions)

N <- dim(countdata)[[2]]
cData = countdata[,2:N]
GeneID = countdata[,1]
rownames(cData) = GeneID

dds <- DESeqDataSetFromMatrix(countData = cData, colData = metadata, design = ~group)
dds$group <- relevel(dds$group, ref = S1)
dds <- DESeq(dds)  
vsd <- vst(dds, blind = FALSE)
rld <- rlogTransformation(dds, blind = FALSE)

res <- results(dds, contrast = c("group", S2, S1))
res_tbl <- as.data.frame(res)
res_tbl$GeneID <- rownames(res_tbl)
res_tbl <- res_tbl[order(res_tbl$padj), ]

NM_no_NA <- na.omit(res)
res_cut <- NM_no_NA[NM_no_NA$padj < 0.05, ]

lfc <- 1.5
res_cut_2 <- res_cut[abs(res_cut$log2FoldChange) > lfc, ]
res_cut_2_pos <- res_cut_2[res_cut_2$log2FoldChange > 0, ]
res_cut_2_neg <- res_cut_2[res_cut_2$log2FoldChange < 0, ]
# Volcano Plot
par(mfrow = c(1, 1))
with(NM_no_NA, plot(log2FoldChange, -log10(pvalue), pch = 20, col = 'gray',
                    main = paste0('Volcano plot_between\n', S2, ' vs ', S1),
                    xlim = c(-3, 3)))
abline(h = -log10(0.05), v = c(-lfc, lfc), col = "gray", lty = 2)
with(res_cut_2_neg, points(log2FoldChange, -log10(pvalue), pch = 20, col = 'blue'))
with(res_cut_2_pos, points(log2FoldChange, -log10(pvalue), pch = 20, col = 'gold'))
legend("topright", legend = c(paste0("FC ≥ ", lfc, " & p < 0.05"),
                              paste0("FC ≤ -", lfc, " & p < 0.05")),
       col = c("gold", "blue"), pch = 20)

# Bar Plot
lengths <- c(nrow(res_cut_2_neg), nrow(res_cut_2_pos))
bp <- barplot(lengths, space = 0, col = c("blue", "gold"),
              horiz = TRUE, xlab = "Count of genes", ylab = paste0(S2, "/", S1),
              xlim = c(0, sum(lengths)), width = 0.02, las = 1)
title(paste0("UP, DOWN regulated count (|FC| ≥ ", lfc, ")"))
legend("topright", legend = c("UP", "DOWN"), fill = c("gold", "blue"))
text(lengths, bp, labels = lengths, pos = 4, cex = 0.9)

# Save
cutoff <- 0.05
sig_res <- dplyr::filter(res_tbl, padj < cutoff) %>%
  dplyr::arrange(padj) %>%
  dplyr::select(GeneID, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj)

write.csv(sig_res, file = paste0('../DESeq2/res.', S2, 'vs', S1, '.csv'), row.names = FALSE)

image

image

Heatmap 시각화

# Heatmap
sig_idx <- res$padj < cutoff & !is.na(res$padj)
sig_dat <- assay(rld)[sig_idx, ]

annC <- data.frame(condition = conditions)
rownames(annC) <- colnames(sig_dat)
ann_colors <- list(condition = setNames(c("#F7819F", "#58D3F7"), c(S2, S1)))

pheatmap(sig_dat,
         scale = "row",
         fontsize_row = 9,
         annotation_col = annC,
         color = rev(brewer.pal(6, "RdYlGn")),
         annotation_colors = ann_colors,
         show_rownames = FALSE,
         show_colnames = TRUE)

image