Rsubread, edgeR: RNA-seq 전처리

Rsubread, edgeR: RNA-seq 전처리 #

#2025-04-21


가장 오류 적게나는 조합!

1. Align RNA-seq #

Load Packages

library(Rsubread)
library(org.Mm.eg.db)
library(gridExtra)
library(reshape2)

Set Path

indir = "/data/home/ysh980101/2504/mirna/data"
outdir = "/data/home/ysh980101/2504/mirna/data"
refpath = "/data/home/ysh980101/2406/data-gne/mm39.fa"

setwd(indir)
getwd()
'/data/home/ysh980101/2504/mirna/data'

Build Index

buildindex(basename = "mm39", reference = refpath)

Read Alignment

files <- list.files(pattern="\\.fastq\\.gz$", full.names=TRUE)
bams <- sub("\\.fastq\\.gz$", ".bam", files)
samples <- gsub("^\\.\\/|\\.fastq\\.gz$", "", files)
targets <- read.delim("target.txt", header=TRUE)

align(index="mm39", readfile1=files, input_format="gzFASTQ", output_file=bams, nthreads=50)

Quantification

fc = featureCounts(bams, isGTFAnnotationFile=TRUE, GTF.featureType="exon", GTF.attrType="gene_id", isPairedEnd=FALSE, annot.ext="mm39.knownGene.gtf", useMetaFeatures=FALSE, allowMultiOverlap=TRUE, nthreads=50)

Save Countdata

colnames(fc$counts) <- samples
y <- DGEList(fc$counts, group=group)
write.csv(as.data.frame(y$counts), file = paste0(outdir,"/count.csv", row.names = TRUE))

#

2. Gene ID Annotation #

Load Packages

import pandas as pd
import numpy as np
import os

Set Path

indir = "/data/home/ysh980101/2504/mirna/data"
outdir = "/data/home/ysh980101/2504/mirna/data"
annotpath = "/data/home/ysh980101/2406/data-gne/MRK_ENSEMBL.rpt"

os.chdir(indir)
os.getcwd()
'/data1/home/ysh980101/2504/mirna/result'

Load Annotation

annotation = pd.read_csv(annotpath, sep="\t", names=[str(i) for i in range(13)])
annotation = annotation.dropna(subset=['6'])
annotation = annotation[annotation['8'] == 'protein coding gene']

Load Count & Gene ID Mapping

count_mm39 = pd.read_csv("count.csv")
count_mm39.rename(columns={count_mm39.columns[0]: 'ens_id'}, inplace=True)
count_mm39['ens_id'] = count_mm39['ens_id'].str.split('.').str[0]

for index, row in annotation.iterrows():
    ens_ids = row['6'].split()
    gene_id = row['1']
    count_mm39.loc[count_mm39['ens_id'].isin(ens_ids), 'gene_id'] = gene_id

Transcript Filtering

count_mm39['sum'] = count_mm39.iloc[:, 2:].sum(axis=1)
count_mm39 = count_mm39.sort_values(by=['gene_id', 'sum'], ascending=[True, False])
count_mm39 = count_mm39.drop_duplicates(subset=['gene_id'], keep='first')
count_mm39 = count_mm39.dropna(subset=['gene_id'])

count_mm39 = count_mm39.drop(columns=['sum', 'ens_id'])
gene_id_column = count_mm39['gene_id']
count_mm39.drop(columns=['gene_id'], inplace=True)
count_mm39.insert(0, 'gene_id', gene_id_column)

Save

count_mm39.rename(columns={'gene_id': 'GeneID'}, inplace=True)

def rename_columns(col):
    parts = col.split('_')
    if len(parts) >= 3:
        new_col = parts[0] + parts[2] + '_' + parts[1]
    else:
        new_col = col
    return new_col

count_mm39.columns = [rename_columns(col) for col in count_mm39.columns]
count_mm39.to_csv(f"{outdir}/count_processed.csv", index=False)

#

3. DEG Analysis #

Library & Set Path

library(edgeR)

indir = "/data/home/ysh980101/2504/mirna/data"
outdir = "/data/home/ysh980101/2504/mirna/result"

setwd(indir)
getwd()
'/data1/home/ysh980101/2504/mirna/data'

Set variables & Load Data

tissue <- "G"
S1 <- "WT" 
S2 <- "GneKI"

counts <- read.csv("count_processed.csv")
meta <- read.csv(paste0("mouse_meta_",tissue,".csv"))
meta <- meta[meta$Group %in% c(S1, S2), ]

counts <- counts[, c("GeneID", unique(meta$SampleID))]

Create DGElist & Normalization

Group <- factor(meta$Group)
Group <- relevel(Group, ref=S1)

y <- DGEList(counts=counts[,2:ncol(counts)], group=Group, genes = counts[,1])
y <- calcNormFactors(y)

Run DEG

design <- model.matrix(~Group)
y <- estimateDisp(y, design)
y <- estimateGLMRobustDisp(y,design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit)

plotMD(lrt)
abline(h=c(-1,1), col="blue")

image

Save

result_table <- topTags(lrt, n = nrow(lrt$table))
sorted_result_table <- result_table[order(result_table$table$FDR), ]
filtered_result_table <- sorted_result_table[sorted_result_table$table$FDR < 0.05, ]

write.csv(sorted_result_table, file = paste0(outdir,"/de-",tissue,"_",S1,"-",S2,".csv"))

#