netMHCpan 작업 #2 HLA-I 펩타이드 추출 #

#2025-07-23

#

#1 Patient id 추출

#data

data/
├── c315
│   └── allprot.fasta
└── c442
    └── allprot.fasta

#patients.bash

#!/bin/bash

# Extract the patient ID of every FASTA record and save the list to
# patient_id.txt (one ID per header line, in file order).
# The patient ID is the text between the leading ">" of a header line and
# its first "|" (or the end of the line when the header has no "|").

ALLPROT_PATH="data/c315/allprot.fasta"
OUT_FILE="data/patient_id.txt"

# The paths above are relative to the directory containing this script.
# Abort if the cd fails so that no file is truncated or written in an
# unrelated working directory.
cd "$(dirname "$0")" || exit 1

# Check the input before touching the output: otherwise a missing FASTA
# would silently leave an empty patient_id.txt behind.
if [[ ! -r "$ALLPROT_PATH" ]]; then
    echo "Error: cannot read $ALLPROT_PATH" >&2
    exit 1
fi

# Keep header lines only, take the first "|"-separated field and drop the
# leading ">". A single ">" redirect both resets and fills the output file.
grep '^>' "$ALLPROT_PATH" \
    | cut -d'|' -f1 \
    | sed 's/^>//' \
    > "$OUT_FILE"

#result

data/
├── c315
│   └── allprot.fasta
├── c442
│   └── allprot.fasta
└── patient_id.txt

#

#2 환자별 proteome.fasta와 HLA-I 펩타이드 생성

#data

data/
├── c315
│   └── allprot.fasta
├── c442
│   └── allprot.fasta
└── patient_id.txt

#epitope.bash

#!/bin/bash

# Usage:  epitope.bash <cluster>
# Input:  cluster name (c315, c442); reads data/<cluster>/allprot.fasta
# Output: for every FASTA record, data/<cluster>/<patient id>/proteome.fasta
#         and data/<cluster>/<patient id>/peptides_HLA-I.csv (8-14mers)
# Exit status: 0 when every record was processed, 1 otherwise.
#
# NOTE(review): records that share a patient ID overwrite each other's
# proteome.fasta (last one wins) -- this assumes one record per patient;
# confirm against the input data.

if [[ $# -lt 1 || -z "$1" ]]; then
    echo "Usage: $0 <cluster>" >&2
    exit 1
fi

CLUSTER=$1
ALLPROT_PATH="data/${CLUSTER}/allprot.fasta"
OUT_DIR="data/${CLUSTER}"

# All paths (including sc1.py) are relative to the script's directory;
# stop if we cannot get there instead of writing somewhere else.
cd "$(dirname "$0")" || exit 1

if [[ ! -r "$ALLPROT_PATH" ]]; then
    echo "Error: cannot read $ALLPROT_PATH" >&2
    exit 1
fi

# Create the output directory
mkdir -p "$OUT_DIR" || exit 1

STATUS=0      # becomes 1 as soon as any record fails
HEADER=""     # header line of the record currently being collected
SEQUENCE=""   # its sequence, with wrapped lines joined together

# Write the collected record (HEADER/SEQUENCE) to the patient's directory
# and generate its HLA-I peptides. Does nothing when no record is pending.
write_patient() {
    [[ -n "$HEADER" ]] || return 0

    # Patient ID: header text up to the first "|", with every ">" removed.
    local patient_id=${HEADER%%|*}
    patient_id=${patient_id//">"/}
    if [[ -z "$patient_id" ]]; then
        echo "Warning: skipping record without a patient ID: $HEADER" >&2
        STATUS=1
        return 0
    fi

    local patient_dir="$OUT_DIR/$patient_id"
    if ! mkdir -p "$patient_dir"; then
        STATUS=1
        return 0
    fi

    # proteome.fasta: the header followed by the sequence on a single line
    printf '%s\n%s\n' "$HEADER" "$SEQUENCE" > "$patient_dir/proteome.fasta"

    # HLA-I peptides (8-14mers). stdin is detached so the child process can
    # never consume lines of the FASTA file the enclosing loop is reading.
    if ! python3 sc1.py "$patient_dir/proteome.fasta" "8,9,10,11,12,13,14" \
        < /dev/null > "$patient_dir/peptides_HLA-I.csv"; then
        echo "Warning: peptide generation failed for $patient_id" >&2
        STATUS=1
    fi
}

# Read the FASTA file record by record. A record is a ">" header plus all
# following lines up to the next header, so wrapped sequences stay complete
# and a header with no sequence does not swallow the next header.
# IFS is left at its default on purpose: read then trims leading/trailing
# whitespace from every line. The "|| [[ -n ... ]]" part keeps a final line
# that lacks a trailing newline.
while read -r line || [[ -n "$line" ]]; do
    if [[ $line == ">"* ]]; then
        write_patient
        HEADER=$line
        SEQUENCE=""
    elif [[ -n "$HEADER" ]]; then
        SEQUENCE+=$line
    fi
done < "$ALLPROT_PATH"

# Flush the last record of the file
write_patient

exit "$STATUS"

#sc1.py

import sys

# Usage: python3 sc1.py <proteome.fasta> <comma-separated peptide lengths>
#
# Input:
# 1) Path to a FASTA file (e.g., $PATIENT_DIR/proteome.fasta)
# 2) Comma-separated list of peptide lengths (e.g., 8,9,10,11,12,13,14)
# Output (stdout):
# CSV table with the columns Peptide,Patient,Start,End -- one row per
# peptide. Start is 0-based and End is exclusive, i.e. the peptide equals
# sequence[Start:End].


def read_fasta_records(lines):
    """Yield (name, sequence) for every FASTA record in an iterable of lines.

    The name is the header text before the first "|" with the leading ">"
    removed. All sequence lines of a record are joined, so wrapped
    (multi-line) sequences form one sequence, and trailing "*" characters
    are dropped from the joined sequence. Blank lines are ignored.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    name = None
    chunks = []
    for raw in lines:
        if raw.startswith(">"):
            if name is not None:
                yield name, "".join(chunks).rstrip("*")
            # Strip first so a header without "|" does not keep its newline.
            name = raw.strip().split("|")[0].lstrip(">")
            chunks = []
            continue
        piece = raw.strip()
        if not piece:
            continue
        if name is None:
            raise ValueError("sequence data found before the first FASTA header")
        chunks.append(piece)
    if name is not None:
        yield name, "".join(chunks).rstrip("*")


def iter_peptide_rows(name, sequence, lengths):
    """Yield (peptide, name, start, end) for every window of each length.

    Lengths are processed in the given order and, within a length, windows
    go from left to right. A length longer than the sequence yields nothing.
    """
    for k in lengths:
        for start in range(len(sequence) - k + 1):
            yield sequence[start:start + k], name, start, start + k


def main(argv):
    """Command-line entry point; argv has the same layout as sys.argv."""
    if len(argv) < 3:
        sys.exit("Usage: {} <proteome.fasta> <lengths, e.g. 8,9,10>".format(argv[0]))

    # Path to the FASTA file
    proteome_path = argv[1]

    # Comma-separated list of peptide lengths
    peptide_lengths = list(map(int, argv[2].split(",")))
    if any(k <= 0 for k in peptide_lengths):
        sys.exit("Peptide lengths must be positive integers")

    # The with-block closes the file even if writing the output fails.
    with open(proteome_path) as proteome_file:
        print("Peptide,Patient,Start,End")
        for name, sequence in read_fasta_records(proteome_file):
            for row in iter_peptide_rows(name, sequence, peptide_lengths):
                print("{},{},{},{}".format(*row))


if __name__ == "__main__":
    main(sys.argv)

#

#result

data/
├── c315
│   ├── allprot.fasta
│   └── * (*: patient id)
│       ├── proteome.fasta
│       └── peptides_HLA-I.csv
├── c442
│   └── (c315와 동일한 구조로 생성됨)
└── patient_id.txt