MAFFT #2 MAFFT 실행 #
#2025-07-28
1. Objective #
Influenza의 Reference sequence는 길이가 fix되어있지만,
- 각 sequence는 삽입/탈락 mutation이 일어남에 따라 모두 길이가 같지 않다.
- 이 길이를 맞춰주는 padding을 하기 위해 MAFFT를 이용해 정렬(Multiple Sequence Alignment)한다.
#
2. MAFFT 실행 bash script #
#data
/Influenza
└── Preprocessed/
├── HA/
│ ├── A-H1N1.fasta
│ ├── A-H3N2.fasta
│ ├── ...
│ └── B.fasta
└── ...
└── (HA와 동일 구조)
└── MAFFT/
└── (empty)
#!/bin/bash
# Align every influenza gene/subtype FASTA against its reference with MAFFT.
#
# For each gene in gene_list and each subtype in file_list, the sequences in
#   <base_dir>/Preprocessed/<gene>/<subtype>.fasta
# are added (--add) to the reference in
#   <base_dir>/Reference/Preprocessed/<gene>/<subtype>.fasta
# using --keeplength, and the alignment is written to
#   <base_dir>/MAFFT/<gene>/<subtype>.fasta
#
# Optional environment overrides (the defaults reproduce the original values):
#   INFLUENZA_DIR  - base data directory
#   MAFFT_THREADS  - value passed to `mafft --thread` (default: 50)
#
# Exit status: exits 1 if any gene/subtype pair was skipped or failed.

base_dir="${INFLUENZA_DIR:-/data3/projects/2020_MUTCLUST/Data/Projects/GISAID_revision/Influenza}"
threads="${MAFFT_THREADS:-50}"

# List of genes and files to process
gene_list=('HA' 'NA' 'MP' 'PA' 'NS' 'PB1' 'PB2' 'NP')
file_list=('A-H1N1' 'A-H3N2' 'A-H5N1' 'B')

# gene/subtype pairs that could not be aligned; reported after the loops.
failed=()

# Loop through each gene and file
for gene in "${gene_list[@]}"; do
  echo "Processing gene: $gene"
  for file in "${file_list[@]}"; do
    echo "Processing file: $file"

    # Define directories and file paths
    input_fasta="${base_dir}/Preprocessed/${gene}/${file}.fasta"
    reference_fasta="${base_dir}/Reference/Preprocessed/${gene}/${file}.fasta"
    output_dir="${base_dir}/MAFFT/${gene}"
    output_fasta="${output_dir}/${file}.fasta"
    # MAFFT writes here first so a failed run never truncates or replaces
    # an existing result at $output_fasta.
    partial_fasta="${output_fasta}.partial"

    # Skip pairs whose inputs are missing or empty instead of letting MAFFT
    # fail on them.
    missing=0
    for required in "$input_fasta" "$reference_fasta"; do
      if [[ ! -s "$required" ]]; then
        echo "Missing or empty input for ${gene}/${file}: $required" >&2
        missing=1
      fi
    done
    if (( missing )); then
      failed+=("${gene}/${file}")
      continue
    fi

    # Create output directory if it doesn't exist
    if ! mkdir -p "$output_dir"; then
      echo "Cannot create output directory: $output_dir" >&2
      failed+=("${gene}/${file}")
      continue
    fi

    # Perform multiple sequence alignment with MAFFT; the output is only
    # moved into place when MAFFT itself reported success.
    echo "Running MAFFT for ${gene}/${file}..."
    if mafft --thread "$threads" --anysymbol --add "$input_fasta" --keeplength "$reference_fasta" > "$partial_fasta" \
      && mv -f -- "$partial_fasta" "$output_fasta"; then
      echo "Alignment complete: $output_fasta"
    else
      echo "MAFFT alignment failed for ${gene}/${file}" >&2
      rm -f -- "$partial_fasta"
      failed+=("${gene}/${file}")
    fi
  done
done

# Surface failures to the caller instead of always exiting 0.
if (( ${#failed[@]} > 0 )); then
  echo "${#failed[@]} alignment(s) failed or were skipped: ${failed[*]}" >&2
  exit 1
fi
#result
/Influenza
└── Preprocessed/
├── HA/
│ ├── A-H1N1.fasta
│ ├── A-H3N2.fasta
│ ├── ...
│ └── B.fasta
└── ...
└── (HA와 동일 구조)
└── MAFFT/
├── HA/
│ ├── A-H1N1.fasta
│ ├── A-H3N2.fasta
│ ├── ...
│ └── B.fasta
└── ...
└── (HA와 동일 구조)