#3 밀도 기반 클러스터링 #
#2025-06-20
1. Load package #
import pandas as pd
import numpy as np
import os
os.sys.path.append("/data/home/ysh980101/2407/Mutclust")
from pathlib import Path
from Bin.Utils.utils import *
from Bin.arg_parser import *
from Bin.mlib import *
2. Find CCMs #
i = 1
tag = f"test{i}"
input_path = "/data/home/ysh980101/2407/Mutclust/Testdata/Input/GISAID_total.pickle"
outdir = f"/data/home/ysh980101/2407/Mutclust/Testdata/Output/GISAID_{tag}/"
Path(outdir).mkdir(parents=True, exist_ok=True)
info = set_env(input = input_path, output = outdir)
Input_df = readPickle(input_path)
init(Input_df, info)
mutInfo, ccms = get_candidate_core_mutations(Input_df, info, tag, i)
--- Configurations ---
Input data: '/data/home/ysh980101/2407/Mutclust/Testdata/Input/GISAID_total.pickle' (29903, 5)
Output dir: '/data/home/ysh980101/2407/Mutclust/Testdata/Output/GISAID_test1/'
Parameters:
Min Eps=5
Max Eps=1000
Min per_sum=0.0
Eps scaling factor=10.0
Expansion diminishing factor=3
Min cluster length=10
----------------------
Searching candidate core mutations...
1990 CCMs found.
sample_ccm = ccms[0]
mutInfo[sample_ccm]
{'index': 11,
'Position': 277,
'Frequency': 86,
'Percentage': 0.00038338430264178534,
'Entropy': 0.6078847228873923,
'H-score': 0.03323669788067187,
'length': 12,
'freq_sum': 1476,
'freq_avr': 123.0,
'per_sum': 0.0065799445430148274,
'per_avr': 0.0005483287119179023,
'ent_sum': 6.254087818941727,
'ent_avr': 0.5211739849118106,
'H-score_sum': 0.15877807556629392,
'H-score_avr': 0.01323150629719116,
'eps_scaler': 1,
'left_distance': 5,
'right_distance': 5,
'l_pos': 272,
'r_pos': 282,
'mut_n': 11}
3. Perform clustering #
hotspots = dynaclust(mutInfo, ccms, info, tag, i)
Performing dynamic clustering...
1990 clusters found
Merging clusters...
Merged clusters: 477
print(hotspots)
left_position right_position length \
0 272 290 19
1 332 347 16
2 358 392 35
3 433 448 16
4 482 495 14
.. ... ... ...
472 29568 29577 10
473 29581 29599 19
474 29613 29633 21
475 29640 29651 12
476 29654 29671 18
mut_positions
0 272,273,274,275,277,278,279,280,281,282,283,28...
1 332,334,335,336,337,338,341,343,344,345,346,347
2 358,360,361,362,363,364,365,366,367,368,369,37...
3 433,435,436,437,438,439,440,441,442,443,444,44...
4 482,483,485,487,488,489,490,491,493,495
.. ...
472 29568,29570,29571,29572,29573,29574,29575,29577
473 29581,29583,29584,29585,29586,29587,29588,2958...
474 29613,29615,29616,29617,29618,29619,29620,2962...
475 29640,29641,29643,29645,29647,29648,29649,2965...
476 29654,29655,29656,29657,29659,29660,29661,2966...
[477 rows x 4 columns]