TFT #2 입력 feature 생성

TFT #2 입력 feature 생성 #

#2025-07-23


1. Load package #

%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import numpy as np
import os
import pickle
import ast

sys.path.append('/data3/projects/2025_Antibiotics/YSH/bin')
from sc import *

os.chdir('/data3/projects/2025_Antibiotics/YSH/workspace')

2. Make feature1 #

#data

/data
└── all_meds.txt

/data_knuch
└── sequence
     └── *.pkl (*: antibiotics)

/data_knuh
└── sequence
     └── *.pkl (*: antibiotics)

medinfo = '/data/all_meds.txt'

with open(medinfo, 'r') as f:
    meds = [line.strip().replace("/", "_") for line in f if line.strip()]

outdir = f'data_{dtype}'

strain_dic = {}

for med in meds:
    with open(f'data_{dtype}/sequence/{med}.pkl', 'rb') as f:
        res_dict = pickle.load(f)
        
    feature1_list = []

    for pid, df in res_dict.items():
        news_bf = df.iloc[2]['NEWS']  # 3번째 행 (0-indexed)
        news_af = df.iloc[3:]['NEWS'].max()  # 4번째 행부터 마지막까지 중 최댓값

        if news_af < news_bf:  # "작은" 경우만 (같은 건 포함하지 않음)
            feature1_list.append(pid)
    
    #print(len(feature1_list))
    filtered_res_dict = {pid: res_dict[pid] for pid in feature1_list if pid in res_dict}
    
    with open(f"data_{dtype}/temp/feature1/{med}.pkl", 'wb') as f:
        pickle.dump(filtered_res_dict, f)

    for pid, df in filtered_res_dict.items():
        if len(df) < 3:
            continue  

        try:
            cur_strain = df.iloc[2]['strain']
            if isinstance(cur_strain, list):
                strains = cur_strain
            else:
                strains = [cur_strain]
        except Exception as e:
            #print(med)
            continue

        for strain in strains:
            if strain in strain_dic:
                strain_dic[strain].append(med)
            else:
                strain_dic[strain] = [med]

for strain in strain_dic:
    strain_dic[strain] = list(set(strain_dic[strain]))
# Save feature1
with open(f"{outdir}/feature1.pkl", 'wb') as f:
    pickle.dump(strain_dic, f)

#result

/data
└── all_meds.txt

/data_knuch
├── sequence
│    └── *.pkl (*: antibiotics)
└── feature1.pkl

/data_knuh
├── sequence
│    └── *.pkl (*: antibiotics)
└── feature1.pkl

#functions

sc.py provided in github