TFT #1 입력 시퀀스 생성
#2025-07-23
1. Load package
%load_ext autoreload
%autoreload 2
import sys
import pandas as pd
import numpy as np
import os
import pickle
import ast
# Make the project's helper-module directory importable before importing from it.
sys.path.append('/data3/projects/2025_Antibiotics/YSH/bin')
# Star-import the helpers from sc.py. Presumably this is where make_input,
# add_strain and make_sequence (called later in this notebook) come from
# -- TODO confirm against sc.py.
from sc import *
# Work from the workspace directory so that the relative paths used later
# ("data_knuch", "data_knuh") resolve against it.
os.chdir('/data3/projects/2025_Antibiotics/YSH/workspace')
2. Load raw data
#data
/data
├── PreprocessedData/
│   └── TimecourseData/
│       └── * (*: patient id)
│           ├── SeverityScore.csv
│           ├── Laboratory_processed.csv
│           └── Medication.csv
├── PreprocessedData_knuh/
│   └── (PreprocessedData와 동일)
└── 병원체자원은행 균주현황(2014-2024.06)_Sepsis.xlsx
/data_knuch
└── (empty)
/data_knuh
└── (empty)
# Time-course directories of the two cohorts; every entry inside them is
# treated as one patient id.
data_knuch = '/data/PreprocessedData/TimecourseData'
data_knuh = '/data/PreprocessedData_knuh/TimecourseData'
# os.listdir already returns a fresh list, so the two listings can simply be
# concatenated (KNUCH entries first, then KNUH).
pids = os.listdir(data_knuch) + os.listdir(data_knuh)
# Notebook cell result: total number of directory entries across both cohorts.
len(pids)
13779
3. Raw data processing
#processing knuch
# Build the KNUCH input dictionary from the raw time-course folders and cache
# it as data_knuch/Input.pkl (relative to the current working directory).
datadir = '/data/PreprocessedData/TimecourseData'
outdir = "data_knuch"
# Create the output directory before the processing starts: open(..., 'wb')
# does not create missing parent directories, and failing only at write time
# would throw away the work done by make_input / add_strain.
os.makedirs(outdir, exist_ok=True)

# Every entry of the time-course directory is passed on as a patient id.
pids = os.listdir(datadir)
input_dict = make_input(datadir, pids)
# add_strain returns the updated dictionary plus a second collection;
# presumably the patients without a matching strain record -- TODO confirm
# against sc.add_strain.
input_dict, no_strains = add_strain(input_dict)

with open(f"{outdir}/Input.pkl", 'wb') as f:
    pickle.dump(input_dict, f)

# Summary: number of keys in the input dictionary and size of no_strains.
print(len(input_dict.keys()))
print(len(no_strains))
4516
4
#processing knuh
# Build the KNUH input dictionary from the raw time-course folders and cache
# it as data_knuh/Input.pkl (relative to the current working directory).
datadir = '/data/PreprocessedData_knuh/TimecourseData'
outdir = "data_knuh"
# Create the output directory before the processing starts: open(..., 'wb')
# does not create missing parent directories, and failing only at write time
# would throw away the work done by make_input / add_strain.
os.makedirs(outdir, exist_ok=True)

# Every entry of the time-course directory is passed on as a patient id.
pids = os.listdir(datadir)
input_dict = make_input(datadir, pids)
# add_strain returns the updated dictionary plus a second collection;
# presumably the patients without a matching strain record -- TODO confirm
# against sc.add_strain.
input_dict, no_strains = add_strain(input_dict)

with open(f"{outdir}/Input.pkl", 'wb') as f:
    pickle.dump(input_dict, f)

# Summary: number of keys in the input dictionary and size of no_strains.
print(len(input_dict.keys()))
print(len(no_strains))
9100
1
#result
/data
├── PreprocessedData/
│   └── TimecourseData/
│       └── * (*: patient id)
│           ├── SeverityScore.csv
│           ├── Laboratory_processed.csv
│           └── Medication.csv
├── PreprocessedData_knuh/
│   └── (PreprocessedData와 동일)
└── 병원체자원은행 균주현황(2014-2024.06)_Sepsis.xlsx
/data_knuch
└── Input.pkl
/data_knuh
└── Input.pkl
4. Make input sequence
#data
/data
└── all_meds.txt
/data_knuch
├── Input.pkl
└── sequence
    └── (empty)
/data_knuh
├── Input.pkl
└── sequence
    └── (empty)
# Cohort selector: every path below is derived from it, so this cell handles
# one cohort per run ('knuh' here).
dtype = 'knuh'
indir = f'data_{dtype}'
outdir = f'data_{dtype}/sequence'
medinfo = '/data/all_meds.txt'

# One medication name per line; blank lines are skipped and "/" is replaced
# with "_" in each name.
with open(medinfo, 'r') as med_file:
    meds = [name.replace("/", "_") for name in map(str.strip, med_file) if name]

# Load the cached input dictionary for the selected cohort (the Input.pkl that
# the processing step of this notebook writes) and keep its keys as pids.
with open(f"{indir}/Input.pkl", 'rb') as pkl_file:
    input_dict = pickle.load(pkl_file)
pids = list(input_dict)

# One make_sequence call per medication, reading from indir and targeting outdir.
for med in meds:
    make_sequence(med, indir, outdir)
#result
/data
└── all_meds.txt
/data_knuch
├── Input.pkl
└── sequence
    └── *.pkl (*: antibiotics)
/data_knuh
├── Input.pkl
└── sequence
    └── *.pkl (*: antibiotics)
#functions
sc.py
provided in the GitHub repository