
Selenium: Crawling Influenza FASTA Files #

#2025-07-28


1. Load package #

import pandas as pd
import numpy as np
import os

# Selenium and driver-management imports used in step 3
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

#

2. Set path #

os.chdir('/Users/yshmbid/Desktop/workspace/gisaid')
os.getcwd()
'/Users/yshmbid/Desktop/workspace/gisaid'
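
GISAID's FASTA export arrives as a regular browser download. A minimal sketch, assuming Chrome is used and the export should land in the folder above rather than in ~/Downloads, is to set the download directory on the driver options (the options object would then be passed to webdriver.Chrome in step 3):

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    'download.default_directory': '/Users/yshmbid/Desktop/workspace/gisaid',
    'download.prompt_for_download': False,  # save without a prompt
})
# later: crawler = webdriver.Chrome(service=chrome_service, options=options)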

#

3. Run crawling #

# Install a matching ChromeDriver and pass it in via a Service object
chrome_service = Service(ChromeDriverManager().install())

try:
    # launch Chrome with the managed driver
    crawler = webdriver.Chrome(service=chrome_service)
except Exception:
    # fall back to chromedriver_autoinstaller if the managed driver fails,
    # then launch without the stale Service object
    chromedriver_autoinstaller.install(True)
    crawler = webdriver.Chrome()

crawler.implicitly_wait(6)  # implicit wait for element lookups
crawler.get('https://gisaid.org/')  # open the GISAID site

# click Login
engine = WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="menuequer"]/li[7]/a')))
engine.click()

# enter the user ID
WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="elogin"]')))
engine = crawler.find_element(By.XPATH, '//*[@id="elogin"]')
crawler.execute_script("arguments[0].click();", engine)
engine.send_keys('*')  # '*': ID redacted

# enter the password
engine = crawler.find_element(By.XPATH, '//*[@id="epassword"]')
crawler.execute_script("arguments[0].click();", engine)
engine.send_keys('*')  # '*': password redacted

# submit the login form
engine = WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login"]/div[2]/input[3]')))
engine.click()

# select EpiFlu
engine = crawler.find_element(By.XPATH, '//*[@id="main_nav"]/ul/li[2]/a')
engine.click()

# select Search & Browse
#engine = crawler.find_element(By.XPATH, '//*[@id="c_sjk17x_ey-c_sjk17x_ey"]/div/div/div[7]/div')  # previous day's xpath
engine = crawler.find_element(By.XPATH, '//*[@id="c_sjlgnx_11g-c_sjlgnx_11g"]/div/div/div[7]/div')
engine.click()

# set the collection-date range
#engine = crawler.find_element(By.XPATH, '//*[@id="ce_sjk17x_q_input"]')  # previous day's xpath
engine = crawler.find_element(By.XPATH, '//*[@id="ce_sjlgnx_hv_input"]')
engine.click()
engine.send_keys('2024-01-01')

#engine = crawler.find_element(By.XPATH, '//*[@id="ce_sjk17x_r_input"]')  # previous day's xpath
engine = crawler.find_element(By.XPATH, '//*[@id="ce_sjlgnx_hw_input"]')
engine.click()
engine.send_keys('2024-02-01')
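
# (Sketch, not part of the original run: to crawl several months, the two
# date fields above could be filled in a loop from a hypothetical helper
# that yields consecutive month boundaries such as ('2024-01-01', '2024-02-01').)
from datetime import date

def month_ranges(start_year, start_month, n_months):
    y, m = start_year, start_month
    for _ in range(n_months):
        ny, nm = (y + 1, 1) if m == 12 else (y, m + 1)
        yield date(y, m, 1).isoformat(), date(ny, nm, 1).isoformat()
        y, m = ny, nm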

# click Search
#engine = WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ce_sjk17x_1p"]/div/button')))  # previous day's xpath
engine = WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ce_sjlgnx_iu"]/div/button')))
engine.click()

# check all results
engine = crawler.find_element(By.XPATH, '//*[@id="yui-dt0-th-c-liner"]/span/input')
engine.click()
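
# (Sketch, not part of the original run: the script stops after selecting all
# records. A continuation would click GISAID's Download button, for which the
# xpath below is only a placeholder since these generated ids change, and then
# poll the working directory from step 2 for the finished FASTA file.)
#engine = WebDriverWait(crawler, 10).until(EC.element_to_be_clickable((By.XPATH, '<download-button-xpath>')))
#engine.click()
import glob, time
deadline = time.time() + 300  # allow up to five minutes for the export
while time.time() < deadline and not glob.glob('*.fasta'):
    time.sleep(5)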

#

  • A job that left me wondering whether it really made sense to use 1.9 million sequences as the validation set when only 220,000 sequences had gone into developing the algorithm
  • On top of that, GISAID's xpaths were regenerated every day, so I remember patching the code each morning before every run (one workaround is sketched after this list)
  • And for some reason the crawler errored out whenever I connected over a mobile hotspot, so for this whole stretch I couldn't even go to the lounge and had to stay in the lab..
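
The daily breakage comes from GISAID's auto-generated element ids (the c_sjlgnx_* / ce_sjlgnx_* prefixes above, versus the previous day's c_sjk17x_* ids). A minimal sketch of one workaround, assuming the visible label text stays stable even when the ids change, is to locate clickable elements by their text instead:

def click_by_text(driver, text, timeout=10):
    # match on the element's visible label rather than its generated id
    locator = (By.XPATH, f'//*[normalize-space(text())="{text}"]')
    WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator)).click()

# e.g. click_by_text(crawler, 'Search') instead of '//*[@id="ce_sjlgnx_iu"]/div/button'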

#