Selenium: Influenza fasta 파일 크롤링 #
#2025-07-28
1. Load package #
import pandas as pd
import numpy as np
import os
#
2. Set path #
# Run from the GISAID workspace so that relative paths resolve there.
WORKSPACE_DIR = '/Users/yshmbid/Desktop/workspace/gisaid'
os.chdir(WORKSPACE_DIR)
os.getcwd()  # echoes the working directory when run as a notebook cell
'/Users/yshmbid/Desktop/workspace/gisaid'
#
3. Run crawling #
# Start the Chrome session used for the whole crawl.
# NOTE(review): webdriver, Service, ChromeDriverManager and
# chromedriver_autoinstaller must already be imported; those imports are not
# visible in this section of the file.
try:
    # Preferred route: let webdriver-manager download a matching ChromeDriver
    # and hand its path to Selenium through a Service object. The download is
    # inside the try so that a failed download also triggers the fallback.
    chrome_service = Service(ChromeDriverManager().install())
    crawler = webdriver.Chrome(service=chrome_service)
except Exception:
    # Deliberate best-effort fallback (Exception, not a bare except, so that
    # Ctrl-C / SystemExit still propagate). Install a driver with
    # chromedriver_autoinstaller and build a NEW Service from it; reusing the
    # Service that just failed would only repeat the same error. If this
    # attempt fails as well, the traceback shows both errors.
    fallback_driver_path = chromedriver_autoinstaller.install(True)
    chrome_service = (
        Service(fallback_driver_path) if fallback_driver_path else Service()
    )
    crawler = webdriver.Chrome(service=chrome_service)

crawler.implicitly_wait(6)  # implicit wait (seconds) applied to element lookups
crawler.get('https://gisaid.org/')  # open the GISAID landing page
# --- Log in to GISAID ---
LOGIN_MENU_XPATH = '//*[@id="menuequer"]/li[7]/a'
USER_FIELD_XPATH = '//*[@id="elogin"]'
PASSWORD_FIELD_XPATH = '//*[@id="epassword"]'
LOGIN_SUBMIT_XPATH = '//*[@id="login"]/div[2]/input[3]'

# Open the login form from the top menu once the link is clickable.
engine = WebDriverWait(crawler, 10).until(
    EC.element_to_be_clickable((By.XPATH, LOGIN_MENU_XPATH))
)
engine.click()

# User id: wait for the field, focus it with a JavaScript click, then type.
WebDriverWait(crawler, 10).until(
    EC.element_to_be_clickable((By.XPATH, USER_FIELD_XPATH))
)
engine = crawler.find_element(By.XPATH, USER_FIELD_XPATH)
crawler.execute_script("arguments[0].click();", engine)
engine.send_keys('*')  # '*' is a redacted placeholder for the real id

# Password: located directly (no explicit wait), focused the same way.
engine = crawler.find_element(By.XPATH, PASSWORD_FIELD_XPATH)
crawler.execute_script("arguments[0].click();", engine)
engine.send_keys('*')  # '*' is a redacted placeholder for the real password

# Submit the form once the login button is clickable.
engine = WebDriverWait(crawler, 10).until(
    EC.element_to_be_clickable((By.XPATH, LOGIN_SUBMIT_XPATH))
)
engine.click()
# --- Open EpiFlu, filter by collection date, run the search, select all hits ---
# NOTE(review): the "sjlgnx" ids below appear to be regenerated by the site;
# earlier runs of this script used ids with an "sjk17x" infix instead
# (e.g. 'ce_sjk17x_q_input'). Refresh these constants when lookups time out.
EPIFLU_TAB_XPATH = '//*[@id="main_nav"]/ul/li[2]/a'
SEARCH_BROWSE_XPATH = '//*[@id="c_sjlgnx_11g-c_sjlgnx_11g"]/div/div/div[7]/div'
COLLECTION_FROM_XPATH = '//*[@id="ce_sjlgnx_hv_input"]'
COLLECTION_TO_XPATH = '//*[@id="ce_sjlgnx_hw_input"]'
SEARCH_BUTTON_XPATH = '//*[@id="ce_sjlgnx_iu"]/div/button'
CHECK_ALL_XPATH = '//*[@id="yui-dt0-th-c-liner"]/span/input'

# Collection-date window typed into the two date inputs.
COLLECTION_DATE_FROM = '2024-01-01'
COLLECTION_DATE_TO = '2024-02-01'

# Every step waits until its target is clickable: each click triggers a page
# or panel update, and a bare find_element right after it can race the render.
page_wait = WebDriverWait(crawler, 10)

# Open the EpiFlu tab.
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, EPIFLU_TAB_XPATH)))
engine.click()

# Open "Search & Browse".
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, SEARCH_BROWSE_XPATH)))
engine.click()

# Collection date: start of the range.
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, COLLECTION_FROM_XPATH)))
engine.click()
engine.send_keys(COLLECTION_DATE_FROM)

# Collection date: end of the range.
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, COLLECTION_TO_XPATH)))
engine.click()
engine.send_keys(COLLECTION_DATE_TO)

# Run the search.
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, SEARCH_BUTTON_XPATH)))
engine.click()

# Tick the header checkbox to select every row of the result table.
engine = page_wait.until(EC.element_to_be_clickable((By.XPATH, CHECK_ALL_XPATH)))
engine.click()
#
- 분명 알고리즘 개발에는 22만 개 sequence를 사용했는데, validation set으로 190만 개 sequence를 쓰는 게 맞았을까 하는 생각이 들었던 작업
- 그 와중에 GISAID의 XPath가 매일 업데이트돼서 매일 아침 코드를 수정해 가면서 돌렸던 기억이 있다
- 그리고 이상하게도 핫스팟에 연결하면 오류가 나서, 이 기간엔 라운지도 못 가고 연구실에만 있어야 했다.