爬虫代码:
#coding:utf-8
import requests, json, random, time
from bs4 import BeautifulSoup
def dig(drugbank_accession_number="DB00460", proxies=None, timeout=30):
    """Scrape one DrugBank drug page and append its summary to the output files.

    The page ``https://go.drugbank.com/drugs/<accession number>`` is fetched
    and the generic name, background, type and chemical formula are read from
    the ``<dd>`` element that follows the matching ``<dt id=...>`` element.
    Two lines are then appended (one JSON object per line):

    * ``drug_text.json``       -> ``{drug_name: drug_text}``
    * ``drug_order_name.json`` -> ``{drugbank_accession_number: drug_name}``

    Args:
        drugbank_accession_number: Accession number appended to the drug URL.
        proxies: Proxy mapping handed to ``requests.get``. ``None`` (the
            default) selects the local proxy at ``http://127.0.0.1:7890``.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled connection cannot block the crawl forever.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page contains no generic-name entry, i.e. it is not a
            usable drug page.
    """
    url = "https://go.drugbank.com/drugs/" + drugbank_accession_number
    headers = {
        # The header value must be the bare browser string; the original value
        # accidentally embedded a second "User-Agent': '" prefix inside it.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        ),
    }
    if proxies is None:
        # Default: route both schemes through the local proxy endpoint.
        proxies = {
            "http": "http://127.0.0.1:7890",
            "https": "http://127.0.0.1:7890",
        }

    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a drug page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    drug_name = _dd_text(soup, 'generic-name')
    if not drug_name:
        # Without a name there is nothing meaningful to key the records on.
        raise ValueError(
            "no generic name found on {} (blocked or not a drug page?)".format(url)
        )
    background = _dd_text(soup, 'background')
    type_value = _dd_text(soup, 'type')
    chemical_formula = _dd_text(soup, 'chemical-formula')

    drug_text = _compose_drug_text(
        drug_name,
        drugbank_accession_number,
        background,
        type_value,
        chemical_formula,
    )

    # Append mode: every call adds one JSON object per line to each file.
    with open('drug_text.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps({drug_name: drug_text}, ensure_ascii=False) + '\n')
    with open('drug_order_name.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps({drugbank_accession_number: drug_name}, ensure_ascii=False) + '\n')


def _dd_text(soup, dt_id):
    """Return the stripped text of the <dd> after <dt id=dt_id>, or '' if absent."""
    dt = soup.find('dt', {'id': dt_id})
    if dt is None:
        return ''
    dd = dt.find_next_sibling('dd')
    if dd is None:
        return ''
    return dd.text.strip()


def _compose_drug_text(drug_name, accession_number, background, type_value,
                       chemical_formula):
    """Join the scraped fields into the one-paragraph drug description."""
    parts = []
    if background:
        parts.append(background + ' ')
    if drug_name:
        parts.append(drug_name)
    if type_value:
        parts.append(' is of the type {}'.format(type_value))
    # The accession number is always present, whatever else was found.
    parts.append(', number {}'.format(accession_number))
    if chemical_formula:
        parts.append(' and has the molecular formula {}.'.format(chemical_formula))
    return ''.join(parts)
# dig()
def main(start=1007, delay=3, id2node_path='id2node.json'):
    """Scrape every drug listed in the id-to-accession map, from ``start`` on.

    The JSON file at ``id2node_path`` is loaded and, for each index ``i`` in
    ``range(start, len(id2node))``, the value stored under the key ``str(i)``
    is passed to ``dig`` as the DrugBank accession number. Progress is printed
    as ``"<i>,<accession>, over."``.

    Any exception raised by ``dig`` propagates and stops the crawl; the last
    printed index tells where to resume by passing it as ``start``.

    Args:
        start: First index to process. Defaults to 1007, the resume point that
            was previously hard-coded in the loop.
        delay: Seconds to sleep after each drug.
        id2node_path: Path of the JSON mapping from index to accession number.
            NOTE(review): assumes the keys are the consecutive decimal strings
            "0" .. str(len - 1) -- confirm against the file.
    """
    with open(id2node_path, 'r', encoding='utf-8') as f:
        id2node = json.load(f)

    for i in range(start, len(id2node)):
        drugbank_accession_number = id2node[str(i)]
        # Flush so the prefix is visible while the request is still running
        # (no newline is written until the drug has been processed).
        print("{},{}".format(i, drugbank_accession_number), end='', flush=True)
        dig(drugbank_accession_number)
        print(', over.')
        time.sleep(delay)
# Script entry point: start the crawl when this file is executed.
# NOTE: there is no __main__ guard, so importing the module also runs it.
main()
其中,
# 代理IP池
proxies = {
"http": "http://127.0.0.1:7890",
"https": "http://127.0.0.1:7890",
}
指的是本地的 VPN 代理。我用的是 Clash 客户端,它的默认代理地址是 "http://127.0.0.1:7890"。