NCBI提供了丰富的接口,文档可参考:文档主目录、方法说明和参数设置、返回值的可选类型和模式,以及九种接口简介(原文此处为超链接,纯文本版本中链接已丢失)
本文的一些参考资源(这些资料帮助了我这个很久不写python的假程序员):
Lxml库及Xpath语法详解
How to set the pandas dataframe data left/right alignment?
import requests
from lxml import etree
import time
import os
关键词作为初步检索的条件,待拿到abstract或summary后,可以进一步筛选信息
# Keyword expression for the initial Entrez search; the abstracts/summaries
# fetched later are narrowed further with local regexes.
key_word = '((colon cancer) OR (colorectal cancer) OR (rectal cancer)) and ((radiation) OR (radiotherapy))'
# 'SI[gene]+AND+cancer'
# ESearch against PubMed with the History server enabled ('usehistory=y') so
# the follow-up ESummary/EFetch calls can reuse WebEnv/query_key instead of
# re-sending ID lists.  Parameter names follow the E-utilities docs (retmax).
search_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
                              params={'db': 'pubmed',
                                      'term': key_word,
                                      'usehistory': 'y',
                                      'retmax': '10',
                                      },
                              timeout=30)  # fail fast instead of hanging forever on a stalled connection
search_results.raise_for_status()  # surface HTTP-level failures immediately
body = search_results.text
# ESearch replies in XML; extract the History-server handles with XPath.
# Each xpath() call returns a list of text nodes (normally one element).
xml = etree.XML(body.encode(), etree.XMLParser())
webenv = xml.xpath('//WebEnv/text()')
QueryKey = xml.xpath('//QueryKey/text()')
Summary为XML格式结构化的完整信息
# ESummary: structured (XML) summaries of the records found by the search
# above, addressed via the History server (WebEnv + query_key).
summary_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
                               params={'query_key': QueryKey,  # documented param name is lowercase 'query_key'
                                       'db': 'pubmed',
                                       'WebEnv': webenv,
                                       'retmode': 'text',
                                       'version': '2.0'
                                       },
                               timeout=30)
summary_results.raise_for_status()
body = summary_results.text
# makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
os.makedirs("PubMed", exist_ok=True)
file_name = "PubMed/summary_results_" + time.strftime("%Y%m%d_%H.%M", time.localtime()) + ".txt"
with open(file_name, "w", encoding='utf-8') as txt:
    txt.write(body)
Fetch为非结构化的文本列表,pubmed的abstract主要使用这种方式获取
# EFetch: unstructured text records; for PubMed, 'rettype=abstract' with
# plain-text retmode is the usual way to download abstracts in bulk.
fetch_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
                             params={'query_key': QueryKey,  # documented param name is lowercase 'query_key'
                                     'db': 'pubmed',
                                     'WebEnv': webenv,
                                     'rettype': 'abstract',
                                     # 'rettype': 'Summary',
                                     'retmode': 'text'
                                     },
                             timeout=30)
fetch_results.raise_for_status()
body = fetch_results.text
# makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
os.makedirs("PubMed", exist_ok=True)
file_name = "PubMed/fetch_results_" + time.strftime("%Y%m%d_%H.%M", time.localtime()) + ".txt"
with open(file_name, "w", encoding='utf-8') as txt:
    txt.write(body)
# Same keyword search, now against GEO DataSets (db='gds'), again parking
# the result set on the History server for the follow-up calls.
search_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
                              params={'db': 'gds',
                                      'term': key_word,
                                      'usehistory': 'y',
                                      'retmax': '10',
                                      },
                              timeout=30)
search_results.raise_for_status()
body = search_results.text
# Extract the fresh WebEnv/QueryKey for this GEO search (overwrites the
# PubMed handles above — the PubMed downloads are already finished).
xml = etree.XML(body.encode(), etree.XMLParser())
webenv = xml.xpath('//WebEnv/text()')
QueryKey = xml.xpath('//QueryKey/text()')
Summary为XML格式结构化的完整信息,GEO的summary信息量较大
# ESummary for GEO: version 2.0 returns a DocumentSummarySet with rich
# per-record fields (Accession, title, summary, PDAT) parsed further below.
summary_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
                               params={'query_key': QueryKey,  # documented param name is lowercase 'query_key'
                                       'db': 'gds',
                                       'WebEnv': webenv,
                                       'retmode': 'text',
                                       'version': '2.0'
                                       },
                               timeout=30)
summary_results.raise_for_status()
body = summary_results.text
# makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
os.makedirs("GEO", exist_ok=True)
file_name = "GEO/summary_results_" + time.strftime("%Y%m%d_%H.%M", time.localtime()) + ".txt"
with open(file_name, "w", encoding='utf-8') as txt:
    txt.write(body)
Fetch为非结构化的文本列表
# EFetch for GEO: plain-text 'Summary' records, saved for manual inspection.
fetch_results = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
                             params={'query_key': QueryKey,  # documented param name is lowercase 'query_key'
                                     'db': 'gds',
                                     'WebEnv': webenv,
                                     'rettype': 'Summary',
                                     'retmode': 'text'
                                     },
                             timeout=30)
fetch_results.raise_for_status()
body = fetch_results.text
# makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
os.makedirs("GEO", exist_ok=True)
file_name = "GEO/fetch_results_" + time.strftime("%Y%m%d_%H.%M", time.localtime()) + ".txt"
with open(file_name, "w", encoding='utf-8') as txt:
    txt.write(body)
# Parse the GEO ESummary (v2.0) XML and pull out, per record, the free-text
# summary plus the fields shown in the final report table.
body = summary_results.text
xml = etree.XML(body.encode(), etree.XMLParser())
# NOTE(review): '//summary' assumes every DocumentSummary carries exactly one
# <summary> node, so summary[i] lines up with DocumentSummary[i] — verify
# against a record that lacks a summary.
summary = xml.xpath('//summary/text()')
title = xml.xpath('//title/text()')
DocumentSummary = xml.xpath('./DocumentSummarySet/DocumentSummary')
# len(DocumentSummary) was 1150 for the original query.
search_range = range(0, len(DocumentSummary))
# Per-record extraction.  Each xpath() call returns a (possibly empty) list;
# the [0] unwrapping happens later when the matches are collected.
# (Original note: this index loop could equally be written with map/filter in
# a functional style; comprehensions are the idiomatic Python equivalent.)
Accession = [doc.xpath('./Accession/text()') for doc in DocumentSummary]
title = [doc.xpath('./title/text()') for doc in DocumentSummary]  # overwrites the flat '//title' list above
PDAT = [doc.xpath('./PDAT/text()') for doc in DocumentSummary]
import re

# Patterns used to post-filter the GEO summaries returned by the broad
# Entrez query; matching is a plain case-sensitive substring search.
patterns = [re.compile('colon cancer'),
            re.compile('rectal cancer'),
            re.compile('radiation'),
            re.compile('radiotherapy'),
            re.compile('after')]
# Derive the count from the list instead of hard-coding 5 (the original
# 'n_pattens' name — a typo for n_patterns — is kept for compatibility).
n_pattens = len(patterns)

def match_info(data):
    """Return True if *data* mentions (colon|rectal cancer) AND
    (radiation|radiotherapy) AND the word 'after'; otherwise False.

    :param data: free-text summary string to screen
    """
    counts = [len(p.findall(data)) for p in patterns]
    return ((counts[0] > 0 or counts[1] > 0)
            and (counts[2] > 0 or counts[3] > 0)
            and counts[4] > 0)
# This loop could also be expressed functionally, e.g.
#   flags = list(map(match_info, summary))
# followed by a filter over the indices; the explicit loop below is kept
# because it collects three parallel output lists in a single pass.
#------------------------------------ for version ------------------------------------
search_results = []
Accession_match = []
Title_match = []
PDAT_match = []
for idx in search_range:
    matched = match_info(summary[idx])
    search_results.append(matched)
    if matched:
        # xpath() returned single-element lists; unwrap to plain strings.
        Accession_match.append(Accession[idx][0])
        Title_match.append(title[idx][0])
        PDAT_match.append(PDAT[idx][0])
import pandas as pd

# Assemble the matched GEO records into a table for notebook display.
Match_results = pd.DataFrame({
    'Accession': Accession_match,
    'Title': Title_match,
    'Date': PDAT_match
})
# Use the fully qualified option names: the bare 'max_colwidth' shorthand
# relies on pandas' prefix matching and raises OptionError if it ever
# becomes ambiguous across pandas versions.
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.expand_frame_repr', True)
# Left-align data cells and center the header row (HTML Styler output).
dfStyler = Match_results.style.set_properties(**{'text-align': 'left'})
dfStyler.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
dfStyler