diff options
author | Jonathan DeMasi <jonathan.demasi@colorado.edu> | 2019-06-18 15:07:00 -0600 |
---|---|---|
committer | Jonathan DeMasi <jonathan.demasi@colorado.edu> | 2019-06-18 15:07:00 -0600 |
commit | a771ae27945bdda68740f3627a95a24da25fa5f6 (patch) | |
tree | a5e639f860eed60205f5cb2a6fa9508b084fea5e /ncbi/dbsnp/dbsnp.py | |
parent | d85036e461cbb9dc360cba8791715c81e948d920 (diff) | |
download | snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar.gz snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar.bz2 snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar.lz snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar.xz snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.tar.zst snippets-a771ae27945bdda68740f3627a95a24da25fa5f6.zip |
fetching abstracts
Diffstat (limited to 'ncbi/dbsnp/dbsnp.py')
-rwxr-xr-x | ncbi/dbsnp/dbsnp.py | 56 |
1 files changed, 15 insertions, 41 deletions
diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py index abbe326..3861d96 100755 --- a/ncbi/dbsnp/dbsnp.py +++ b/ncbi/dbsnp/dbsnp.py @@ -1,16 +1,14 @@ #!/usr/bin/env python3 -import time import ncbiutils -from Bio import Entrez - +from lxml import etree """ Finds all rsids that are explicitly cited in pubmed and returns a list """ def get_complete_rsids(): - results = ncbiutils.db_query(db="snp",retmode="json",retmax=200000,retstart=0,term='snp_pubmed_cited[sb]') + results = ncbiutils.esearch(db="snp", retmode="json", retmax=200000, retstart=0, term='snp_pubmed_cited[sb]') rsidlist = results["esearchresult"]["idlist"] for x in range(0, len(rsidlist)): rsidlist[x] = "rs" + rsidlist[x] @@ -22,54 +20,30 @@ Generates a list of PMIDs that are explicitly cite a given rsid def get_pmids(rsid): searchterm = rsid + "+AND+pubmed_snp_cited[sb]" print(searchterm) - results = ncbiutils.db_query(db="pubmed",retmode="json",retmax=200000,restart=0,term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") + results = ncbiutils.esearch(db="pubmed", retmode="json", retmax=200000, restart=0, term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") pmidlist = results["esearchresult"]["idlist"] print(pmidlist) return(pmidlist) """ -Takes the saved list of results from get_pmids and retrieves -the raw XML to parse. Currently goes article by article instead of -doing them in bulk. Could go either way. Unsure which way -is better / more efficient. +Takes a pmid and returns the abstract text """ - -def get_abstracts(results): - abstracts_list = [] - for start in range(0, int(results["Count"]), 1): - # print("Going to download record %i to %i" % (start+1, end)) - fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract", - retmode="xml", retstart=start, - retmax=1, - webenv=results["WebEnv"], - query_key=results["QueryKey"]) - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - print(abstracts_list) - return(abstracts_list) - - -def get_abstracts_from_list(pmids_list): - abstracts_list = [] - for each_pmid in pmids_list: - fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml') - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - return(abstracts_list) +def get_abstract(pmid): + raw = ncbiutils.efetch(db="pubmed", id=pmid, rettype="abstract") + xml = etree.fromstring(raw) + abstracts = [] + for a in xml.xpath('//AbstractText'): + abstracts.append(a.text) + print(abstracts) + return() def main(): rsids = get_complete_rsids() for rsid in rsids: - get_pmids(rsid) + pmids = get_pmids(rsid) + for pmid in pmids: + get_abstract(pmid) return() if __name__ == '__main__': |