diff options
Diffstat (limited to 'ncbi/dbsnp/dbsnp.py')
-rwxr-xr-x | ncbi/dbsnp/dbsnp.py | 56 |
1 files changed, 15 insertions, 41 deletions
diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py index abbe326..3861d96 100755 --- a/ncbi/dbsnp/dbsnp.py +++ b/ncbi/dbsnp/dbsnp.py @@ -1,16 +1,14 @@ #!/usr/bin/env python3 -import time import ncbiutils -from Bio import Entrez - +from lxml import etree """ Finds all rsids that are explicitly cited in pubmed and returns a list """ def get_complete_rsids(): - results = ncbiutils.db_query(db="snp",retmode="json",retmax=200000,retstart=0,term='snp_pubmed_cited[sb]') + results = ncbiutils.esearch(db="snp", retmode="json", retmax=200000, retstart=0, term='snp_pubmed_cited[sb]') rsidlist = results["esearchresult"]["idlist"] for x in range(0, len(rsidlist)): rsidlist[x] = "rs" + rsidlist[x] @@ -22,54 +20,30 @@ Generates a list of PMIDs that are explicitly cite a given rsid def get_pmids(rsid): searchterm = rsid + "+AND+pubmed_snp_cited[sb]" print(searchterm) - results = ncbiutils.db_query(db="pubmed",retmode="json",retmax=200000,restart=0,term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") + results = ncbiutils.esearch(db="pubmed", retmode="json", retmax=200000, restart=0, term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") pmidlist = results["esearchresult"]["idlist"] print(pmidlist) return(pmidlist) """ -Takes the saved list of results from get_pmids and retrieves -the raw XML to parse. Currently goes article by article instead of -doing them in bulk. Could go either way. Unsure which way -is better / more efficient. +Takes a pmid and returns the abstract text """ - -def get_abstracts(results): - abstracts_list = [] - for start in range(0, int(results["Count"]), 1): - # print("Going to download record %i to %i" % (start+1, end)) - fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract", - retmode="xml", retstart=start, - retmax=1, - webenv=results["WebEnv"], - query_key=results["QueryKey"]) - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - print(abstracts_list) - return(abstracts_list) - - -def get_abstracts_from_list(pmids_list): - abstracts_list = [] - for each_pmid in pmids_list: - fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml') - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - return(abstracts_list) +def get_abstract(pmid): + raw = ncbiutils.efetch(db="pubmed", id=pmid, rettype="abstract") + xml = etree.fromstring(raw) + abstracts = [] + for a in xml.xpath('//AbstractText'): + abstracts.append(a.text) + print(abstracts) + return() def main(): rsids = get_complete_rsids() for rsid in rsids: - get_pmids(rsid) + pmids = get_pmids(rsid) + for pmid in pmids: + get_abstract(pmid) return() if __name__ == '__main__': |