From a771ae27945bdda68740f3627a95a24da25fa5f6 Mon Sep 17 00:00:00 2001 From: Jonathan DeMasi Date: Tue, 18 Jun 2019 15:07:00 -0600 Subject: fetching abstracts --- ncbi/dbsnp/dbsnp.py | 56 +++++++++++++------------------------------------ ncbi/dbsnp/ncbiutils.py | 26 +++++++++++++++++++---- 2 files changed, 37 insertions(+), 45 deletions(-) diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py index abbe326..3861d96 100755 --- a/ncbi/dbsnp/dbsnp.py +++ b/ncbi/dbsnp/dbsnp.py @@ -1,16 +1,14 @@ #!/usr/bin/env python3 -import time import ncbiutils -from Bio import Entrez - +from lxml import etree """ Finds all rsids that are explicitly cited in pubmed and returns a list """ def get_complete_rsids(): - results = ncbiutils.db_query(db="snp",retmode="json",retmax=200000,retstart=0,term='snp_pubmed_cited[sb]') + results = ncbiutils.esearch(db="snp", retmode="json", retmax=200000, retstart=0, term='snp_pubmed_cited[sb]') rsidlist = results["esearchresult"]["idlist"] for x in range(0, len(rsidlist)): rsidlist[x] = "rs" + rsidlist[x] @@ -22,54 +20,30 @@ Generates a list of PMIDs that are explicitly cite a given rsid def get_pmids(rsid): searchterm = rsid + "+AND+pubmed_snp_cited[sb]" print(searchterm) - results = ncbiutils.db_query(db="pubmed",retmode="json",retmax=200000,restart=0,term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") + results = ncbiutils.esearch(db="pubmed", retmode="json", retmax=200000, restart=0, term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09") pmidlist = results["esearchresult"]["idlist"] print(pmidlist) return(pmidlist) """ -Takes the saved list of results from get_pmids and retrieves -the raw XML to parse. Currently goes article by article instead of -doing them in bulk. Could go either way. Unsure which way -is better / more efficient. +Takes a pmid and returns the abstract text """ - -def get_abstracts(results): - abstracts_list = [] - for start in range(0, int(results["Count"]), 1): - # print("Going to download record %i to %i" % (start+1, end)) - fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract", - retmode="xml", retstart=start, - retmax=1, - webenv=results["WebEnv"], - query_key=results["QueryKey"]) - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - print(abstracts_list) - return(abstracts_list) - - -def get_abstracts_from_list(pmids_list): - abstracts_list = [] - for each_pmid in pmids_list: - fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml') - data = fetch_handle.read() - fetch_handle.close() - root = ET.fromstring(data) - for abst in root.iter('Abstract'): - for sec in abst.iter('AbstractText'): - abstracts_list.append(sec.text) - return(abstracts_list) +def get_abstract(pmid): + raw = ncbiutils.efetch(db="pubmed", id=pmid, rettype="abstract") + xml = etree.fromstring(raw) + abstracts = [] + for a in xml.xpath('//AbstractText'): + abstracts.append(a.text) + print(abstracts) + return() def main(): rsids = get_complete_rsids() for rsid in rsids: - get_pmids(rsid) + pmids = get_pmids(rsid) + for pmid in pmids: + get_abstract(pmid) return() if __name__ == '__main__': diff --git a/ncbi/dbsnp/ncbiutils.py b/ncbi/dbsnp/ncbiutils.py index b3c199a..9bc1c94 100644 --- a/ncbi/dbsnp/ncbiutils.py +++ b/ncbi/dbsnp/ncbiutils.py @@ -2,10 +2,11 @@ import requests -BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" - -# Example https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=snp&term=snp_pubmed_cited[sb]&retmax=200000&retstart=1000&retmode=json -def db_query(**kwargs): +""" +Used to make an esearch and get the results back in json +""" +def esearch(**kwargs): + BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" args = [] for key, value in kwargs.items(): args.append(key+"="+str(value)) @@ -14,5 +15,22 @@ def db_query(**kwargs): if resp.status_code == 200: results = resp.json() return(results) + else: + print("You've encountered an error and we can't return your results") + +""" +Used for an efetch, which is primarily to query specific IDs in dbsnp or pubmed +Doesn't return json, but must return XML, apparently. +""" +def efetch(**kwargs): + BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + args = [] + for key, value in kwargs.items(): + args.append(key+"="+str(value)) + qstring = "&".join(args) + resp = requests.get(BASE_URL + qstring) + if resp.status_code == 200: + results = resp.text + return(results) else: print("You've encountered an error and we can't return your results") \ No newline at end of file -- cgit v1.2.3