From a771ae27945bdda68740f3627a95a24da25fa5f6 Mon Sep 17 00:00:00 2001
From: Jonathan DeMasi <jonathan.demasi@colorado.edu>
Date: Tue, 18 Jun 2019 15:07:00 -0600
Subject: fetching abstracts

---
 ncbi/dbsnp/dbsnp.py     | 56 +++++++++++++------------------------------------
 ncbi/dbsnp/ncbiutils.py | 26 +++++++++++++++++++----
 2 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py
index abbe326..3861d96 100755
--- a/ncbi/dbsnp/dbsnp.py
+++ b/ncbi/dbsnp/dbsnp.py
@@ -1,16 +1,14 @@
 #!/usr/bin/env python3
 
-import time
 import ncbiutils
-from Bio import Entrez
-
+from lxml import etree
 
 """
 Finds all rsids that are explicitly cited in pubmed
 and returns a list
 """
 def get_complete_rsids():
-    results = ncbiutils.db_query(db="snp",retmode="json",retmax=200000,retstart=0,term='snp_pubmed_cited[sb]')
+    results = ncbiutils.esearch(db="snp", retmode="json", retmax=200000, retstart=0, term='snp_pubmed_cited[sb]')
     rsidlist = results["esearchresult"]["idlist"]
     for x in range(0, len(rsidlist)):
         rsidlist[x] = "rs" + rsidlist[x]
@@ -22,54 +20,30 @@ Generates a list of PMIDs that are explicitly cite a given rsid
 def get_pmids(rsid):
     searchterm = rsid + "+AND+pubmed_snp_cited[sb]"
     print(searchterm)
-    results = ncbiutils.db_query(db="pubmed",retmode="json",retmax=200000,restart=0,term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09")
+    results = ncbiutils.esearch(db="pubmed", retmode="json", retmax=200000, retstart=0, term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09")  # TODO(security): load api_key from the environment instead of committing it
     pmidlist = results["esearchresult"]["idlist"]
     print(pmidlist)
     return(pmidlist)
 
 
 """
-Takes the saved list of results from get_pmids and retrieves
-the raw XML to parse.  Currently goes article by article instead of
-doing them in bulk.  Could go either way. Unsure which way
-is better / more efficient.
+Takes a pmid and returns the abstract text
 """
-
-def get_abstracts(results):
-    abstracts_list = []
-    for start in range(0, int(results["Count"]), 1):
-        # print("Going to download record %i to %i" % (start+1, end))
-        fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract",
-                                        retmode="xml", retstart=start,
-                                        retmax=1,
-                                        webenv=results["WebEnv"],
-                                        query_key=results["QueryKey"])
-        data = fetch_handle.read()
-        fetch_handle.close()
-        root = ET.fromstring(data)
-        for abst in root.iter('Abstract'):
-            for sec in abst.iter('AbstractText'):
-                abstracts_list.append(sec.text)
-    print(abstracts_list)
-    return(abstracts_list)
-
-
-def get_abstracts_from_list(pmids_list):
-    abstracts_list = []
-    for each_pmid in pmids_list:
-        fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml')
-        data = fetch_handle.read()
-        fetch_handle.close()
-        root = ET.fromstring(data)
-        for abst in root.iter('Abstract'):
-            for sec in abst.iter('AbstractText'):
-                abstracts_list.append(sec.text)
-    return(abstracts_list)
+def get_abstract(pmid):
+    raw = ncbiutils.efetch(db="pubmed", id=pmid, rettype="abstract")
+    if raw is None:  # efetch prints an error and returns None on a non-200 response
+        return []
+    xml = etree.fromstring(raw)
+    abstracts = ["".join(a.itertext()) for a in xml.xpath('//AbstractText')]  # itertext keeps text after inline child markup
+    print(abstracts)
+    return abstracts  # list with one string per AbstractText element found
 
 def main():
     rsids = get_complete_rsids()
     for rsid in rsids:
-        get_pmids(rsid)
+        pmids = get_pmids(rsid)  # PMIDs returned by the pubmed esearch for this rsid
+        for pmid in pmids:
+            get_abstract(pmid)  # prints the parsed abstract sections; the return value is not used
     return()
 
 if __name__ == '__main__':
diff --git a/ncbi/dbsnp/ncbiutils.py b/ncbi/dbsnp/ncbiutils.py
index b3c199a..9bc1c94 100644
--- a/ncbi/dbsnp/ncbiutils.py
+++ b/ncbi/dbsnp/ncbiutils.py
@@ -2,10 +2,11 @@
 
 import requests
 
-BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
-
-# Example https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=snp&term=snp_pubmed_cited[sb]&retmax=200000&retstart=1000&retmode=json
-def db_query(**kwargs):
+"""
+Used to make an esearch and get the results back in json
+"""
+def esearch(**kwargs):
+    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
     args = []
     for key, value in kwargs.items():
         args.append(key+"="+str(value))
@@ -14,5 +15,22 @@ def db_query(**kwargs):
     if resp.status_code == 200:
         results = resp.json()
         return(results)
+    else:
+        print("You've encountered an error and we can't return your results")
+
+"""
+Used for an efetch, which primarily retrieves records for specific IDs in dbSNP or PubMed.
+Returns the raw response text rather than parsed JSON (efetch apparently returns XML here).
+"""
+def efetch(**kwargs):
+    # Each keyword argument becomes one key=value query parameter (not URL-encoded).
+    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+    qstring = "&".join(key + "=" + str(value) for key, value in kwargs.items())
+    # Without a timeout, requests can block forever on a stalled connection.
+    resp = requests.get(BASE_URL + qstring, timeout=30)
+    if resp.status_code == 200:
+        # Raw response body as text; on any other status the else branch prints
+        # an error and the function implicitly returns None.
+        return resp.text
     else:
         print("You've encountered an error and we can't return your results")
\ No newline at end of file
-- 
cgit v1.2.3