author     Jonathan DeMasi <jonathan.demasi@colorado.edu>  2019-06-18 15:07:00 -0600
committer  Jonathan DeMasi <jonathan.demasi@colorado.edu>  2019-06-18 15:07:00 -0600
commit     a771ae27945bdda68740f3627a95a24da25fa5f6 (patch)
tree       a5e639f860eed60205f5cb2a6fa9508b084fea5e
parent     d85036e461cbb9dc360cba8791715c81e948d920 (diff)
fetching abstracts
-rwxr-xr-x  ncbi/dbsnp/dbsnp.py      56
-rw-r--r--  ncbi/dbsnp/ncbiutils.py  26
2 files changed, 37 insertions, 45 deletions
diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py
index abbe326..3861d96 100755
--- a/ncbi/dbsnp/dbsnp.py
+++ b/ncbi/dbsnp/dbsnp.py
@@ -1,16 +1,14 @@
#!/usr/bin/env python3
-import time
import ncbiutils
-from Bio import Entrez
-
+from lxml import etree
"""
Finds all rsids that are explicitly cited in pubmed
and returns a list
"""
 def get_complete_rsids():
-    results = ncbiutils.db_query(db="snp",retmode="json",retmax=200000,retstart=0,term='snp_pubmed_cited[sb]')
+    results = ncbiutils.esearch(db="snp", retmode="json", retmax=200000, retstart=0, term='snp_pubmed_cited[sb]')
     rsidlist = results["esearchresult"]["idlist"]
     for x in range(0, len(rsidlist)):
         rsidlist[x] = "rs" + rsidlist[x]
@@ -22,54 +20,30 @@ Generates a list of PMIDs that explicitly cite a given rsid
 def get_pmids(rsid):
     searchterm = rsid + "+AND+pubmed_snp_cited[sb]"
     print(searchterm)
-    results = ncbiutils.db_query(db="pubmed",retmode="json",retmax=200000,restart=0,term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09")
+    results = ncbiutils.esearch(db="pubmed", retmode="json", retmax=200000, retstart=0, term=searchterm, api_key="7c0213f7c513fa71fe2cb65b4dfefa76fb09")
     pmidlist = results["esearchresult"]["idlist"]
     print(pmidlist)
     return(pmidlist)
"""
-Takes the saved list of results from get_pmids and retrieves
-the raw XML to parse. Currently goes article by article instead of
-doing them in bulk. Could go either way. Unsure which way
-is better / more efficient.
+Takes a pmid and returns a list of its abstract text sections
"""
-
-def get_abstracts(results):
-    abstracts_list = []
-    for start in range(0, int(results["Count"]), 1):
-        # print("Going to download record %i to %i" % (start+1, end))
-        fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract",
-                                     retmode="xml", retstart=start,
-                                     retmax=1,
-                                     webenv=results["WebEnv"],
-                                     query_key=results["QueryKey"])
-        data = fetch_handle.read()
-        fetch_handle.close()
-        root = ET.fromstring(data)
-        for abst in root.iter('Abstract'):
-            for sec in abst.iter('AbstractText'):
-                abstracts_list.append(sec.text)
-    print(abstracts_list)
-    return(abstracts_list)
-
-
-def get_abstracts_from_list(pmids_list):
-    abstracts_list = []
-    for each_pmid in pmids_list:
-        fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml')
-        data = fetch_handle.read()
-        fetch_handle.close()
-        root = ET.fromstring(data)
-        for abst in root.iter('Abstract'):
-            for sec in abst.iter('AbstractText'):
-                abstracts_list.append(sec.text)
-    return(abstracts_list)
+def get_abstract(pmid):
+    raw = ncbiutils.efetch(db="pubmed", id=pmid, rettype="abstract")
+    xml = etree.fromstring(raw)
+    abstracts = []
+    for a in xml.xpath('//AbstractText'):
+        abstracts.append(a.text)
+    print(abstracts)
+    return(abstracts)
 def main():
     rsids = get_complete_rsids()
     for rsid in rsids:
-        get_pmids(rsid)
+        pmids = get_pmids(rsid)
+        for pmid in pmids:
+            get_abstract(pmid)
     return()
 if __name__ == '__main__':
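
The rewritten dbsnp.py now chains the three helpers together: esearch for the cited rsids, esearch again for the PMIDs citing each rsid, and efetch for the abstracts. A minimal sketch of driving that flow by hand, assuming the module is importable as dbsnp (the indexing and loop below are only illustrative):

    import dbsnp

    # All rsids with an explicit PubMed citation, already prefixed with "rs"
    rsids = dbsnp.get_complete_rsids()

    # For a single rsid, collect the citing PMIDs, then pull each abstract
    pmids = dbsnp.get_pmids(rsids[0])
    for pmid in pmids:
        sections = dbsnp.get_abstract(pmid)  # list of AbstractText strings from the efetch XML
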
diff --git a/ncbi/dbsnp/ncbiutils.py b/ncbi/dbsnp/ncbiutils.py
index b3c199a..9bc1c94 100644
--- a/ncbi/dbsnp/ncbiutils.py
+++ b/ncbi/dbsnp/ncbiutils.py
@@ -2,10 +2,11 @@
import requests
-BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
-
-# Example https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=snp&term=snp_pubmed_cited[sb]&retmax=200000&retstart=1000&retmode=json
-def db_query(**kwargs):
+"""
+Used to make an esearch and get the results back in json
+"""
+def esearch(**kwargs):
+ BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
args = []
for key, value in kwargs.items():
args.append(key+"="+str(value))
@@ -15,4 +16,21 @@ def db_query(**kwargs):
         results = resp.json()
         return(results)
     else:
+        print("You've encountered an error and we can't return your results")
+
+"""
+Used for an efetch, which is primarily to query specific IDs in dbsnp or pubmed
+Doesn't return json, but must return XML, apparently.
+"""
+def efetch(**kwargs):
+ BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+ args = []
+ for key, value in kwargs.items():
+ args.append(key+"="+str(value))
+ qstring = "&".join(args)
+ resp = requests.get(BASE_URL + qstring)
+ if resp.status_code == 200:
+ results = resp.text
+ return(results)
+ else:
print("You've encountered an error and we can't return your results") \ No newline at end of file