diff options
author | Jonathan DeMasi <jonathan.demasi@colorado.edu> | 2019-06-17 22:44:00 -0600 |
---|---|---|
committer | Jonathan DeMasi <jonathan.demasi@colorado.edu> | 2019-06-17 22:44:00 -0600 |
commit | 1abbe71dfaa852e92248f0d97b6178d67df2071f (patch) | |
tree | 3817ccd4ad5b0ab89735682fe6d882b5b22b68e4 /ncbi | |
parent | e951e4c41c23e0927ff53ddc263ca054691502d4 (diff) | |
download | snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar.gz snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar.bz2 snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar.lz snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar.xz snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.tar.zst snippets-1abbe71dfaa852e92248f0d97b6178d67df2071f.zip |
Added other helper functions to get abstracts and such
Diffstat (limited to 'ncbi')
-rwxr-xr-x | ncbi/dbsnp/dbsnp.py | 42 |
1 files changed, 41 insertions, 1 deletions
```diff
diff --git a/ncbi/dbsnp/dbsnp.py b/ncbi/dbsnp/dbsnp.py
index 00ccb07..07d51fa 100755
--- a/ncbi/dbsnp/dbsnp.py
+++ b/ncbi/dbsnp/dbsnp.py
@@ -54,6 +54,46 @@ def get_pmids(interm):
         search_results["Count"] + " results using search string'" + searchstring + "'")
     return(search_results)
 
+
+"""
+Takes the saved list of results from get_pmids and retrieves
+the raw XML to parse. Currently goes article by article instead of
+doing them in bulk. Could go either way. Unsure which way
+is better / more efficient.
+"""
+
+def get_abstracts(results):
+    abstracts_list = []
+    for start in range(0, int(results["Count"]), 1):
+        end = min(int(results["Count"]), start + 1)
+        # print("Going to download record %i to %i" % (start+1, end))
+        fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract",
+                                     retmode="xml", retstart=start,
+                                     retmax=1,
+                                     webenv=results["WebEnv"],
+                                     query_key=results["QueryKey"])
+        data = fetch_handle.read()
+        fetch_handle.close()
+        root = ET.fromstring(data)
+        for abst in root.iter('Abstract'):
+            for sec in abst.iter('AbstractText'):
+                abstracts_list.append(sec.text)
+    return(abstracts_list)
+
+
+def get_abstracts_from_list(pmids_list):
+    abstracts_list = []
+    pmids_abstracts_dict = {}
+    for each_pmid in pmids_list:
+        fetch_handle = Entrez.efetch(db="pubmed", id=each_pmid, retmode='xml')
+        data = fetch_handle.read()
+        fetch_handle.close()
+        root = ET.fromstring(data)
+        for abst in root.iter('Abstract'):
+            for sec in abst.iter('AbstractText'):
+                abstracts_list.append(sec.text)
+    return(abstracts_list)
+
 def main():
     rsids = get_complete_rsids()
     if DEBUG:
@@ -61,7 +101,7 @@ def main():
             print(x)
     for x in rsids:
         get_pmids(x)
-        time.sleep(1)
+        time.sleep(0.5)
     return()
 
 if __name__ == '__main__':
```