mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-07 22:44:25 +08:00
290 lines
7.1 KiB
Python
Executable File
290 lines
7.1 KiB
Python
Executable File
# $Id$
|
|
#
|
|
# Copyright (C) 2003-2006 Rational Discovery LLC
|
|
#
|
|
# @@ All Rights Reserved @@
|
|
#
|
|
""" Tools for doing PubMed searches and processing the results
|
|
|
|
NOTE: much of the example code in the documentation here uses XML
|
|
files from the test_data directory in order to avoid having to call
|
|
out to PubMed itself. Actual calls to the functions would not include
|
|
the _conn_ argument.
|
|
|
|
"""
|
|
import RDConfig
|
|
import QueryParams,Records
|
|
import urllib,urllib2
|
|
from elementtree import ElementTree
|
|
|
|
def GetNumHits(query,url=QueryParams.searchBase):
|
|
""" returns a tuple of pubmed ids (strings) for the query provided
|
|
|
|
To do a search, we need a query object:
|
|
>>> query = QueryParams.details()
|
|
|
|
set up the search parameters:
|
|
>>> query['term'] = 'penzotti je AND grootenhuis pd'
|
|
>>> query['field'] = 'auth'
|
|
|
|
now get the search ids:
|
|
>>> counts = GetNumHits(query)
|
|
>>> counts
|
|
2
|
|
|
|
alternately, we can search using field specifiers:
|
|
>>> query = QueryParams.details()
|
|
>>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
|
|
>>> counts = GetNumHits(query)
|
|
>>> counts
|
|
3
|
|
|
|
|
|
"""
|
|
query['rettype']='count'
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
pubmed = ElementTree.parse(conn)
|
|
countText = pubmed.findtext('Count')
|
|
if countText:
|
|
res = int(countText)
|
|
else:
|
|
res = 0
|
|
return res
|
|
|
|
|
|
def GetSearchIds(query,url=QueryParams.searchBase):
|
|
""" returns a tuple of pubmed ids (strings) for the query provided
|
|
|
|
To do a search, we need a query object:
|
|
>>> query = QueryParams.details()
|
|
|
|
set up the search parameters:
|
|
>>> query['term'] = 'penzotti je AND grootenhuis pd'
|
|
>>> query['field'] = 'auth'
|
|
|
|
now get the search ids:
|
|
>>> ids = GetSearchIds(query)
|
|
>>> len(ids)
|
|
2
|
|
>>> ids[0]
|
|
'11960484'
|
|
>>> ids[1]
|
|
'10893315'
|
|
|
|
|
|
"""
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
pubmed = ElementTree.parse(conn)
|
|
res = [id.text for id in pubmed.getiterator('Id')]
|
|
return tuple(res)
|
|
|
|
def GetSummaries(ids,query=None,url=QueryParams.summaryBase,conn=None):
|
|
""" gets a set of document summary records for the ids provided
|
|
|
|
>>> ids = ['11960484']
|
|
>>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
|
|
>>> len(summs)
|
|
1
|
|
>>> rec = summs[0]
|
|
>>> isinstance(rec,Records.SummaryRecord)
|
|
1
|
|
>>> rec.PubMedId
|
|
'11960484'
|
|
>>> rec.Authors
|
|
'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
|
|
>>> rec.Title
|
|
'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
|
|
>>> rec.Source
|
|
'J Med Chem'
|
|
>>> rec.Volume
|
|
'45'
|
|
>>> rec.Pages
|
|
'1737-40'
|
|
>>> rec.HasAbstract
|
|
'1'
|
|
|
|
"""
|
|
if not conn:
|
|
try:
|
|
iter(ids)
|
|
except TypeError:
|
|
ids = [ids,]
|
|
if not query:
|
|
query = QueryParams.details()
|
|
ids = map(str,ids)
|
|
query['id'] = ','.join(ids)
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
pubmed = ElementTree.parse(conn)
|
|
res = []
|
|
for summary in pubmed.getiterator('DocSum'):
|
|
rec = Records.SummaryRecord(summary)
|
|
if rec.PubMedId in ids:
|
|
res.append(rec)
|
|
ids.remove(rec.PubMedId)
|
|
|
|
return tuple(res)
|
|
|
|
def GetRecords(ids,query=None,url=QueryParams.fetchBase,conn=None):
|
|
""" gets a set of document summary records for the ids provided
|
|
|
|
>>> ids = ['11960484']
|
|
>>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
|
|
>>> len(recs)
|
|
1
|
|
>>> rec = recs[0]
|
|
>>> rec.PubMedId
|
|
'11960484'
|
|
>>> rec.Authors
|
|
u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
|
|
>>> rec.Title
|
|
u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
|
|
>>> rec.Source
|
|
u'J Med Chem'
|
|
>>> rec.Volume
|
|
'45'
|
|
>>> rec.Pages
|
|
'1737-40'
|
|
>>> rec.PubYear
|
|
'2002'
|
|
>>> rec.Abstract[:10]
|
|
u'P-glycopro'
|
|
|
|
We've also got access to keywords:
|
|
>>> str(rec.keywords[0])
|
|
'Combinatorial Chemistry Techniques'
|
|
>>> str(rec.keywords[3])
|
|
'Indinavir / chemistry'
|
|
|
|
and chemicals:
|
|
>>> rec.chemicals[0]
|
|
'P-Glycoprotein'
|
|
>>> rec.chemicals[2]
|
|
'Nicardipine <55985-32-5>'
|
|
|
|
|
|
"""
|
|
if not conn:
|
|
try:
|
|
iter(ids)
|
|
except TypeError:
|
|
ids = [ids,]
|
|
if not query:
|
|
query = QueryParams.details()
|
|
query['id'] = ','.join(map(str,ids))
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
|
|
pubmed = ElementTree.parse(conn)
|
|
res = []
|
|
for article in pubmed.getiterator('PubmedArticle'):
|
|
rec = Records.JournalArticleRecord(article)
|
|
if rec.PubMedId in ids:
|
|
res.append(rec)
|
|
return tuple(res)
|
|
|
|
def CheckForLinks(ids,query=None,url=QueryParams.linkBase,conn=None):
|
|
if not conn:
|
|
try:
|
|
iter(ids)
|
|
except TypeError:
|
|
ids = [ids,]
|
|
if not query:
|
|
query = QueryParams.details()
|
|
query['id'] = ','.join(map(str,ids))
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
query['cmd'] = 'ncheck'
|
|
pubmed = ElementTree.parse(conn)
|
|
|
|
checklist = pubmed.find('LinkSet/IdCheckList')
|
|
recs = [Records.LinkRecord(x) for x in checklist.getiterator('Id')]
|
|
|
|
res = {}
|
|
for rec in recs:
|
|
id = rec.PubMedId
|
|
res[id] = rec.HasNeighbor
|
|
return res
|
|
|
|
def GetLinks(ids,query=None,url=QueryParams.linkBase,conn=None):
|
|
if not conn:
|
|
try:
|
|
iter(ids)
|
|
except TypeError:
|
|
ids = [ids,]
|
|
if not query:
|
|
query = QueryParams.details()
|
|
query['id'] = ','.join(map(str,ids))
|
|
conn = urllib2.urlopen(url,urllib.urlencode(query))
|
|
query['cmd'] = 'neighbor'
|
|
|
|
pubmed = ElementTree.parse(conn)
|
|
linkset = pubmed.find('LinkSet/LinkSetDb')
|
|
scores = []
|
|
scoreNorm = 1.0
|
|
for link in linkset.getiterator('Link'):
|
|
id = link.findtext('Id')
|
|
score = float(link.findtext('Score'))
|
|
scores.append([id,score])
|
|
# we'll normalize scores by the score for the first of the query ids:
|
|
if id == ids[0]:
|
|
scoreNorm = score
|
|
for i in range(len(scores)):
|
|
id,score = scores[i]
|
|
scores[i] = id,score/scoreNorm
|
|
return tuple(scores)
|
|
|
|
|
|
#------------------------------------
|
|
#
|
|
# doctest boilerplate
|
|
#
|
|
def _test():
|
|
import doctest,sys
|
|
return doctest.testmod(sys.modules["__main__"])
|
|
|
|
if __name__ == '__main__':
|
|
import sys,os.path
|
|
testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data')
|
|
failed,tried = _test()
|
|
sys.exit(failed)
|
|
#query = QueryParams.details()
|
|
#query['term']='landrum ga'
|
|
#query['field']='auth'
|
|
#ids = GetSearchIds(query)
|
|
#print ids
|
|
#ids = ids[:2]
|
|
ids = ['11666868','11169640']
|
|
if 0:
|
|
summs = GetSummaries(ids,conn=open('summary.xml','r'))
|
|
print 'summs:',summs
|
|
for summary in summs:
|
|
print summary.Authors
|
|
print '\t',summary.Title
|
|
print '\t',summary.Source,
|
|
print summary.Volume,
|
|
print summary.Pages,
|
|
print summary.PubDate
|
|
|
|
if 0:
|
|
ids = ['11666868']
|
|
res = GetRecords(ids,conn=open('records.xml','r'))
|
|
for record in res:
|
|
print record.Authors
|
|
print '\t',record.Title
|
|
print '\t',record.Journal,
|
|
print record.Volume,
|
|
print record.Pages,
|
|
print record.PubYear
|
|
print
|
|
|
|
if 0:
|
|
ids = ['11666868','11169640']
|
|
res = CheckForLinks(ids,conn=open('haslinks.xml','r'))
|
|
print res
|
|
|
|
if 0:
|
|
ids = ['11666868']
|
|
res = GetLinks(ids,conn=open('links.xml','r'))
|
|
#res = GetLinks(ids)
|
|
for id,score in res[:10]:
|
|
print id,score
|
|
|