# rdkit/Python/Dbase/Pubmed/Searches.py

# $Id$
#
# Copyright (C) 2003-2006 Rational Discovery LLC
#
#   @@ All Rights Reserved  @@
#
""" Tools for doing PubMed searches and processing the results

NOTE: much of the example code in the documentation here uses XML
files from the test_data directory in order to avoid having to call
out to PubMed itself.  Actual calls to the functions would not include
the _conn_ argument.

"""
import RDConfig
import QueryParams,Records
import urllib,urllib2
from elementtree import ElementTree

def GetNumHits(query,url=QueryParams.searchBase):
  """ returns the number of PubMed hits (an integer) for the query provided

  The request is sent from a copy of the query that has 'rettype' set to
  'count'; the query object passed in is left untouched so that it can be
  reused for a regular search afterwards.

  Arguments:
    - query: a mapping of search parameters, such as the one returned by
      QueryParams.details()
    - url: the URL the search request is posted to

  Returns: the text of the <Count> element of the reply converted to an
    integer, or 0 if the reply has no (or an empty) <Count> element.

  To do a search, we need a query object:
  >>> query = QueryParams.details()

  set up the search parameters:
  >>> query['term'] = 'penzotti je AND grootenhuis pd'
  >>> query['field'] = 'auth'

  now get the search ids:
  >>> counts = GetNumHits(query)
  >>> counts
  2

  alternately, we can search using field specifiers:
  >>> query = QueryParams.details()
  >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
  >>> counts = GetNumHits(query)
  >>> counts
  3


  """
  # work on a copy: leaving rettype='count' in the caller's query would
  # turn every later search done with the same object into a count request
  params = dict(query)
  params['rettype']='count'
  conn = urllib2.urlopen(url,urllib.urlencode(params))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # the reply has been read completely (or parsing failed), either
    # way the connection is no longer needed
    conn.close()
  countText = pubmed.findtext('Count')
  if countText:
    return int(countText)
  return 0


def GetSearchIds(query,url=QueryParams.searchBase):
  """ returns a tuple of pubmed ids (strings) for the query provided

  Arguments:
    - query: a mapping of search parameters, such as the one returned by
      QueryParams.details()
    - url: the URL the search request is posted to

  Returns: a tuple with the text of every <Id> element found in the
    reply, in document order.

  To do a search, we need a query object:
  >>> query = QueryParams.details()

  set up the search parameters:
  >>> query['term'] = 'penzotti je AND grootenhuis pd'
  >>> query['field'] = 'auth'

  now get the search ids:
  >>> ids = GetSearchIds(query)
  >>> len(ids)
  2
  >>> ids[0]
  '11960484'
  >>> ids[1]
  '10893315'


  """
  conn = urllib2.urlopen(url,urllib.urlencode(query))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # parse() has consumed the reply; release the connection
    conn.close()
  return tuple([node.text for node in pubmed.getiterator('Id')])

def GetSummaries(ids,query=None,url=QueryParams.summaryBase,conn=None):
  """ gets a set of document summary records for the ids provided

  Arguments:
    - ids: a single PubMed id or a sequence of them (strings or anything
      that str() converts to the id text).  The argument itself is never
      modified.
    - query: (optional) mapping of request parameters; if it is not
      provided QueryParams.details() is used.  Only consulted when a
      request is actually sent.
    - url: the URL the request is posted to
    - conn: (optional) a file-like object holding the XML reply.  If it
      is provided no request is sent and the object is not closed here.

  Returns: a tuple of Records.SummaryRecord objects, in the order in which
    they occur in the reply.  Only records whose PubMedId was asked for
    are returned, and each requested id is matched at most once.

  >>> ids = ['11960484']
  >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
  >>> len(summs)
  1
  >>> rec = summs[0]
  >>> isinstance(rec,Records.SummaryRecord)
  1
  >>> rec.PubMedId
  '11960484'
  >>> rec.Authors
  'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
  >>> rec.Title
  'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
  >>> rec.Source
  'J Med Chem'
  >>> rec.Volume
  '45'
  >>> rec.Pages
  '1737-40'
  >>> rec.HasAbstract
  '1'

  """
  # a lone id is wrapped in a list.  Strings are checked for explicitly
  # because iter() succeeds on them and would split the id into characters
  if isinstance(ids,(str,type(u''))):
    ids = [ids,]
  else:
    try:
      iter(ids)
    except TypeError:
      ids = [ids,]
  # private list of strings: matched ids are removed from it below and
  # that must not leak into the sequence owned by the caller
  pending = [str(x) for x in ids]

  ownsConn = not conn
  if ownsConn:
    if not query:
      params = QueryParams.details()
    else:
      # copy so that the caller's query does not pick up the 'id' entry
      params = dict(query)
    params['id'] = ','.join(pending)
    conn = urllib2.urlopen(url,urllib.urlencode(params))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # only close what was opened here; a conn passed in belongs to the caller
    if ownsConn:
      conn.close()

  res = []
  for summary in pubmed.getiterator('DocSum'):
    rec = Records.SummaryRecord(summary)
    if rec.PubMedId in pending:
      res.append(rec)
      pending.remove(rec.PubMedId)

  return tuple(res)

def GetRecords(ids,query=None,url=QueryParams.fetchBase,conn=None):
  """ gets a set of journal article records for the ids provided

  Arguments:
    - ids: a single PubMed id or a sequence of them (strings or anything
      that str() converts to the id text)
    - query: (optional) mapping of request parameters; if it is not
      provided QueryParams.details() is used.  Only consulted when a
      request is actually sent.
    - url: the URL the request is posted to
    - conn: (optional) a file-like object holding the XML reply.  If it
      is provided no request is sent and the object is not closed here.

  Returns: a tuple of Records.JournalArticleRecord objects, one for every
    <PubmedArticle> in the reply whose PubMedId is among the ids asked
    for, in the order in which they occur in the reply.

  >>> ids = ['11960484']
  >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
  >>> len(recs)
  1
  >>> rec = recs[0]
  >>> rec.PubMedId
  '11960484'
  >>> rec.Authors
  u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
  >>> rec.Title
  u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
  >>> rec.Source
  u'J Med Chem'
  >>> rec.Volume
  '45'
  >>> rec.Pages
  '1737-40'
  >>> rec.PubYear
  '2002'
  >>> rec.Abstract[:10]
  u'P-glycopro'

  We've also got access to keywords:
  >>> str(rec.keywords[0])
  'Combinatorial Chemistry Techniques'
  >>> str(rec.keywords[3])
  'Indinavir / chemistry'

  and chemicals:
  >>> rec.chemicals[0]
  'P-Glycoprotein'
  >>> rec.chemicals[2]
  'Nicardipine <55985-32-5>'


  """
  # a lone id is wrapped in a list.  Strings are checked for explicitly
  # because iter() succeeds on them and would split the id into characters
  if isinstance(ids,(str,type(u''))):
    ids = [ids,]
  else:
    try:
      iter(ids)
    except TypeError:
      ids = [ids,]
  # the membership test below compares against rec.PubMedId, so the ids
  # have to be strings no matter whether a request is sent or not
  wanted = [str(x) for x in ids]

  ownsConn = not conn
  if ownsConn:
    if not query:
      params = QueryParams.details()
    else:
      # copy so that the caller's query does not pick up the 'id' entry
      params = dict(query)
    params['id'] = ','.join(wanted)
    conn = urllib2.urlopen(url,urllib.urlencode(params))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # only close what was opened here; a conn passed in belongs to the caller
    if ownsConn:
      conn.close()

  res = []
  for article in pubmed.getiterator('PubmedArticle'):
    rec = Records.JournalArticleRecord(article)
    if rec.PubMedId in wanted:
      res.append(rec)
  return tuple(res)

def CheckForLinks(ids,query=None,url=QueryParams.linkBase,conn=None):
  """ checks which of the ids provided have neighbors (links) in PubMed

  Arguments:
    - ids: a single PubMed id or a sequence of them (strings or anything
      that str() converts to the id text).  Only used to build the
      request, i.e. ignored when conn is provided.
    - query: (optional) mapping of request parameters; if it is not
      provided QueryParams.details() is used.  Only consulted when a
      request is actually sent.
    - url: the URL the request is posted to
    - conn: (optional) a file-like object holding the XML reply.  If it
      is provided no request is sent and the object is not closed here.

  Returns: a dictionary that maps the PubMedId of every <Id> entry in the
    reply's LinkSet/IdCheckList to the HasNeighbor value of the
    corresponding Records.LinkRecord.  The dictionary is empty if the
    reply contains no IdCheckList.

  """
  ownsConn = not conn
  if ownsConn:
    # a lone id is wrapped in a list.  Strings are checked for explicitly
    # because iter() succeeds on them and would split the id into characters
    if isinstance(ids,(str,type(u''))):
      ids = [ids,]
    else:
      try:
        iter(ids)
      except TypeError:
        ids = [ids,]
    if not query:
      params = QueryParams.details()
    else:
      # copy so that the caller's query does not pick up 'id' and 'cmd'
      params = dict(query)
    params['id'] = ','.join([str(x) for x in ids])
    # the command has to be in place *before* the request is sent,
    # otherwise the server never sees it
    params['cmd'] = 'ncheck'
    conn = urllib2.urlopen(url,urllib.urlencode(params))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # only close what was opened here; a conn passed in belongs to the caller
    if ownsConn:
      conn.close()

  checklist = pubmed.find('LinkSet/IdCheckList')
  # compare with None explicitly: an element without children is false
  if checklist is None:
    return {}

  res = {}
  for node in checklist.getiterator('Id'):
    rec = Records.LinkRecord(node)
    res[rec.PubMedId] = rec.HasNeighbor
  return res

def GetLinks(ids,query=None,url=QueryParams.linkBase,conn=None):
  """ gets the PubMed neighbors (links) for the ids provided

  Arguments:
    - ids: a single PubMed id or a sequence of them (strings or anything
      that str() converts to the id text).  The first id is the one used
      to normalize the scores.
    - query: (optional) mapping of request parameters; if it is not
      provided QueryParams.details() is used.  Only consulted when a
      request is actually sent.
    - url: the URL the request is posted to
    - conn: (optional) a file-like object holding the XML reply.  If it
      is provided no request is sent and the object is not closed here.

  Returns: a tuple of (id, score) 2-tuples, one for every <Link> in the
    reply's LinkSet/LinkSetDb, in document order.  Every score is divided
    by the score of the link whose Id equals the first query id; if no
    such link exists the scores are returned as they are.  The tuple is
    empty if the reply contains no LinkSetDb.

  """
  # a lone id is wrapped in a list.  Strings are checked for explicitly
  # because iter() succeeds on them and would split the id into characters
  if isinstance(ids,(str,type(u''))):
    ids = [ids,]
  else:
    try:
      iter(ids)
    except TypeError:
      ids = [ids,]
  # link ids come out of the XML as text, so the query ids have to be
  # strings for the normalization lookup below to find anything
  ids = [str(x) for x in ids]

  ownsConn = not conn
  if ownsConn:
    if not query:
      params = QueryParams.details()
    else:
      # copy so that the caller's query does not pick up 'id' and 'cmd'
      params = dict(query)
    params['id'] = ','.join(ids)
    # the command has to be in place *before* the request is sent,
    # otherwise the server never sees it
    params['cmd'] = 'neighbor'
    conn = urllib2.urlopen(url,urllib.urlencode(params))
  try:
    pubmed = ElementTree.parse(conn)
  finally:
    # only close what was opened here; a conn passed in belongs to the caller
    if ownsConn:
      conn.close()

  linkset = pubmed.find('LinkSet/LinkSetDb')
  # compare with None explicitly: an element without children is false
  if linkset is None:
    return ()

  firstId = None
  if ids:
    firstId = ids[0]

  scores = []
  scoreNorm = 1.0
  for link in linkset.getiterator('Link'):
    linkId = link.findtext('Id')
    score = float(link.findtext('Score'))
    scores.append((linkId,score))
    # we'll normalize scores by the score for the first of the query ids:
    if linkId == firstId:
      scoreNorm = score
  return tuple([(linkId,score/scoreNorm) for linkId,score in scores])


#------------------------------------
#
#  doctest boilerplate
#
def _test():
  import doctest,sys
  return doctest.testmod(sys.modules["__main__"])

# Script entry point: runs the module's doctests and exits with the number
# of failed examples as the process status.
if __name__ == '__main__':
  # os and testDataDir become module-level names here; the doctests of
  # GetSummaries and GetRecords refer to both of them
  import sys,os.path
  testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data')
  failed,tried = _test()
  sys.exit(failed)
  # NOTE: sys.exit() above ends the script, so nothing below this line is
  # ever executed.  What follows are manual usage examples, each of them
  # additionally switched off with "if 0:".
  #query = QueryParams.details()
  #query['term']='landrum ga'
  #query['field']='auth'
  #ids = GetSearchIds(query)
  #print ids
  #ids = ids[:2]
  ids = ['11666868','11169640']
  # example: print the summary records read from a local XML file
  if 0:
    summs = GetSummaries(ids,conn=open('summary.xml','r'))
    print 'summs:',summs
    for summary in summs:
      print summary.Authors
      print '\t',summary.Title
      print '\t',summary.Source,
      print summary.Volume,
      print summary.Pages,
      print summary.PubDate

  # example: print the full article records read from a local XML file
  if 0:
    ids = ['11666868']
    res = GetRecords(ids,conn=open('records.xml','r'))
    for record in res:
      print record.Authors
      print '\t',record.Title
      print '\t',record.Journal,
      print record.Volume,
      print record.Pages,
      print record.PubYear
      print

  # example: print the id -> HasNeighbor dictionary for two ids
  if 0:
    ids = ['11666868','11169640']
    res = CheckForLinks(ids,conn=open('haslinks.xml','r'))
    print res

  # example: print the first ten (id, normalized score) links
  if 0:
    ids = ['11666868']
    res = GetLinks(ids,conn=open('links.xml','r'))
    #res = GetLinks(ids)
    for id,score in res[:10]:
      print id,score