Files
rdkit/Projects/DbCLI/SearchDb.py
2014-06-25 04:34:19 +02:00

612 lines
24 KiB
Python

# $Id$
#
# Copyright (c) 2007-2013, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Greg Landrum, July 2007
#
from __future__ import print_function
_version = "0.14.0"
_usage="""
SearchDb [optional arguments] <sdfilename>
The sd filename argument can be either an SD file or an MDL mol
file.
NOTES:
- The property names may have been altered on loading the
database. Any non-alphanumeric character in a property name
will be replaced with '_'. e.g."Gold.Goldscore.Constraint.Score" becomes
"Gold_Goldscore_Constraint_Score".
- Property names are not case sensitive in the database.
"""
from rdkit import RDConfig
from rdkit.Dbase.DbConnection import DbConnect
from rdkit.RDLogger import logger
logger=logger()
import zlib
from rdkit import Chem
from rdkit.Chem.MolDb.FingerprintUtils import supportedSimilarityMethods,BuildSigFactory,DepickleFP,LayeredOptions
from rdkit.Chem.MolDb import FingerprintUtils
from rdkit import DataStructs
def _molFromPkl(pkl):
if isinstance(pkl,(bytes,str)):
mol = Chem.Mol(pkl)
else:
mol = Chem.Mol(str(pkl))
return mol
def GetNeighborLists(probes,topN,pool,
simMetric=DataStructs.DiceSimilarity,
simThresh=-1.,
silent=False,
**kwargs):
probeFps = [x[1] for x in probes]
validProbes = [x for x in range(len(probeFps)) if probeFps[x] is not None]
validFps=[probeFps[x] for x in validProbes]
from rdkit.DataStructs.TopNContainer import TopNContainer
if simThresh<=0:
nbrLists = [TopNContainer(topN) for x in range(len(probeFps))]
else:
nbrLists=[TopNContainer(-1) for x in range(len(probeFps))]
nDone=0
for nm,fp in pool:
nDone+=1
if not silent and not nDone%1000: logger.info(' searched %d rows'%nDone)
if(simMetric==DataStructs.DiceSimilarity):
scores = DataStructs.BulkDiceSimilarity(fp,validFps)
for i,score in enumerate(scores):
if score>simThresh:
nbrLists[validProbes[i]].Insert(score,nm)
elif(simMetric==DataStructs.TanimotoSimilarity):
scores = DataStructs.BulkTanimotoSimilarity(fp,validFps)
for i,score in enumerate(scores):
if score>simThresh:
nbrLists[validProbes[i]].Insert(score,nm)
elif(simMetric==DataStructs.TverskySimilarity):
av = float(kwargs.get('tverskyA',0.5))
bv = float(kwargs.get('tverskyB',0.5))
scores = DataStructs.BulkTverskySimilarity(fp,validFps,av,bv)
for i,score in enumerate(scores):
if score>simThresh:
nbrLists[validProbes[i]].Insert(score,nm)
else:
for i in range(len(probeFps)):
pfp = probeFps[i]
if pfp is not None:
score = simMetric(probeFps[i],fp)
if score>simThresh:
nbrLists[validProbes[i]].Insert(score,nm)
return nbrLists
def GetMolsFromSmilesFile(dataFilename,errFile,nameProp):
dataFile=open(dataFilename,'r')
for idx,line in enumerate(dataFile):
try:
smi,nm = line.strip().split(' ')
except:
continue
try:
m = Chem.MolFromSmiles(smi)
except:
m=None
if not m:
if errfile:
print(idx,nm,smi,file=errfile)
continue
yield (nm,smi,m)
def GetMolsFromSDFile(dataFilename,errFile,nameProp):
suppl = Chem.SDMolSupplier(dataFilename)
for idx,m in enumerate(suppl):
if not m:
if errFile:
if hasattr(suppl,'GetItemText'):
d = suppl.GetItemText(idx)
errFile.write(d)
else:
logger.warning('full error file support not complete')
continue
smi = Chem.MolToSmiles(m,True)
if m.HasProp(nameProp):
nm = m.GetProp(nameProp)
if not nm:
logger.warning('molecule found with empty name property')
else:
nm = 'Mol_%d'%(idx+1)
yield nm,smi,m
def RunSearch(options,queryFilename):
global sigFactory
if options.similarityType=='AtomPairs':
fpBuilder=FingerprintUtils.BuildAtomPairFP
simMetric=DataStructs.DiceSimilarity
dbName = os.path.join(options.dbDir,options.pairDbName)
fpTableName = options.pairTableName
fpColName = options.pairColName
elif options.similarityType=='TopologicalTorsions':
fpBuilder=FingerprintUtils.BuildTorsionsFP
simMetric=DataStructs.DiceSimilarity
dbName = os.path.join(options.dbDir,options.torsionsDbName)
fpTableName = options.torsionsTableName
fpColName = options.torsionsColName
elif options.similarityType=='RDK':
fpBuilder=FingerprintUtils.BuildRDKitFP
simMetric=DataStructs.FingerprintSimilarity
dbName = os.path.join(options.dbDir,options.fpDbName)
fpTableName = options.fpTableName
if not options.fpColName:
options.fpColName='rdkfp'
fpColName = options.fpColName
elif options.similarityType=='Pharm2D':
fpBuilder=FingerprintUtils.BuildPharm2DFP
simMetric=DataStructs.DiceSimilarity
dbName = os.path.join(options.dbDir,options.fpDbName)
fpTableName = options.pharm2DTableName
if not options.fpColName:
options.fpColName='pharm2dfp'
fpColName = options.fpColName
FingerprintUtils.sigFactory = BuildSigFactory(options)
elif options.similarityType=='Gobbi2D':
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
fpBuilder=FingerprintUtils.BuildPharm2DFP
simMetric=DataStructs.TanimotoSimilarity
dbName = os.path.join(options.dbDir,options.fpDbName)
fpTableName = options.gobbi2DTableName
if not options.fpColName:
options.fpColName='gobbi2dfp'
fpColName = options.fpColName
FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
elif options.similarityType=='Morgan':
fpBuilder=FingerprintUtils.BuildMorganFP
simMetric=DataStructs.DiceSimilarity
dbName = os.path.join(options.dbDir,options.morganFpDbName)
fpTableName = options.morganFpTableName
fpColName = options.morganFpColName
extraArgs={}
if options.similarityMetric=='tanimoto':
simMetric = DataStructs.TanimotoSimilarity
elif options.similarityMetric=='dice':
simMetric = DataStructs.DiceSimilarity
elif options.similarityMetric=='tversky':
simMetric = DataStructs.TverskySimilarity
extraArgs['tverskyA']=options.tverskyA
extraArgs['tverskyB']=options.tverskyB
if options.smilesQuery:
mol=Chem.MolFromSmiles(options.smilesQuery)
if not mol:
logger.error('could not build query molecule from smiles "%s"'%options.smilesQuery)
sys.exit(-1)
options.queryMol = mol
elif options.smartsQuery:
mol=Chem.MolFromSmarts(options.smartsQuery)
if not mol:
logger.error('could not build query molecule from smarts "%s"'%options.smartsQuery)
sys.exit(-1)
options.queryMol = mol
if options.outF=='-':
outF=sys.stdout
elif options.outF=='':
outF=None
else:
outF = open(options.outF,'w+')
molsOut=False
if options.sdfOut:
molsOut=True
if options.sdfOut=='-':
sdfOut=sys.stdout
else:
sdfOut = open(options.sdfOut,'w+')
else:
sdfOut=None
if options.smilesOut:
molsOut=True
if options.smilesOut=='-':
smilesOut=sys.stdout
else:
smilesOut = open(options.smilesOut,'w+')
else:
smilesOut=None
if queryFilename:
try:
tmpF = open(queryFilename,'r')
except IOError:
logger.error('could not open query file %s'%queryFilename)
sys.exit(1)
if options.molFormat=='smiles':
func=GetMolsFromSmilesFile
elif options.molFormat=='sdf':
func=GetMolsFromSDFile
if not options.silent:
msg='Reading query molecules'
if fpBuilder: msg+=' and generating fingerprints'
logger.info(msg)
probes=[]
i=0
nms=[]
for nm,smi,mol in func(queryFilename,None,options.nameProp):
i+=1
nms.append(nm)
if not mol:
logger.error('query molecule %d could not be built'%(i))
probes.append((None,None))
continue
if fpBuilder:
probes.append((mol,fpBuilder(mol)))
else:
probes.append((mol,None))
if not options.silent and not i%1000:
logger.info(" done %d"%i)
else:
probes=None
conn=None
idName = options.molIdName
ids=None
names=None
molDbName = os.path.join(options.dbDir,options.molDbName)
molIdName = options.molIdName
mConn = DbConnect(molDbName)
cns = [(x.lower(),y) for x,y in mConn.GetColumnNamesAndTypes('molecules')]
idCol,idTyp=cns[0]
if options.propQuery or options.queryMol:
conn = DbConnect(molDbName)
curs = conn.GetCursor()
if options.queryMol:
if not options.silent: logger.info('Doing substructure query')
if options.propQuery:
where='where %s'%options.propQuery
else:
where=''
if not options.silent:
curs.execute('select count(*) from molecules %(where)s'%locals())
nToDo = curs.fetchone()[0]
join=''
doSubstructFPs=False
fpDbName = os.path.join(options.dbDir,options.fpDbName)
if os.path.exists(fpDbName) and not options.negateQuery :
curs.execute("attach database '%s' as fpdb"%(fpDbName))
try:
curs.execute('select * from fpdb.%s limit 1'%options.layeredTableName)
except:
pass
else:
doSubstructFPs=True
join = 'join fpdb.%s using (%s)'%(options.layeredTableName,idCol)
query = LayeredOptions.GetQueryText(options.queryMol)
if query:
if not where:
where='where'
else:
where += ' and'
where += ' '+query
cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s'%locals()
curs.execute(cmd)
row=curs.fetchone()
nDone=0
ids=[]
while row:
id,molpkl = row
if not options.zipMols:
m = _molFromPkl(molpkl)
else:
m = Chem.Mol(zlib.decompress(molpkl))
matched=m.HasSubstructMatch(options.queryMol)
if options.negateQuery:
matched = not matched
if matched:
ids.append(id)
nDone+=1
if not options.silent and not nDone%500:
if not doSubstructFPs:
logger.info(' searched %d (of %d) molecules; %d hits so far'%(nDone,nToDo,len(ids)))
else:
logger.info(' searched through %d molecules; %d hits so far'%(nDone,len(ids)))
row=curs.fetchone()
if not options.silent and doSubstructFPs and nToDo:
nFiltered = nToDo-nDone
logger.info(' Fingerprint screenout rate: %d of %d (%%%.2f)'%(nFiltered,nToDo,100.*nFiltered/nToDo))
elif options.propQuery:
if not options.silent: logger.info('Doing property query')
propQuery=options.propQuery.split(';')[0]
curs.execute('select %(idCol)s from molecules where %(propQuery)s'%locals())
ids = [x[0] for x in curs.fetchall()]
if not options.silent:
logger.info('Found %d molecules matching the query'%(len(ids)))
t1=time.time()
if probes:
if not options.silent: logger.info('Finding Neighbors')
conn = DbConnect(dbName)
cns = conn.GetColumnNames(fpTableName)
curs = conn.GetCursor()
if ids:
ids = [(x,) for x in ids]
curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
curs.executemany('insert into _tmpTbl values (?)',ids)
join='join _tmpTbl using (%(idCol)s)'%locals()
else:
join=''
if cns[0].lower() != idCol.lower():
# backwards compatibility to the days when mol tables had a guid and
# the fps tables did not:
curs.execute("attach database '%(molDbName)s' as mols"%locals())
curs.execute("""
select %(idCol)s,%(fpColName)s from %(fpTableName)s join
(select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)
using (%(molIdName)s)
"""%(locals()))
else:
curs.execute('select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'%locals())
def poolFromCurs(curs,similarityMethod):
row = curs.fetchone()
while row:
id,pkl = row
fp = DepickleFP(pkl,similarityMethod)
yield (id,fp)
row = curs.fetchone()
topNLists = GetNeighborLists(probes,options.topN,poolFromCurs(curs,options.similarityType),
simMetric=simMetric,simThresh=options.simThresh,**extraArgs)
uniqIds=set()
nbrLists = {}
for i,nm in enumerate(nms):
topNLists[i].reverse()
scores=topNLists[i].GetPts()
nbrNames = topNLists[i].GetExtras()
nbrs = []
for j,nbrGuid in enumerate(nbrNames):
if nbrGuid is None:
break
else:
uniqIds.add(nbrGuid)
nbrs.append((nbrGuid,scores[j]))
nbrLists[(i,nm)] = nbrs
t2=time.time()
if not options.silent: logger.info('The search took %.1f seconds'%(t2-t1))
if not options.silent: logger.info('Creating output')
curs = mConn.GetCursor()
ids = list(uniqIds)
ids = [(x,) for x in ids]
curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
curs.executemany('insert into _tmpTbl values (?)',ids)
curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'%locals())
nmDict={}
for guid,id in curs.fetchall():
nmDict[guid]=str(id)
ks = list(nbrLists.keys())
ks.sort()
if not options.transpose:
for i,nm in ks:
nbrs= nbrLists[(i,nm)]
nbrTxt=options.outputDelim.join([nm]+['%s%s%.3f'%(nmDict[id],options.outputDelim,score) for id,score in nbrs])
if outF: print(nbrTxt,file=outF)
else:
labels = ['%s%sSimilarity'%(x[1],options.outputDelim) for x in ks]
if outF: print(options.outputDelim.join(labels),file=outF)
for i in range(options.topN):
outL = []
for idx,nm in ks:
nbr = nbrLists[(idx,nm)][i]
outL.append(nmDict[nbr[0]])
outL.append('%.3f'%nbr[1])
if outF: print(options.outputDelim.join(outL),file=outF)
else:
if not options.silent: logger.info('Creating output')
curs = mConn.GetCursor()
ids = [(x,) for x in set(ids)]
curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
curs.executemany('insert into _tmpTbl values (?)',ids)
molIdName = options.molIdName
curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'%locals())
nmDict={}
for guid,id in curs.fetchall():
nmDict[guid]=str(id)
if outF: print('\n'.join(nmDict.values()),file=outF)
if molsOut and ids:
molDbName = os.path.join(options.dbDir,options.molDbName)
cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
if cns[-1]!='molpkl':
cns.remove('molpkl')
cns.append('molpkl')
curs = mConn.GetCursor()
#curs.execute('create temporary table _tmpTbl (guid integer)'%locals())
#curs.executemany('insert into _tmpTbl values (?)',ids)
cnText=','.join(cns)
curs.execute('select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)'%locals())
row=curs.fetchone()
molD = {}
while row:
row = list(row)
m = _molFromPkl(row[-1])
guid = row[0]
nm = nmDict[guid]
if sdfOut:
m.SetProp('_Name',nm)
print(Chem.MolToMolBlock(m),file=sdfOut)
for i in range(1,len(cns)-1):
pn = cns[i]
pv = str(row[i])
print >>sdfOut,'> <%s>\n%s\n'%(pn,pv)
print('$$$$',file=sdfOut)
if smilesOut:
smi=Chem.MolToSmiles(m,options.chiralSmiles)
if smilesOut:
print('%s %s'%(smi,str(row[1])),file=smilesOut)
row=curs.fetchone()
if not options.silent: logger.info('Done!')
# ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
import os
from optparse import OptionParser
parser=OptionParser(_usage,version='%prog '+_version)
parser.add_option('--dbDir',default='/db/camm/CURRENT/rdk_db',
help='name of the directory containing the database information. The default is %default')
parser.add_option('--molDbName',default='Compounds.sqlt',
help='name of the molecule database')
parser.add_option('--molIdName',default='compound_id',
help='name of the database key column')
parser.add_option('--regName',default='molecules',
help='name of the molecular registry table')
parser.add_option('--pairDbName',default='AtomPairs.sqlt',
help='name of the atom pairs database')
parser.add_option('--pairTableName',default='atompairs',
help='name of the atom pairs table')
parser.add_option('--pairColName',default='atompairfp',
help='name of the atom pair column')
parser.add_option('--torsionsDbName',default='AtomPairs.sqlt',
help='name of the topological torsions database (usually the same as the atom pairs database)')
parser.add_option('--torsionsTableName',default='atompairs',
help='name of the topological torsions table (usually the same as the atom pairs table)')
parser.add_option('--torsionsColName',default='torsionfp',
help='name of the atom pair column')
parser.add_option('--fpDbName',default='Fingerprints.sqlt',
help='name of the 2D fingerprints database')
parser.add_option('--fpTableName',default='rdkitfps',
help='name of the 2D fingerprints table')
parser.add_option('--layeredTableName',default='layeredfps',
help='name of the layered fingerprints table')
parser.add_option('--fpColName',default='',
help='name of the 2D fingerprint column, a sensible default is used')
parser.add_option('--descrDbName',default='Descriptors.sqlt',
help='name of the descriptor database')
parser.add_option('--descrTableName',default='descriptors_v1',
help='name of the descriptor table')
parser.add_option('--descriptorCalcFilename',default=os.path.join(RDConfig.RDBaseDir,'Projects',
'DbCLI','moe_like.dsc'),
help='name of the file containing the descriptor calculator')
parser.add_option('--outputDelim',default=',',
help='the delimiter for the output file. The default is %default')
parser.add_option('--topN',default=20,type='int',
help='the number of neighbors to keep for each query compound. The default is %default')
parser.add_option('--outF','--outFile',default='-',
help='The name of the output file. The default is the console (stdout).')
parser.add_option('--transpose',default=False,action="store_true",
help='print the results out in a transposed form: e.g. neighbors in rows and probe compounds in columns')
parser.add_option('--molFormat',default='sdf',choices=('smiles','sdf'),
help='specify the format of the input file')
parser.add_option('--nameProp',default='_Name',
help='specify the SD property to be used for the molecule names. Default is to use the mol block name')
parser.add_option('--smartsQuery','--smarts','--sma',default='',
help='provide a SMARTS to be used as a substructure query')
parser.add_option('--smilesQuery','--smiles','--smi',default='',
help='provide a SMILES to be used as a substructure query')
parser.add_option('--negateQuery','--negate',default=False,action='store_true',
help='negate the results of the smarts query.')
parser.add_option('--propQuery','--query','-q',default='',
help='provide a property query (see the NOTE about property names)')
parser.add_option('--sdfOut','--sdOut',default='',
help='export an SD file with the matching molecules')
parser.add_option('--smilesOut','--smiOut',default='',
help='export a smiles file with the matching molecules')
parser.add_option('--nonchiralSmiles',dest='chiralSmiles',default=True,action='store_false',
help='do not use chiral SMILES in the output')
parser.add_option('--silent',default=False,action='store_true',
help='Do not generate status messages.')
parser.add_option('--zipMols','--zip',default=False,action='store_true',
help='read compressed mols from the database')
parser.add_option('--pharm2DTableName',default='pharm2dfps',
help='name of the Pharm2D fingerprints table')
parser.add_option('--fdefFile','--fdef',
default=os.path.join(RDConfig.RDDataDir,'Novartis1.fdef'),
help='provide the name of the fdef file to use for 2d pharmacophores')
parser.add_option('--gobbi2DTableName',default='gobbi2dfps',
help='name of the Gobbi2D fingerprints table')
parser.add_option('--similarityType','--simType','--sim',
default='RDK',choices=supportedSimilarityMethods,
help='Choose the type of similarity to use, possible values: RDK, AtomPairs, TopologicalTorsions, Pharm2D, Gobbi2D, Avalon, Morgan. The default is %default')
parser.add_option('--morganFpDbName',default='Fingerprints.sqlt',
help='name of the morgan fingerprints database')
parser.add_option('--morganFpTableName',default='morganfps',
help='name of the morgan fingerprints table')
parser.add_option('--morganFpColName',default='morganfp',
help='name of the morgan fingerprint column')
parser.add_option('--similarityMetric','--simMetric','--metric',
default='',choices=('tanimoto','dice','tversky',''),
help='Choose the type of similarity to use, possible values: tanimoto, dice, tversky. The default is determined by the fingerprint type')
parser.add_option('--tverskyA',default=0.5,type='float',
help='Tversky A value')
parser.add_option('--tverskyB',default=0.5,type='float',
help='Tversky B value')
parser.add_option('--simThresh',default=-1,type='float',
help='threshold to use for similarity searching. If provided, this supersedes the topN argument')
if __name__=='__main__':
import sys,getopt,time
options,args = parser.parse_args()
if len(args)!=1 and not (options.smilesQuery or options.smartsQuery or options.propQuery):
parser.error('please either provide a query filename argument or do a data or smarts query')
if len(args):
queryFilename=args[0]
else:
queryFilename=None
options.queryMol=None
RunSearch(options,queryFilename)