mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
482 lines
14 KiB
Python
Executable File
482 lines
14 KiB
Python
Executable File
#
|
|
# Copyright (C) 2001,2002 greg Landrum and Rational Discovery LLC
|
|
#
|
|
""" descriptor calculator for compounds defined by a composition alone
|
|
(only the composition is required)
|
|
|
|
"""
|
|
import RDConfig
|
|
from utils import chemutils
|
|
import os
|
|
from Dbase.DbConnection import DbConnect
|
|
from ML.Descriptors import Parser,Descriptors
|
|
|
|
import string
|
|
|
|
# Known schemes for counting valence electrons, as (name, description) pairs.
countOptions = [
    ('NVAL',
     'total number of valence electrons'),
    ('NVAL_NO_FULL_F',
     'number of valence electrons neglecting filled f shells'),
    ('NVAL_NO_FULL_D',
     'number of valence electrons neglecting filled d shells'),
    ('NVAL_NO_FULL',
     'number of valence electrons neglecting filled f and d shells'),
]
|
|
|
|
def GetAllDescriptorNames(db, tbl1, tbl2, user='sysdba', password='masterkey'):
    """ gets possible descriptor names from a database

    **Arguments**

      - db: the name of the database to use

      - tbl1: the name of the table to be used for reading descriptor values

      - tbl2: the name of the table to be used for reading notes about the
        descriptors (*descriptions of the descriptors if you like*)

      - user: the user name for DB access

      - password: the password for DB access

    **Returns**

      a 2-tuple containing:

        1) a list of column names

        2) a list of column descriptors, each a 2-tuple of
           (upper-cased property name, notes)

    **Notes**

      - this uses _Dbase.DbConnection.DbConnect_ for querying the database

      - it is assumed that tbl2 includes 'property' and 'notes' columns

      - the entries of _countOptions_ are appended to both returned lists

    """
    conn = DbConnect(db, user=user, password=password)

    # Build real lists up front: the results are extended below, which a lazy
    # map object (Python 3) or a tuple returned by the connection cannot do.
    colNames = list(conn.GetColumnNames(table=tbl1))
    colDesc = [(row[0].upper(), row[1])
               for row in conn.GetColumns('property,notes', table=tbl2)]
    for name, desc in countOptions:
        colNames.append(name)
        colDesc.append((name, desc))
    return colNames, colDesc
|
|
|
|
class CompoundDescriptorCalculator(Descriptors.DescriptorCalculator):
    """ used for calculating descriptors

    This is the central point for descriptor calculation

    **Notes**

      - There are two kinds of descriptors this cares about:

        1) *Simple Descriptors* can be calculated solely using atomic
           descriptor values and the composition of the compound. The full
           list of possible simple descriptors is determined by the types of
           *Calculator Methods* (see below) and the contents of an atomic
           database.

           Simple Descriptors can be marked as *nonZeroDescriptors*. These
           are used to winnow out atom types where particular atomic
           descriptors are zero (usually indicating that the value is
           unknown).

           Simple Descriptors are maintained locally in the _simpleList_

        2) *Compound Descriptors* may rely upon more complicated computation
           schemes and descriptors for the compound as a whole (e.g.
           structural variables, etc.). The full list of compound descriptors
           is limitless. They are calculated using the
           _ML.Descriptors.Parser_ module.

           Compound Descriptors are maintained locally in the _compoundList_

      - This class has some special methods which are labelled as
        *Calculator Method*. These take atomic descriptors and reduce them
        to a single simple descriptor value for a composition. They are
        primarily intended for internal use.

      - a *composition vector* is a list of 2-tuples:
        '[(atom1name,atom1Num),...]' where atom1Num is the contribution of
        the atom to the stoichiometry of the compound. No assumption is made
        about the stoichiometries (i.e. they don't have to be either integral
        or all sum to one).

    """
|
|
|
|
#------------
|
|
# methods used to calculate descriptors
|
|
#------------
|
|
|
|
def SUM(self, desc, compos):
    """ *Calculator Method*

    adds up the descriptor values across the composition, each value being
    weighted by the amount of its atom

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector

    **Returns**

      a float

    """
    atomData = self.atomDict
    total = 0.0
    for atomName, amount in compos:
        total += atomData[atomName][desc] * amount
    return total
|
|
def MEAN(self, desc, compos):
    """ *Calculator Method*

    stoichiometry-weighted average of the descriptor values across the
    composition

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector

    **Returns**

      a float

    """
    atomData = self.atomDict
    weightedSum = 0.0
    totalAmount = 0.0
    for atomName, amount in compos:
        weightedSum += atomData[atomName][desc] * amount
        totalAmount += amount
    return weightedSum / totalAmount
|
|
def DEV(self, desc, compos):
    """ *Calculator Method*

    stoichiometry-weighted average of the absolute deviations of the
    descriptor values from their mean (as given by _MEAN_)

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector

    **Returns**

      a float

    """
    center = self.MEAN(desc, compos)
    atomData = self.atomDict
    weightedDev = 0.0
    totalAmount = 0.0
    for atomName, amount in compos:
        weightedDev += abs(atomData[atomName][desc] - center) * amount
        totalAmount += amount
    return weightedDev / totalAmount
|
|
def MIN(self, desc, compos):
    """ *Calculator Method*

    smallest descriptor value found among the atoms of the composition
    (the amounts of the atoms are not taken into account)

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector

    **Returns**

      a float

    """
    atomData = self.atomDict
    return min(atomData[entry[0]][desc] for entry in compos)
|
|
def MAX(self, desc, compos):
    """ *Calculator Method*

    largest descriptor value found among the atoms of the composition
    (the amounts of the atoms are not taken into account)

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector

    **Returns**

      a float

    """
    atomData = self.atomDict
    return max(atomData[entry[0]][desc] for entry in compos)
|
|
|
|
#------------
|
|
# Other methods
|
|
#------------
|
|
|
|
def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

    This constructs the list of _nonZeroDescriptors_ and
    _requiredDescriptors_ from the _simpleList_:

      - every entry carrying a 'NONZERO' operation contributes a
        '<name> != 0' clause to _nonZeroDescriptors_, unless its name is one
        of the _countOptions_ names

      - the 'NONZERO' marker is then stripped from the entry; an entry whose
        only operation was 'NONZERO' is removed from the _simpleList_
        altogether

      - _requiredDescriptors_ ends up holding the names left in the
        _simpleList_, excluding the _countOptions_ names

    **Notes**

      - this modifies the _simpleList_ (and the operation lists inside it)
        in place

    """
    countNames = [option[0] for option in countOptions]

    self.nonZeroDescriptors = []
    # walk a snapshot, since entries may be dropped from self.simpleList
    for entry in self.simpleList[:]:
        descName, operations = entry[0], entry[1]
        if 'NONZERO' not in operations:
            continue
        if descName not in countNames:
            self.nonZeroDescriptors.append('%s != 0' % descName)
        if len(operations) == 1:
            self.simpleList.remove(entry)
        else:
            # the snapshot shares the operation lists with self.simpleList,
            # so this updates the live entry as well
            operations.remove('NONZERO')

    # a real list is needed here: ProcessCompoundList appends to it
    self.requiredDescriptors = [entry[0] for entry in self.simpleList
                                if entry[0] not in countNames]
|
|
|
|
def ProcessCompoundList(self):
    """ Adds entries from the _compoundList_ to the list of _requiredDescriptors_

    Every compound descriptor is inspected and each atomic descriptor it
    names (the second field of its entry) is appended to
    _requiredDescriptors_, skipping empty names and names already present.

    """
    required = self.requiredDescriptors
    for entry in self.compoundList:
        for atomicDesc in entry[1]:
            if atomicDesc == '' or atomicDesc in required:
                continue
            required.append(atomicDesc)
|
|
|
|
def BuildAtomDict(self):
    """ builds the local atomic dict

    We don't want to keep around all descriptor values for all atoms, so this
    method takes care of only pulling out the descriptors in which we are
    interested.

    **Notes**

      - _ProcessSimpleList_ and _ProcessCompoundList_ are run first to
        determine which descriptors are required

      - the _nonZeroDescriptors_ clauses are joined with ' and ' into a
        'where ...' string that is handed to the data query

      - this uses _chemutils.GetAtomicData_ to actually pull the data into
        the freshly created _atomDict_

    """
    self.ProcessSimpleList()
    self.ProcessCompoundList()

    self.atomDict = {}
    # str.join replaces string.join, which no longer exists in Python 3
    whereString = ' and '.join(self.nonZeroDescriptors)
    if whereString:
        whereString = 'where ' + whereString
    chemutils.GetAtomicData(self.atomDict, self.requiredDescriptors,
                            self.dbName, self.dbTable,
                            whereString, self.dbUser, self.dbPassword,
                            includeElCounts=1)
|
|
|
|
def CalcSimpleDescriptorsForComposition(self, compos='', composList=None):
    """ calculates all simple descriptors for a given composition

    **Arguments**

      - compos: a string representation of the composition

      - composList: a *composVect*

      The client must provide either _compos_ or _composList_. If both are
      provided, _composList_ takes priority.

    **Returns**

      the list of descriptor values, one per (descriptor, operation) pair of
      the _simpleList_ for which a *Calculator Method* exists

    **Notes**

      - when _compos_ is provided, this uses _chemutils.SplitComposition_
        to split the composition into its individual pieces

      - operations which do not name a method of this object are reported on
        stdout and skipped

      - if problems are encountered because of either an unknown descriptor or
        atom type, a _KeyError_ will be raised.

    """
    if composList is None:
        composList = chemutils.SplitComposition(compos)
    res = []
    try:
        for descName, targets in self.simpleList:
            for target in targets:
                try:
                    method = getattr(self, target)
                except AttributeError:
                    print('Method %s does not exist' % (target))
                else:
                    res.append(method(descName, composList))
    except KeyError:
        # wrap in a 1-tuple so a tuple-valued composList formats as one value
        print('composition %s caused problems' % (composList,))
        # bare raise keeps the original exception and its traceback
        raise
    return res
|
|
|
|
def CalcCompoundDescriptorsForComposition(self, compos='', composList=None,
                                          propDict=None):
    """ calculates all compound descriptors for a given composition

    **Arguments**

      - compos: a string representation of the composition

      - composList: a *composVect*

      - propDict: a dictionary containing the properties of the composition
        as a whole (e.g. structural variables, etc.). If omitted, a new empty
        dictionary is used for this call.

      The client must provide either _compos_ or _composList_. If both are
      provided, _composList_ takes priority.

    **Returns**

      the list of descriptor values, one per entry of the _compoundList_

    **Notes**

      - when _compos_ is provided, this uses _chemutils.SplitComposition_
        to split the composition into its individual pieces

      - each value is computed by _Parser.CalcSingleCompoundDescriptor_,
        which receives everything but the name from the _compoundList_ entry

    """
    # a fresh dict per call: a shared {} default would persist across calls
    if propDict is None:
        propDict = {}
    if composList is None:
        composList = chemutils.SplitComposition(compos)
    return [Parser.CalcSingleCompoundDescriptor(composList, entry[1:],
                                                self.atomDict, propDict)
            for entry in self.compoundList]
|
|
|
|
def CalcDescriptorsForComposition(self, composVect, propDict):
    """ calculates all descriptors for a given composition

    **Arguments**

      - composVect: a sequence whose first element is the string
        representation of the composition

      - propDict: a dictionary containing the properties of the composition
        as a whole (e.g. structural variables, etc.). These are used to
        generate Compound Descriptors

    **Returns**

      a tuple with all descriptor values: the simple descriptors followed by
      the compound descriptors. If the simple descriptor calculation raises
      a _KeyError_, an empty tuple is returned instead.

    **Notes**

      - this uses _chemutils.SplitComposition_
        to split the composition into its individual pieces

    """
    composList = chemutils.SplitComposition(composVect[0])
    try:
        r1 = self.CalcSimpleDescriptorsForComposition(composList=composList)
    except KeyError:
        # deliberate best-effort: a failed composition yields no descriptors
        res = []
    else:
        r2 = self.CalcCompoundDescriptorsForComposition(composList=composList,
                                                        propDict=propDict)
        res = r1 + r2

    return tuple(res)


CalcDescriptors = CalcDescriptorsForComposition
|
|
|
|
def GetDescriptorNames(self):
    """ returns the names of the descriptors this calculator generates

    **Returns**

      a tuple of names: '<OPERATION>_<DESCRIPTOR>' for each simple descriptor
      operation that names a method of this object, followed by the names of
      the compound descriptors

    **Notes**

      - the names are computed on the first call and cached in
        _descriptorNames_; later calls return a tuple built from that cache

      - operations which do not name a method of this object are reported on
        stdout and skipped

    """
    if self.descriptorNames is None:
        names = []
        for descName, targets in self.simpleList:
            for target in targets:
                if hasattr(self, target):
                    names.append('%s_%s' % (target, descName))
                else:
                    print('Method %s does not exist' % (target))
        names.extend(entry[0] for entry in self.compoundList)
        self.descriptorNames = names
    # always a tuple, so cached calls match the type of the first call
    return tuple(self.descriptorNames)
|
|
|
|
def __init__(self, simpleList, compoundList=None,
             dbName=None,
             dbTable='atomic_data', dbUser='sysdba', dbPassword='masterkey'):
    """ Constructor

    **Arguments**

      - simpleList: list of simple descriptors to be calculated
        (see below for format)

      - compoundList: list of compound descriptors to be calculated
        (see below for format)

      - dbName: name of the atomic database to be used. If not provided,
        'atomdb.gdb' in _RDConfig.RDDataDir_ is used, or "::RDData" when
        _RDConfig.usePgSQL_ is set.

      - dbTable: name the table in _dbName_ which has atomic data

      - dbUser: user name for DB access

      - dbPassword: password for DB access

    **Note**

      - format of simpleList:
        a list of 2-tuples containing:

          1) name of the atomic descriptor

          2) a list of operations on that descriptor (e.g. NonZero, Max, etc.)
             These must correspond to the *Calculator Method* names above.

        Names and operations are stored upper-cased.

      - format of compoundList:
        a list of 4-tuples containing:

          1) name of the descriptor to be calculated

          2) list of selected atomic descriptor names (define $1, $2, etc.)

          3) list of selected compound descriptor names (define $a, $b, etc.)

          4) text formula defining the calculation (see _Parser_)

    """
    if dbName is None:
        if not RDConfig.usePgSQL:
            dbName = os.path.join(RDConfig.RDDataDir, 'atomdb.gdb')
        else:
            dbName = "::RDData"

    Descriptors.DescriptorCalculator.__init__(self)
    # real lists are required: the operation lists are later searched,
    # measured with len() and modified in place by ProcessSimpleList
    self.simpleList = [(entry[0].upper(), [op.upper() for op in entry[1]])
                       for entry in simpleList]
    self.descriptorNames = None
    self.compoundList = compoundList
    if self.compoundList is None:
        self.compoundList = []
    self.dbName = dbName
    self.dbTable = dbTable
    self.dbUser = dbUser
    self.dbPassword = dbPassword
|
|
|
|
|
|
if __name__ == '__main__':
    # Smoke test: builds a calculator, pulls the atomic data from the
    # database and prints the simple descriptors of a few compositions.
    d = [('DED', ['NonZero', 'Mean', 'Dev']),
         ('M_B_electroneg', ['NonZero']),
         ('Cov_rad', ['Max', 'Min'])]
    # the class defined in this module is CompoundDescriptorCalculator;
    # the bare name DescriptorCalculator is not bound here
    o = CompoundDescriptorCalculator(d)
    o.BuildAtomDict()
    print('len: %s' % len(o.atomDict))
    # list() because dict.keys() cannot be sliced on Python 3
    for key in list(o.atomDict.keys())[-4:-1]:
        print('%s %s' % (key, o.atomDict[key]))

    print('descriptors: %s' % (o.GetDescriptorNames(),))
    composList = ['Nb', 'Nb3', 'NbPt', 'Nb2Pt']
    for compos in composList:
        descs = o.CalcSimpleDescriptorsForComposition(compos)
        print('%s %s' % (compos, descs))
|