add sa score

This commit is contained in:
Greg Landrum
2013-10-02 04:55:34 +02:00
parent ff85e5b5e6
commit 740da0d0b9
5 changed files with 327 additions and 0 deletions

9
Contrib/SA_Score/README Normal file
View File

@@ -0,0 +1,9 @@
RDKit-based implementation of the method described in:
Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
Peter Ertl and Ansgar Schuffenhauer
Journal of Cheminformatics 1:8 (2009)
http://www.jcheminf.com/content/1/1/8
Contribution from Peter Ertl and Greg Landrum

View File

@@ -0,0 +1,34 @@
from rdkit import RDConfig
from rdkit import Chem
import unittest,os.path
import sascorer
# Debug aid: show which sascorer module file was actually imported.
# A single-argument print(...) call behaves the same on Python 2 and 3.
print(sascorer.__file__)
class TestCase(unittest.TestCase):
    """Regression test comparing sascorer output with stored reference values."""

    def test1(self):
        """Check calculateScore() against the scores in data/zim.100.txt.

        Each data row is tab-separated; column 0 holds the SMILES and
        column 2 the expected score.  The path is relative to the current
        working directory.
        """
        # 'with' guarantees the handle is closed; open() replaces the
        # Python-2-only file() builtin.
        with open('data/zim.100.txt') as inf:
            testData = [x.strip().split('\t') for x in inf]
        testData.pop(0)  # drop the header row
        for row in testData:
            smi = row[0]
            m = Chem.MolFromSmiles(smi)
            tgt = float(row[2])
            val = sascorer.calculateScore(m)
            # assertAlmostEqual is the supported name of the deprecated
            # failUnlessAlmostEqual alias; 3 = number of decimal places.
            self.assertAlmostEqual(tgt, val, 3)
if __name__ == '__main__':
    import sys
    import getopt
    import re

    # -l ("long") additionally enables test methods parked under a
    # '_test' prefix by aliasing them to a 'test' name.
    doLong = 0
    if len(sys.argv) > 1:
        args, extras = getopt.getopt(sys.argv[1:], 'l')
        for arg, val in args:
            if arg == '-l':
                doLong = 1
                # strip the flag so that unittest.main() does not see it
                sys.argv.remove('-l')
    if doLong:
        for methName in dir(TestCase):
            if re.match('_test', methName):
                newName = re.sub('_test', 'test', methName)
                # plain attribute aliasing instead of exec() on a built string
                setattr(TestCase, newName, getattr(TestCase, methName))
    unittest.main()

View File

@@ -0,0 +1,101 @@
smiles Name sa_score
Cc1c(C(=O)NCCO)[n+](=O)c2ccccc2n1[O-] ZINC21984717 3.166
Cn1cc(NC=O)cc1C(=O)Nc1cc(C(=O)Nc2cc(C(=O)NCCC(N)=[NH2+])n(C)c2)n(C)c1 ZINC03872327 3.328
OC(c1ccncc1)c1ccc(OCC[NH+]2CCCC2)cc1 ZINC34421620 3.822
CC(C(=O)[O-])c1ccc(-c2ccccc2)cc1 ZINC00000361 2.462
C[NH+](C)CC(O)Cn1c2ccc(Br)cc2c2cc(Br)ccc21 ZINC00626529 3.577
NC(=[NH2+])NCC1COc2ccccc2O1 ZINC00000357 3.290
CCC(C)(C)[NH2+]CC(O)COc1ccccc1C#N ZINC04214111 3.698
C[NH+](C)CC(O)Cn1c2ccc(Br)cc2c2cc(Br)ccc21 ZINC00626528 3.577
CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2=O ZINC04081985 3.912
COc1ccc(OC(=O)N(CC(=O)[O-])Cc2ccc(OCCc3nc(-c4ccccc4)oc3C)cc2)cc1 ZINC03935839 2.644
COc1ccccc1OC(=O)c1ccccc1 ZINC00000349 1.342
CC(C)CC[NH2+]CC1COc2ccccc2O1 ZINC04214115 3.701
CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1 ZINC19632834 3.196
CCC1(c2ccccc2)C(=O)N(COC)C(=O)N(COC)C1=O ZINC02986592 2.759
Nc1ccc(S(=O)(=O)Nc2ccccc2)cc1 ZINC00141883 1.529
O=C([O-])CCCNC(=O)NC1CCCCC1 ZINC08754389 2.493
CCC(C)C(C(=O)OC1CC[N+](C)(C)CC1)c1ccccc1 ZINC00000595 3.399
CCC(C)SSc1ncc[nH]1 ZINC13209429 3.983
CC[N+](C)(CC)CCOC(=O)C(O)(c1cccs1)C1CCCC1 ZINC01690860 3.471
CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O ZINC03814360 3.994
CC12CCC3C4CCC(=O)C=C4CCC3C1CCC2O ZINC03814379 4.056
OCC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1O ZINC04095762 4.282
CC(C)CC(CC[NH+](C(C)C)C(C)C)(C(N)=O)c1ccccn1 ZINC02016048 4.092
C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(OC(=O)CC[NH+](C)C)C1C(C)(C)CCC(O)C12C ZINC38595287 5.519
C=CC[NH+]1CCCC1CNC(=O)c1cc(S(N)(=O)=O)cc(OC)c1OC ZINC00601278 4.286
CC(=O)OC1C[NH+]2CCC1CC2 ZINC00492792 5.711
CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O ZINC03814418 3.994
CC1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C ZINC03814422 4.022
CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)C5CC5C4(C)C3CCC21C ZINC03814423 4.827
C#CC1(O)CCC2C3CCc4cc(OC)ccc4C3CCC21C ZINC03815424 3.810
C=CC1(C)CC(OC(=O)CSCC[NH+](CC)CC)C2(C)C3C(=O)CCC3(CCC2C)C(C)C1O ZINC25757051 6.200
O=C([O-])C(=O)Nc1nc(-c2ccc3c(c2)OCCO3)cs1 ZINC03623428 2.594
CC[NH+]1CCCC1CNC(=O)C(O)(c1ccccc1)c1ccccc1 ZINC00900569 3.950
CC(C)(OCc1nn(Cc2ccccc2)c2ccccc12)C(=O)[O-] ZINC00004594 2.573
Cc1nnc(C(C)C)n1C1CC2CCC(C1)[NH+]2CCC(NC(=O)C1CCC(F)(F)CC1)c1ccccc1 ZINC03817234 5.316
Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871612 5.290
O=C([O-])CNC(=O)c1ccccc1 ZINC00097685 2.097
Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871613 5.290
Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871614 5.290
c1ccc(OCc2ccc(CCCN3CCOCC3)cc2)cc1 ZINC19865692 1.702
CC=CC1=C(C(=O)[O-])N2C(=O)C(NC(=O)C(N)c3ccc(O)cc3)C2SC1 ZINC20444132 4.042
C[NH+]1CCCC1COc1cccnc1 ZINC03805141 4.510
O=C([O-])C(O)CC(O)C(O)CO ZINC04803503 4.398
O=C([O-])C(O)CC(O)C(O)CO ZINC01696607 4.398
C[NH+]1CCCC1Cc1c[nH]c2ccc(CCS(=O)(=O)c3ccccc3)cc12 ZINC03823475 3.921
C(=Cc1ccccc1)C[NH+]1CCN(C(c2ccccc2)c2ccccc2)CC1 ZINC19632891 2.973
Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871615 5.290
CC(c1ccccc1)N(C)C=O ZINC06932229 2.562
CC(=O)C1CCC2C3CCC4CC(C)(O)CCC4(C)C3CCC12C ZINC03824281 4.279
O=C([O-])C(O)CC(O)C(O)CO ZINC04803506 4.398
COc1cc(O)c(C(=O)c2ccccc2)c(O)c1 ZINC00000187 1.868
O=C([O-])C(O)CC(O)C(O)CO ZINC04803507 4.398
COc1c2c(cc3c1C(O)N(C)CC3)OCO2 ZINC00000186 3.183
CCC(C(=O)[O-])c1ccc(CC(C)C)cc1 ZINC00015537 2.827
O=C([O-])C1[NH+]=C(c2ccccc2)c2cc(Cl)ccc2NC1(O)O ZINC38611850 4.011
O=C([O-])C1[NH+]=C(c2ccccc2)c2cc(Cl)ccc2NC1(O)O ZINC38611851 4.011
OCC(O)COc1ccc(Cl)cc1 ZINC00000135 2.102
NC(=O)NC(=O)C(Cl)c1ccccc1 ZINC00000134 2.455
OC(c1ccccc1)(c1ccccc1)C1C[NH+]2CCC1CC2 ZINC01298963 4.530
C[NH2+]CC(C)c1ccccc1 ZINC04298801 3.471
Clc1cccc(Cl)c1N=C1NCCO1 ZINC13835972 3.267
[NH3+]C(Cc1ccccc1)C(=O)CCl ZINC02504633 3.251
CC(C)Cn1cnc2c1c1ccccc1nc2N ZINC19632912 2.230
CC(O)CN(C)c1ccc(NN)nn1 ZINC00000624 3.193
CC1(O)CCC2C3CCC4=CC(=O)CCC4=C3C=CC21C ZINC00001727 4.461
CCC(C(=O)[O-])c1ccc(-c2ccccc2)cc1 ZINC00000111 2.505
CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830255 3.832
CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830256 3.832
Cn1cc(C(=O)c2cccc3ccccc32)cc1C(=O)[O-] ZINC00001783 2.456
CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830257 3.832
Cc1cccc(-c2nc3ccccc3c(Nc3ccc4[nH]ncc4c3)n2)n1 ZINC39279791 2.358
O=C([O-])C1CC2CCCCC2[NH2+]1 ZINC04899687 5.422
CC(=O)OCC(=O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C ZINC00538219 4.187
O=C([O-])C1CC2CCCCC2[NH2+]1 ZINC04899686 5.422
O=C(OCc1ccccc1)C(O)c1ccccc1 ZINC00000078 2.038
CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C ZINC00608041 4.394
Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1 ZINC02570895 2.144
COCc1cccc(CC(O)C=CC2C(O)CC(=O)C2CCSCCCC(=O)OC)c1 ZINC03940680 3.934
CCC(=O)N(c1ccccc1)C1CC[NH+](C(C)Cc2ccccc2)CC1 ZINC01664586 3.582
CCC(=O)N(c1ccccc1)C1CC[NH+](C(C)Cc2ccccc2)CC1 ZINC01664587 3.582
CCOC(=O)Nc1ccc2c(c1)N(C(=O)CCN1CCOCC1)c1ccccc1S2 ZINC19340795 2.446
O=C([O-])Cc1cc(=O)[nH]c(=O)[nH]1 ZINC00403617 3.258
NC(=O)C([NH3+])Cc1c[nH]c2ccccc12 ZINC04899521 3.224
NC(=O)C([NH3+])Cc1ccc(O)cc1 ZINC04899513 3.280
O=C(c1cc2ccccc2o1)N1CCN(Cc2ccccc2)CC1 ZINC19632922 1.799
O=C(CO)C(O)C(O)CO ZINC00902219 3.473
CC(Cc1ccccc1)NC(=O)C([NH3+])CCCC[NH3+] ZINC11680943 3.967
C[NH+]1CCC(c2c(O)cc(=O)c3c(O)cc(-c4ccccc4Cl)oc2-3)C(O)C1 ZINC05966679 4.616
CN(C)c1ccc(O)c2c1CC1CC3C([NH+](C)C)C(=O)C(C(N)=O)=C(O)C3(O)C(=O)C1=C2O ZINC04019704 4.713
Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(CC(O)C(O)C(O)CO)c2cc1C ZINC03650334 3.791
C[NH+]1C2CCC1CC(OC(=O)c1c[nH]c3ccccc13)C2 ZINC18130447 4.892
Cc1ccccc1NC(=O)C(C)[NH+]1CCCC1 ZINC00000051 3.809
O=S(=O)([O-])CCN1CCOCC1 ZINC19419111 2.776
C[NH+]1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc32)CC1 ZINC19632927 3.379
CCCCCC=CCC=CCCCCCCCC(=O)[O-] ZINC03802188 2.805
CC(CC([NH3+])C(=O)[O-])C(=O)[O-] ZINC01747048 5.690
CC1c2cccc(O)c2C(=O)C2=C(O)C3(O)C(O)=C(C(N)=O)C(=O)C([NH+](C)C)C3C(O)C21 ZINC04019706 5.069
Cc1cc2nc3nc([O-])[nH]c(=O)c3nc2cc1C ZINC12446789 3.079
CC1=CC(C)C2(CO)COC(c3ccc(O)cc3)C1C2C ZINC38190856 4.749
CC[NH+]1CCC(=C2c3ccccc3CCc3ccccc32)C1C ZINC02020004 3.925

Binary file not shown.

View File

@@ -0,0 +1,183 @@
#
# calculation of synthetic accessibility score as described in:
#
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
# Peter Ertl and Ansgar Schuffenhauer
# Journal of Cheminformatics 1:8 (2009)
# http://www.jcheminf.com/content/1/1/8
#
# several small modifications to the original paper are included,
# in particular a slightly different formula for the macrocyclic penalty
# and a correction that also takes molecular symmetry (fingerprint density) into account
#
# for a set of 10k diverse molecules the agreement between the original method
# as implemented in PipelinePilot and this implementation is r2 = 0.97
#
# peter ertl & greg landrum, september 2013
#
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import math
from collections import defaultdict
# Module-level cache of the fragment score table.  It stays None until
# readFragmentScores() assigns the unpickled table; calculateScore()
# triggers that load on first use and looks fragment ids up with .get().
_fscores = None
def readFragmentScores(name='fpscores'):
    """Load the pickled fragment score table into the module-level cache.

    Arguments:
      - name: path of the score file without its '.pkl.gz' suffix.  A
        relative path is resolved against the current working directory.

    The unpickled object is stored in the global _fscores; nothing is
    returned.  Errors from opening or unpickling the file propagate to
    the caller.
    """
    import gzip
    try:
        import cPickle as pickle  # Python 2: C implementation
    except ImportError:
        import pickle  # Python 3: there is no separate cPickle module
    global _fscores
    # SECURITY: unpickling can execute code embedded in the file, so only
    # load score files that come from a trusted source.
    inf = gzip.open('%s.pkl.gz' % name)
    try:
        _fscores = pickle.load(inf)
    finally:
        # close explicitly; previously the handle was left open
        inf.close()
def numBridgeheadsAndSpiro(mol, ri=None):
    """Count the bridgehead and spiro atoms of a molecule.

    Arguments:
      - mol: the molecule; must provide GetRingInfo() and GetBondWithIdx()
      - ri: optional ring information for mol, providing AtomRings() and
        BondRings(); fetched via mol.GetRingInfo() when omitted

    Returns a 2-tuple: (number of bridgehead atoms, number of spiro atoms).
    """
    if ri is None:
        ri = mol.GetRingInfo()

    # spiro atoms: an atom that is the only atom two rings have in common
    arings = [set(x) for x in ri.AtomRings()]
    spiros = set()
    for i, ari in enumerate(arings):
        for j in range(i + 1, len(arings)):
            shared = ari & arings[j]
            if len(shared) == 1:
                spiros.update(shared)

    # bridgeheads: for every pair of rings with two or more bonds in
    # common, take the atoms touched by exactly one of the shared bonds
    # (normally the two ends of the shared path)
    brings = [set(x) for x in ri.BondRings()]
    bridges = set()
    for i, bri in enumerate(brings):
        for j in range(i + 1, len(brings)):
            shared = bri & brings[j]
            if len(shared) > 1:
                atomCounts = defaultdict(int)
                for bi in shared:
                    bond = mol.GetBondWithIdx(bi)
                    atomCounts[bond.GetBeginAtomIdx()] += 1
                    atomCounts[bond.GetEndAtomIdx()] += 1
                for ai, cnt in atomCounts.items():
                    if cnt == 1:
                        bridges.add(ai)
    return len(bridges), len(spiros)
def calculateScore(m):
    """Return the synthetic accessibility score of molecule m.

    The raw score is the sum of three parts:
      - a fragment score: the count-weighted mean of the table
        contributions of the molecule's Morgan (radius 2) fragments,
        using -4 for fragments missing from the table,
      - a complexity penalty built from size, stereo centers, spiro
        atoms, bridgehead atoms and the presence of macrocycles,
      - a fingerprint-density correction for symmetrical molecules.
    The raw value is then mapped so that higher raw totals give lower
    final values, smoothed above 8 and clamped to the range [1.0, 10.0].

    The fragment table is loaded with readFragmentScores() on first use.

    Raises ZeroDivisionError if the fingerprint of m has no non-zero
    elements, because the fragment score is a mean over fragment counts.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    # .items() works on Python 2 and 3 (iteritems() is Python-2-only)
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        # rings with more than 8 atoms count as macrocycles
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named rawMin/rawMax so the min()/max() builtins are not shadowed)
    rawMin = -4.0
    rawMax = 2.5
    sascore = 11. - (sascore - rawMin + 1) / (rawMax - rawMin) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def processMols(mols, outf):
    """Print a tab-separated table of scores for mols to standard output.

    A header line 'smiles<TAB>Name<TAB>sa_score' is printed first, then one
    line per molecule with its SMILES, its '_Name' property and its score.

    Arguments:
      - mols: iterable of molecules; None entries are skipped
      - outf: accepted for interface compatibility but currently unused;
        all output goes to stdout
    """
    # print(<single string>) behaves identically under Python 2 and 3
    print('smiles\tName\tsa_score')
    for m in mols:
        if m is None:
            continue
        s = calculateScore(m)
        smiles = Chem.MolToSmiles(m)
        # NOTE(review): "%3f" means a minimum field width of 3 with the
        # default six decimals; "%.3f" may have been intended.  Left
        # unchanged so the output format stays the same.
        print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
if __name__ == '__main__':
    import sys
    import gzip
    import time

    # Command-line use: score every molecule of the SMILES file given as
    # the first argument and report timings on stderr.
    if len(sys.argv) < 2:
        # fail with a usage hint instead of a bare IndexError
        sys.stderr.write('Usage: %s <smiles file>\n' % sys.argv[0])
        sys.exit(1)

    outf = None
    t1 = time.time()
    readFragmentScores("fpscores")
    t2 = time.time()

    suppl = Chem.SmilesMolSupplier(sys.argv[1])
    t3 = time.time()
    processMols(suppl, outf)
    t4 = time.time()

    # sys.stderr.write works on Python 2 and 3 ('print >>' is Python-2-only)
    sys.stderr.write('Reading took %.2f seconds. Calculating took %.2f seconds\n' % ((t2 - t1), (t4 - t3)))
#
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#