diff --git a/Contrib/SA_Score/README b/Contrib/SA_Score/README new file mode 100644 index 000000000..cdd442f8b --- /dev/null +++ b/Contrib/SA_Score/README @@ -0,0 +1,9 @@ +RDKit-based implementation of the method described in: + +Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions +Peter Ertl and Ansgar Schuffenhauer +Journal of Cheminformatics 1:8 (2009) +http://www.jcheminf.com/content/1/1/8 + +Contribution from Peter Ertl and Greg Landrum + diff --git a/Contrib/SA_Score/UnitTestSAScore.py b/Contrib/SA_Score/UnitTestSAScore.py new file mode 100644 index 000000000..59d8bed3b --- /dev/null +++ b/Contrib/SA_Score/UnitTestSAScore.py @@ -0,0 +1,34 @@ +from rdkit import RDConfig +from rdkit import Chem +import unittest,os.path +import sascorer +print sascorer.__file__ + +class TestCase(unittest.TestCase): + def test1(self): + testData = [x.strip().split('\t') for x in file('data/zim.100.txt').readlines()] + testData.pop(0) + for row in testData: + smi = row[0] + m = Chem.MolFromSmiles(smi) + tgt = float(row[2]) + val = sascorer.calculateScore(m) + self.failUnlessAlmostEqual(tgt,val,3) + +if __name__ == '__main__': + import sys,getopt,re + doLong=0 + if len(sys.argv) >1: + args,extras=getopt.getopt(sys.argv[1:],'l') + for arg,val in args: + if arg=='-l': + doLong=1 + sys.argv.remove('-l') + if doLong: + for methName in dir(TestCase): + if re.match('_test',methName): + newName = re.sub('_test','test',methName) + exec('TestCase.%s = TestCase.%s'%(newName,methName)) + + unittest.main() + diff --git a/Contrib/SA_Score/data/zim.100.txt b/Contrib/SA_Score/data/zim.100.txt new file mode 100644 index 000000000..f8eb6f413 --- /dev/null +++ b/Contrib/SA_Score/data/zim.100.txt @@ -0,0 +1,101 @@ +smiles Name sa_score +Cc1c(C(=O)NCCO)[n+](=O)c2ccccc2n1[O-] ZINC21984717 3.166 +Cn1cc(NC=O)cc1C(=O)Nc1cc(C(=O)Nc2cc(C(=O)NCCC(N)=[NH2+])n(C)c2)n(C)c1 ZINC03872327 3.328 +OC(c1ccncc1)c1ccc(OCC[NH+]2CCCC2)cc1 
ZINC34421620 3.822 +CC(C(=O)[O-])c1ccc(-c2ccccc2)cc1 ZINC00000361 2.462 +C[NH+](C)CC(O)Cn1c2ccc(Br)cc2c2cc(Br)ccc21 ZINC00626529 3.577 +NC(=[NH2+])NCC1COc2ccccc2O1 ZINC00000357 3.290 +CCC(C)(C)[NH2+]CC(O)COc1ccccc1C#N ZINC04214111 3.698 +C[NH+](C)CC(O)Cn1c2ccc(Br)cc2c2cc(Br)ccc21 ZINC00626528 3.577 +CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2=O ZINC04081985 3.912 +COc1ccc(OC(=O)N(CC(=O)[O-])Cc2ccc(OCCc3nc(-c4ccccc4)oc3C)cc2)cc1 ZINC03935839 2.644 +COc1ccccc1OC(=O)c1ccccc1 ZINC00000349 1.342 +CC(C)CC[NH2+]CC1COc2ccccc2O1 ZINC04214115 3.701 +CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1 ZINC19632834 3.196 +CCC1(c2ccccc2)C(=O)N(COC)C(=O)N(COC)C1=O ZINC02986592 2.759 +Nc1ccc(S(=O)(=O)Nc2ccccc2)cc1 ZINC00141883 1.529 +O=C([O-])CCCNC(=O)NC1CCCCC1 ZINC08754389 2.493 +CCC(C)C(C(=O)OC1CC[N+](C)(C)CC1)c1ccccc1 ZINC00000595 3.399 +CCC(C)SSc1ncc[nH]1 ZINC13209429 3.983 +CC[N+](C)(CC)CCOC(=O)C(O)(c1cccs1)C1CCCC1 ZINC01690860 3.471 +CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O ZINC03814360 3.994 +CC12CCC3C4CCC(=O)C=C4CCC3C1CCC2O ZINC03814379 4.056 +OCC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1O ZINC04095762 4.282 +CC(C)CC(CC[NH+](C(C)C)C(C)C)(C(N)=O)c1ccccn1 ZINC02016048 4.092 +C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(OC(=O)CC[NH+](C)C)C1C(C)(C)CCC(O)C12C ZINC38595287 5.519 +C=CC[NH+]1CCCC1CNC(=O)c1cc(S(N)(=O)=O)cc(OC)c1OC ZINC00601278 4.286 +CC(=O)OC1C[NH+]2CCC1CC2 ZINC00492792 5.711 +CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O ZINC03814418 3.994 +CC1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C ZINC03814422 4.022 +CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)C5CC5C4(C)C3CCC21C ZINC03814423 4.827 +C#CC1(O)CCC2C3CCc4cc(OC)ccc4C3CCC21C ZINC03815424 3.810 +C=CC1(C)CC(OC(=O)CSCC[NH+](CC)CC)C2(C)C3C(=O)CCC3(CCC2C)C(C)C1O ZINC25757051 6.200 +O=C([O-])C(=O)Nc1nc(-c2ccc3c(c2)OCCO3)cs1 ZINC03623428 2.594 +CC[NH+]1CCCC1CNC(=O)C(O)(c1ccccc1)c1ccccc1 ZINC00900569 3.950 +CC(C)(OCc1nn(Cc2ccccc2)c2ccccc12)C(=O)[O-] ZINC00004594 2.573 +Cc1nnc(C(C)C)n1C1CC2CCC(C1)[NH+]2CCC(NC(=O)C1CCC(F)(F)CC1)c1ccccc1 ZINC03817234 5.316 
+Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871612 5.290 +O=C([O-])CNC(=O)c1ccccc1 ZINC00097685 2.097 +Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871613 5.290 +Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871614 5.290 +c1ccc(OCc2ccc(CCCN3CCOCC3)cc2)cc1 ZINC19865692 1.702 +CC=CC1=C(C(=O)[O-])N2C(=O)C(NC(=O)C(N)c3ccc(O)cc3)C2SC1 ZINC20444132 4.042 +C[NH+]1CCCC1COc1cccnc1 ZINC03805141 4.510 +O=C([O-])C(O)CC(O)C(O)CO ZINC04803503 4.398 +O=C([O-])C(O)CC(O)C(O)CO ZINC01696607 4.398 +C[NH+]1CCCC1Cc1c[nH]c2ccc(CCS(=O)(=O)c3ccccc3)cc12 ZINC03823475 3.921 +C(=Cc1ccccc1)C[NH+]1CCN(C(c2ccccc2)c2ccccc2)CC1 ZINC19632891 2.973 +Nc1ncnc2c1ncn2C1OC(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C(O)C1O ZINC03871615 5.290 +CC(c1ccccc1)N(C)C=O ZINC06932229 2.562 +CC(=O)C1CCC2C3CCC4CC(C)(O)CCC4(C)C3CCC12C ZINC03824281 4.279 +O=C([O-])C(O)CC(O)C(O)CO ZINC04803506 4.398 +COc1cc(O)c(C(=O)c2ccccc2)c(O)c1 ZINC00000187 1.868 +O=C([O-])C(O)CC(O)C(O)CO ZINC04803507 4.398 +COc1c2c(cc3c1C(O)N(C)CC3)OCO2 ZINC00000186 3.183 +CCC(C(=O)[O-])c1ccc(CC(C)C)cc1 ZINC00015537 2.827 +O=C([O-])C1[NH+]=C(c2ccccc2)c2cc(Cl)ccc2NC1(O)O ZINC38611850 4.011 +O=C([O-])C1[NH+]=C(c2ccccc2)c2cc(Cl)ccc2NC1(O)O ZINC38611851 4.011 +OCC(O)COc1ccc(Cl)cc1 ZINC00000135 2.102 +NC(=O)NC(=O)C(Cl)c1ccccc1 ZINC00000134 2.455 +OC(c1ccccc1)(c1ccccc1)C1C[NH+]2CCC1CC2 ZINC01298963 4.530 +C[NH2+]CC(C)c1ccccc1 ZINC04298801 3.471 +Clc1cccc(Cl)c1N=C1NCCO1 ZINC13835972 3.267 +[NH3+]C(Cc1ccccc1)C(=O)CCl ZINC02504633 3.251 +CC(C)Cn1cnc2c1c1ccccc1nc2N ZINC19632912 2.230 +CC(O)CN(C)c1ccc(NN)nn1 ZINC00000624 3.193 +CC1(O)CCC2C3CCC4=CC(=O)CCC4=C3C=CC21C ZINC00001727 4.461 +CCC(C(=O)[O-])c1ccc(-c2ccccc2)cc1 ZINC00000111 2.505 +CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830255 3.832 +CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830256 3.832 +Cn1cc(C(=O)c2cccc3ccccc32)cc1C(=O)[O-] ZINC00001783 2.456 
+CC(=O)OCC1OC(n2ncc(=O)[nH]c2=O)C(OC(C)=O)C1OC(C)=O ZINC03830257 3.832 +Cc1cccc(-c2nc3ccccc3c(Nc3ccc4[nH]ncc4c3)n2)n1 ZINC39279791 2.358 +O=C([O-])C1CC2CCCCC2[NH2+]1 ZINC04899687 5.422 +CC(=O)OCC(=O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C ZINC00538219 4.187 +O=C([O-])C1CC2CCCCC2[NH2+]1 ZINC04899686 5.422 +O=C(OCc1ccccc1)C(O)c1ccccc1 ZINC00000078 2.038 +CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C ZINC00608041 4.394 +Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1 ZINC02570895 2.144 +COCc1cccc(CC(O)C=CC2C(O)CC(=O)C2CCSCCCC(=O)OC)c1 ZINC03940680 3.934 +CCC(=O)N(c1ccccc1)C1CC[NH+](C(C)Cc2ccccc2)CC1 ZINC01664586 3.582 +CCC(=O)N(c1ccccc1)C1CC[NH+](C(C)Cc2ccccc2)CC1 ZINC01664587 3.582 +CCOC(=O)Nc1ccc2c(c1)N(C(=O)CCN1CCOCC1)c1ccccc1S2 ZINC19340795 2.446 +O=C([O-])Cc1cc(=O)[nH]c(=O)[nH]1 ZINC00403617 3.258 +NC(=O)C([NH3+])Cc1c[nH]c2ccccc12 ZINC04899521 3.224 +NC(=O)C([NH3+])Cc1ccc(O)cc1 ZINC04899513 3.280 +O=C(c1cc2ccccc2o1)N1CCN(Cc2ccccc2)CC1 ZINC19632922 1.799 +O=C(CO)C(O)C(O)CO ZINC00902219 3.473 +CC(Cc1ccccc1)NC(=O)C([NH3+])CCCC[NH3+] ZINC11680943 3.967 +C[NH+]1CCC(c2c(O)cc(=O)c3c(O)cc(-c4ccccc4Cl)oc2-3)C(O)C1 ZINC05966679 4.616 +CN(C)c1ccc(O)c2c1CC1CC3C([NH+](C)C)C(=O)C(C(N)=O)=C(O)C3(O)C(=O)C1=C2O ZINC04019704 4.713 +Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(CC(O)C(O)C(O)CO)c2cc1C ZINC03650334 3.791 +C[NH+]1C2CCC1CC(OC(=O)c1c[nH]c3ccccc13)C2 ZINC18130447 4.892 +Cc1ccccc1NC(=O)C(C)[NH+]1CCCC1 ZINC00000051 3.809 +O=S(=O)([O-])CCN1CCOCC1 ZINC19419111 2.776 +C[NH+]1CCN(CC(=O)N2c3ccccc3C(=O)Nc3cccnc32)CC1 ZINC19632927 3.379 +CCCCCC=CCC=CCCCCCCCC(=O)[O-] ZINC03802188 2.805 +CC(CC([NH3+])C(=O)[O-])C(=O)[O-] ZINC01747048 5.690 +CC1c2cccc(O)c2C(=O)C2=C(O)C3(O)C(O)=C(C(N)=O)C(=O)C([NH+](C)C)C3C(O)C21 ZINC04019706 5.069 +Cc1cc2nc3nc([O-])[nH]c(=O)c3nc2cc1C ZINC12446789 3.079 +CC1=CC(C)C2(CO)COC(c3ccc(O)cc3)C1C2C ZINC38190856 4.749 +CC[NH+]1CCC(=C2c3ccccc3CCc3ccccc32)C1C ZINC02020004 3.925 diff --git a/Contrib/SA_Score/fpscores.pkl.gz b/Contrib/SA_Score/fpscores.pkl.gz new file mode 
#
# calculation of synthetic accessibility score as described in:
#
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
# Peter Ertl and Ansgar Schuffenhauer
# Journal of Cheminformatics 1:8 (2009)
# http://www.jcheminf.com/content/1/1/8
#
# several small modifications to the original paper are included
# particularly slightly different formula for macrocyclic penalty
# and taking into account also molecule symmetry (fingerprint density)
#
# for a set of 10k diverse molecules the agreement between the original method
# as implemented in PipelinePilot and this implementation is r2 = 0.97
#
# peter ertl & greg landrum, september 2013
#

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

import math
from collections import defaultdict
from itertools import combinations

# Fragment-contribution table, loaded lazily by readFragmentScores().
# calculateScore() looks fingerprint bit ids up in it with .get(), so it is
# expected to be a mapping from bit id to a numeric contribution.
_fscores = None


def readFragmentScores(name='fpscores'):
    """Load the fragment-score table into the module-level ``_fscores``.

    The table is read from the gzip-compressed pickle ``<name>.pkl.gz``; the
    path is used as given, i.e. a relative name is resolved against the
    current working directory.

    NOTE: unpickling can execute arbitrary code. Only load score files that
    come from a trusted source (such as the one shipped with this module).
    """
    try:
        import cPickle as pickle  # Python 2: C-accelerated pickle
    except ImportError:
        import pickle  # Python 3
    import gzip
    global _fscores
    # try/finally (rather than "with") so the handle is always closed, even on
    # interpreters whose GzipFile is not a context manager.
    fobj = gzip.open('%s.pkl.gz' % name)
    try:
        _fscores = pickle.load(fobj)
    finally:
        fobj.close()


def numBridgeheadsAndSpiro(mol, ri=None):
    """Count bridgehead and spiro atoms of a molecule.

    Arguments:
      - mol: the molecule; must provide GetRingInfo() and GetBondWithIdx()
      - ri:  optional ring info for ``mol`` (providing AtomRings() and
             BondRings()); fetched from ``mol`` when None

    Returns the tuple ``(nBridgeheads, nSpiro)``.
    """
    if ri is None:
        ri = mol.GetRingInfo()

    # spiro atoms: the single atom shared by a pair of rings
    arings = [set(x) for x in ri.AtomRings()]
    spiros = set()
    for ringA, ringB in combinations(arings, 2):
        shared = ringA & ringB
        if len(shared) == 1:
            spiros.update(shared)
    nSpiro = len(spiros)

    # find bonds that are shared between rings that share at least 2 bonds:
    brings = [set(x) for x in ri.BondRings()]
    bridges = set()
    for ringA, ringB in combinations(brings, 2):
        shared = ringA & ringB
        if len(shared) > 1:
            # count how many of the shared bonds touch each atom
            atomCounts = defaultdict(int)
            for bi in shared:
                bond = mol.GetBondWithIdx(bi)
                atomCounts[bond.GetBeginAtomIdx()] += 1
                atomCounts[bond.GetEndAtomIdx()] += 1
            # atoms touched by exactly one shared bond are the end points of
            # the shared stretch; these are counted as bridgeheads
            for ai, cnt in atomCounts.items():
                if cnt == 1:
                    bridges.add(ai)
    return len(bridges), nSpiro


def calculateScore(m):
    """Return the synthetic accessibility score of molecule ``m``.

    The raw score is the sum of a fragment score (mean fragment contribution
    over the Morgan fingerprint), a complexity penalty (size, stereo centers,
    spiro atoms, bridgeheads, macrocycles) and a symmetry correction. It is
    then mapped onto, and clamped to, the range 1.0 .. 10.0; a lower raw
    score gives a higher returned value.

    Loads the fragment scores with readFragmentScores() on first use.
    Raises ZeroDivisionError if the fingerprint of ``m`` has no set bits.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        # fragments missing from the table get a contribution of -4
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    # rings with more than 8 atoms are treated as macrocycles
    nMacrocycles = sum(1 for x in ri.AtomRings() if len(x) > 8)

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    rawMin = -4.0
    rawMax = 2.5
    sascore = 11. - (sascore - rawMin + 1) / (rawMax - rawMin) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore


def processMols(mols, outf):
    """Print a tab-separated table (smiles, name, score) for ``mols`` to stdout.

    Arguments:
      - mols: iterable of molecules; None entries are skipped. Every molecule
              must carry a '_Name' property.
      - outf: currently unused; kept so existing callers keep working.
    """
    print('smiles\tName\tsa_score')
    for m in mols:
        if m is None:
            continue

        s = calculateScore(m)

        smiles = Chem.MolToSmiles(m)
        # NOTE: "%3f" is a minimum field width of 3 with the default six
        # decimals (not "%.3f"); left as is so the output format is unchanged.
        print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)


if __name__ == '__main__':
    import sys
    import gzip
    import time

    if len(sys.argv) < 2:
        sys.exit('usage: sascorer.py <smiles file>')

    outf = None

    t1 = time.time()
    readFragmentScores("fpscores")
    t2 = time.time()

    suppl = Chem.SmilesMolSupplier(sys.argv[1])
    t3 = time.time()
    processMols(suppl, outf)
    t4 = time.time()

    # timing report goes to stderr so it does not end up in the score table
    sys.stderr.write('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)) + '\n')


#
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
#   nor the names of its contributors may be used to endorse or promote
#   products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#