Files
rdkit/Contrib/RxnRoleAssignment/identifyReactants.py
Ric 880a8e5725 Reformat Python code for 2023.03 release (#6294)
* run yapf

* run isort

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
2023-04-28 06:53:56 +02:00

461 lines
19 KiB
Python

#
# Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Nadine Schneider, July 2016
import itertools
from collections import Counter, defaultdict
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, rdqueries
from . import utils
class MoleculeDetails(object):
__slots__ = [
'detailFP', 'scaffoldFP', 'bitInfoDetailFP', 'bitInfoScaffoldFP', 'reactivity', 'bitReactivity',
'molecule'
]
def _atomDetailInvariant(self, mol):
mol.UpdatePropertyCache(False)
num_atoms = mol.GetNumAtoms()
Chem.GetSSSR(mol)
rinfo = mol.GetRingInfo()
invariants = [0] * num_atoms
for i, a in enumerate(mol.GetAtoms()):
descriptors = []
descriptors.append(a.GetAtomicNum())
descriptors.append(a.GetTotalDegree())
descriptors.append(a.GetTotalNumHs())
descriptors.append(rinfo.IsAtomInRingOfSize(a.GetIdx(), 6))
descriptors.append(rinfo.IsAtomInRingOfSize(a.GetIdx(), 5))
descriptors.append(a.IsInRing())
descriptors.append(a.GetIsAromatic())
invariants[i] = hash(tuple(descriptors)) & 0xffffffff
return invariants
def _atomScaffoldInvariant(self, mol):
num_atoms = mol.GetNumAtoms()
invariants = [0] * num_atoms
for i, a in enumerate(mol.GetAtoms()):
descriptors = []
descriptors.append(a.GetAtomicNum())
invariants[i] = hash(tuple(descriptors)) & 0xffffffff
return invariants
def _createFP(self, mol, invariant, bitinfo, useBondTypes=True, radius=1):
return AllChem.GetMorganFingerprint(mol=mol, radius=radius, invariants=invariant,
useBondTypes=useBondTypes, bitInfo=bitinfo)
def _isHeteroAtom(self, a):
return a.GetAtomicNum() not in (6, 1)
def _isSp3OrAromaticCarbon(self, a):
if a.GetAtomicNum() != 6:
return False
if a.GetIsAromatic():
return True
for b in a.GetBonds():
if b.GetBondTypeAsDouble() > 1.5:
return False
return True
def _calcReactivityAtom(self, a):
# exclude sp3 carbons or uncharged single heavy atoms such as water molecules
if self._isSp3OrAromaticCarbon(a) or (len(a.GetNeighbors()) == 0 and a.GetFormalCharge() == 0):
return 0
# all other atoms have at least a reactivity of one
reactivity = 1
b = a.GetBonds()
# if it is a heteroatom or has an H (we already know it's not SP3 or aromatic) increase the reactivity
if self._isHeteroAtom(a) or a.GetTotalNumHs() > 0:
reactivity += 1
# slightly increase reactivity for atoms in aromatic rings compared to aliphatic rings
if a.IsInRing():
if a.GetIsAromatic():
reactivity += 0.5
# but prefer non-ring atoms
else:
reactivity += 1
# increase reactivity of charged atoms
if a.GetFormalCharge():
reactivity += 2
for bo in b:
# look at the direct neighbors of the atom
ni = bo.GetOtherAtom(a)
# for non-single bonds increase the reactivity
if bo.GetBondTypeAsDouble() > 1.5:
reactivity += 1
# if there are hydrogens attached, increase the reactivity
if ni.GetTotalNumHs() > 0:
reactivity += 1
# if it is a bond to a hetero atom further increase the reactivity
if self._isHeteroAtom(ni):
reactivity += 1
# bonds between nitrogens and oxygen or between oxygen and oxygen or between nitrogen and nitrogen are more reactive
if a.GetAtomicNum() in (7, 8) and ni.GetAtomicNum() in (7, 8):
reactivity += 2
# if the neighbor is a Mg, Si, P, Pd, or Sn atom increase the reactivity
elif ni.GetAtomicNum() in (12, 14, 15, 46, 50):
reactivity += 1
return reactivity
def _calcReactivityMolecule(self, mol):
reactivityAtoms = [self._calcReactivityAtom(a) for a in mol.GetAtoms()]
return reactivityAtoms
def __init__(self, molecule, verbose=0):
self.molecule = molecule
self.bitInfoDetailFP = {}
self.detailFP = self._createFP(molecule, self._atomDetailInvariant(molecule),
self.bitInfoDetailFP)
self.bitInfoScaffoldFP = {}
self.scaffoldFP = self._createFP(molecule, self._atomScaffoldInvariant(molecule),
self.bitInfoScaffoldFP, useBondTypes=False)
reactivityAtoms = self._calcReactivityMolecule(molecule)
reactivity = sum(reactivityAtoms)
if Chem.MolToSmiles(molecule) in frequentReagents:
reactivity *= 0.8
self.reactivity = reactivity
def _calcScore(reactantFP, productFP, bitInfoProd=None, output=False):
if output:
print("--- _calcScore ---")
score = 0
dFP = productFP - reactantFP
numRBits = float(utils.getNumPositiveCounts(reactantFP))
if output > 2:
print("num RBits: ", numRBits)
numPBits = float(utils.getNumPositiveCounts(productFP))
if output > 2:
print("num PBits: ", numPBits)
numUnmappedPBits = float(utils.getNumPositiveCounts(dFP))
if output > 2:
print("num UnmappedPBits: ", numUnmappedPBits)
numUnmappedRBits = float(utils.getNumNegativeCounts(dFP))
if output > 2:
print("num UnmappedRBits: ", numUnmappedRBits)
numUnmappedPAtoms = -1
bitsUnmappedPAtoms = -1
if bitInfoProd is not None:
numUnmappedPAtoms, bitsUnmappedPAtoms = utils.getNumPositiveBitCountsOfRadius0(dFP, bitInfoProd)
if output > 2:
print("num UnmappedPAtoms: ", numUnmappedPAtoms)
ratioMappedPBits = 1 - (numUnmappedPBits / numPBits)
ratioUnmappedRBits = numUnmappedRBits / numRBits
score = max(ratioMappedPBits - ratioUnmappedRBits * ratioUnmappedRBits, 0)
if output > 1:
print("score: ", score, "(", ratioMappedPBits, ",", ratioUnmappedRBits * ratioUnmappedRBits,
",", ratioUnmappedRBits, ")")
return [score, numUnmappedPBits, numUnmappedPAtoms, bitsUnmappedPAtoms]
# Set of frequent reagents derived from all patent reactions
frequentReagents = set([
'CCN(CC)CC', '[Li+]', '[Na+]', 'O=C(O)CC(O)(CC(=O)O)C(=O)O', 'O=S(=O)(O)O', 'CN1CCCC1=O',
'CCN(C(C)C)C(C)C', 'c1ccncc1', '[K]', 'CC(C)(C)O', 'CCO', 'Cc1ccc(S(=O)(=O)O)cc1',
'ClC(Cl)(Cl)Cl', '[Na]', 'CC(C)(C)[O-]', 'O=C([O-])O', 'COCCOC', '[NH4+]', 'CC(C)OC(C)C',
'O=C([O-])[O-]', 'CC(=O)OC(C)=O', 'O=C=O', '[Cl-]', 'c1ccc(P(c2ccccc2)c2ccccc2)cc1', '[H-]',
'N#N', 'CN1CCOCC1', 'C1COCCO1', 'c1ccccc1', '[Cs+]', '[K+]', '[OH-]', 'CCCCCC', 'CCCCC',
'CN(C)C=O', 'C[O-]', 'Cc1ccccc1', 'C1CCC2=NCCCN2CC1', 'CO', 'CCCCO', 'O=C(O)C(F)(F)F',
'O=P([O-])([O-])[O-]', 'CCOC(C)=O', '[Mg+2]', 'C1CCCCC1', 'O', 'N', 'II', 'O=CO', 'CC(=O)N(C)C',
'CC(=O)O', 'CCOCC', 'CC(C)O', 'C[Si](C)(C)Cl', 'Cc1ccccc1C', 'CC(C)=O', 'CS(=O)(=O)O',
'CN(C)c1ccncc1', 'Cl', 'ClCCCl', 'O=S(Cl)Cl', 'ClC(Cl)Cl', '[Li]CCCC', '[Pd]', '[H][H]', '[Br-]',
'CS(C)=O', 'COC(C)(C)C', 'O=S(=O)([O-])[O-]', 'CC(Cl)Cl', 'CC(=O)[O-]',
'CCCC[N+](CCCC)(CCCC)CCCC', 'ClCCl', 'CC#N', 'C1CCOC1', 'CCCCCCC'
])
def _getBestCombination(rfps, pfps, output=False):
if output:
print("--- _getBestCombination ---")
tests = []
numReactants = len(rfps)
# generate first all reactant combinations
for i in range(1, numReactants + 1):
for x in itertools.combinations(range(numReactants), i):
temp = []
for j in x:
# don't include frequent reagents
if not rfps[j][1]:
numAtms = rfps[j][0].molecule.GetNumAtoms()
# not test single ions
if numAtms > 1:
# store the number of reactant atoms for later
temp.append((rfps[j][0].molecule.GetNumAtoms(), j))
else:
if output > 3:
print("Frequent reagent found: ", j)
if temp not in tests:
tests.append(temp)
# initialisation of the results
maxScore = 0
maxDetailScore = 0
finalReacts = [[]]
# get the product fingerprints
productsDetailFP = utils.getSumFps([i.detailFP for i in pfps])
productsScaffoldFP = utils.getSumFps([i.scaffoldFP for i in pfps])
# get the number of atoms for the product
numProductAtoms = 0
for i in pfps:
numProductAtoms += i.molecule.GetNumAtoms()
# get the bitinfo for the product FP
productsDetailFPBitInfo = {}
productsScaffoldFPBitInfo = {}
for i in pfps:
productsDetailFPBitInfo.update(i.bitInfoDetailFP)
productsScaffoldFPBitInfo.update(i.bitInfoScaffoldFP)
# set some initial values
numUnmappedPAtoms, bitsUnmappedPAtoms = utils.getNumPositiveBitCountsOfRadius0(
productsScaffoldFP, productsScaffoldFPBitInfo)
finalNumUnmappedProdAtoms = [[
len(productsDetailFP.GetNonzeroElements()),
len(productsScaffoldFP.GetNonzeroElements()), numUnmappedPAtoms, bitsUnmappedPAtoms
]]
for test in tests:
if len(test) < 1:
continue
# get the number of involved reactant atoms
numReactantAtoms = np.array(test)[:, 0].sum()
# ignore combinations including too many or too few atoms
if numReactantAtoms > 5 * numProductAtoms or numReactantAtoms < numProductAtoms * 0.8:
continue
if output > 0:
print("Combination: ", test)
#build the combined reactant FPs
reactantsDetailFP = utils.getSumFps([rfps[i[1]][0].detailFP for i in test])
reactantsScaffoldFP = utils.getSumFps([rfps[i[1]][0].scaffoldFP for i in test])
# get the scores for both FPs
detailFPScore = _calcScore(reactantsDetailFP, productsDetailFP,
bitInfoProd=productsDetailFPBitInfo, output=output)
scaffoldFPScore = _calcScore(reactantsScaffoldFP, productsScaffoldFP,
bitInfoProd=productsScaffoldFPBitInfo, output=output)
# final score
score = detailFPScore[0] + scaffoldFPScore[0]
if output > 0:
print(">>>> score: ", score)
print(">>>> scores (detail, scaffold): ", detailFPScore[0], scaffoldFPScore[0])
print(">>>> num unmapped productFP bits: ", detailFPScore[1], scaffoldFPScore[1],
detailFPScore[2], scaffoldFPScore[2])
if score > maxScore:
maxScore = score
maxDetailScore = detailFPScore[0]
del finalReacts[:]
del finalNumUnmappedProdAtoms[:]
# set the final reactants
finalReacts.append([i[1] for i in test])
# for tracking the mapping of the product atoms include the number of unmapped detailedFP bits, the number of unmapped
# atoms based on the scaffold FP, the number of unmapped scaffoldFP bits, and the unmapped scaffoldFP bits
finalNumUnmappedProdAtoms.append(
[detailFPScore[1], scaffoldFPScore[2], scaffoldFPScore[1], scaffoldFPScore[-1]])
if output > 0:
print(" >> maxScore: ", maxScore)
print(" >> Final reactants: ", finalReacts)
# test for almost perfect matchings (e.g. oxidations, reduction etc.)
if scaffoldFPScore[0] > 0.9999 and detailFPScore[0] > 0.8:
return finalReacts, finalNumUnmappedProdAtoms
# test for number of mapped product atoms e.g. to capture deprotections earlier
if len(finalNumUnmappedProdAtoms) > 0 and len(test) == 1:
if finalNumUnmappedProdAtoms[0][1] == 0 and finalNumUnmappedProdAtoms[0][0] <= 3:
return finalReacts, finalNumUnmappedProdAtoms
# include alternative solutions
elif abs(score - maxScore) < 0.0000001 and score > 0.0:
finalReacts.append([i[1] for i in test])
finalNumUnmappedProdAtoms.append(
[detailFPScore[1], scaffoldFPScore[2], scaffoldFPScore[1], scaffoldFPScore[-1]])
if output > 0:
print(" >> Added alternative result")
print(" >> Final reactants: ", finalReacts)
return finalReacts, finalNumUnmappedProdAtoms
def _findMissingReactiveReactants(rfps, pfps, currentReactants, unmappedPAtoms, output=False):
if output:
print("--- _findMissingReactiveReactants ---")
if not len(unmappedPAtoms):
return currentReactants
# if there are unmapped product bits find possible reactants for those
else:
finalReactants = []
numReactants = len(rfps)
# investigate all possible solutions of the scoring before
for reacts, umPA in zip(currentReactants, unmappedPAtoms):
# if there are unmapped product atoms find possible reactants for those
finalReactants.append(reacts)
if umPA[1] > 0:
remainingReactants = set(range(numReactants)).difference(set(reacts))
# sort the possible reactants by the reactivity
remainingReactants = sorted(
remainingReactants,
key=lambda x: rfps[x].reactivity / float(rfps[x].molecule.GetNumAtoms()), reverse=True)
missingPAtoms = []
# get the missing atoms and counts
for bit, c in umPA[-1]:
for pbi in range(len(pfps)):
if bit in pfps[pbi].bitInfoScaffoldFP:
a = pfps[pbi].bitInfoScaffoldFP[bit][0]
missingPAtoms.extend([pfps[pbi].molecule.GetAtomWithIdx(a[0]).GetAtomicNum()] * c)
missingPAtoms = Counter(missingPAtoms)
if output > 0:
print(missingPAtoms)
# build queries for the missing atoms
queries = [(rdqueries.AtomNumEqualsQueryAtom(a), a) for a in missingPAtoms]
maxFullfilledQueries = 0
maxReactivity = -1
addReactants = []
# search for the most reactive reactants capturing all/most of the unmapped product atoms
for r in remainingReactants:
if output > 0:
print(" >> Reactant", r, rfps[r].reactivity / float(rfps[r].molecule.GetNumAtoms()))
countFullfilledQueries = 0
for q, a in queries:
if len(rfps[r].molecule.GetAtomsMatchingQuery(q)) >= missingPAtoms[a]:
countFullfilledQueries += 1
if output > 0:
print(" Max reactivity", maxReactivity)
print(" Max fulfilled queries", maxFullfilledQueries)
if countFullfilledQueries > maxFullfilledQueries:
maxFullfilledQueries = countFullfilledQueries
maxReactivity = rfps[r].reactivity / float(rfps[r].molecule.GetNumAtoms())
addReactants = [r]
elif maxFullfilledQueries and countFullfilledQueries == maxFullfilledQueries and \
rfps[r].reactivity/float(rfps[r].molecule.GetNumAtoms()) >= maxReactivity:
maxFullfilledQueries = countFullfilledQueries
addReactants.append(r)
if output > 0:
print(" Added reactants", addReactants)
finalReactants[-1].extend(addReactants)
if output > 0:
print(" >> Final reactants", finalReactants)
return finalReactants
def _detectObviousReagents(reactants, products):
unchangedReacts = set()
unchangedProds = set()
for i, r in enumerate(reactants):
for j, p in enumerate(products):
if r == p:
unchangedReacts.add(i)
unchangedProds.add(j)
return unchangedReacts, unchangedProds
def identifyReactants(reaction, output=False):
rxn = AllChem.ChemicalReaction(reaction)
AllChem.RemoveMappingNumbersFromReactions(rxn)
if output:
print("--- identifyReactants ---")
reactants = rxn.GetReactants()
products = rxn.GetProducts()
### Preprocessing
uniqueReactants, reactantSmiles = utils.uniqueMolecules(reactants)
uniqueProducts, productSmiles = utils.uniqueMolecules(products)
# find molecules which do not change in the rxn
unmodifiedReactants, unmodifiedProducts = _detectObviousReagents(reactantSmiles, productSmiles)
if output:
print(" >>> Found reagents in reactants:", unmodifiedReactants)
print(" >>> Found reagents in products:", unmodifiedProducts)
if len(products) == len(unmodifiedProducts):
unmodifiedProducts = set()
uniquePotentialReactants = [r for r in sorted(set(uniqueReactants.values()))]
uniquePotentialProducts = [
p for p in sorted(set(uniqueProducts.values())) if p not in unmodifiedProducts
]
### Find the most probable reactants
# only generate moleculeDetail objects for unique, potential reactants and products
rfps = [MoleculeDetails(reactants[r]) for r in uniquePotentialReactants]
pfps = [MoleculeDetails(products[p]) for p in uniquePotentialProducts]
rfpsPrep = [(MoleculeDetails(reactants[r]), reactantSmiles[r] in frequentReagents)
for r in uniquePotentialReactants]
reacts, unmappedProdAtoms = _getBestCombination(rfpsPrep, pfps, output=output)
# no reactants where found try again including the frequent reagents
if np.array(reacts).shape == (1, 0):
rfpsPrep = [(MoleculeDetails(reactants[r]), 0) for r in uniquePotentialReactants]
reacts, unmappedProdAtoms = _getBestCombination(rfpsPrep, pfps, output=output)
### Postprocessing
# identify missing reactants
reacts = _findMissingReactiveReactants(rfps, pfps, reacts, unmappedProdAtoms, output=output)
finalreacts = []
for i in reacts:
temp = [uniquePotentialReactants[j] for j in i]
finalreacts.append(set(temp))
return finalreacts, unmodifiedReactants, unmodifiedProducts
# reassign the reaction roles of a reaction
def reassignRXNRoles(rxn):
utils.transferAgentsToReactants(rxn)
reacts, rAgents, pAgents = identifyReactants(rxn)
if len(reacts) < 1:
return None
new_rxn = AllChem.ChemicalReaction()
for i in range(rxn.GetNumProductTemplates()):
new_rxn.AddProductTemplate(rxn.GetProductTemplate(i))
for i in range(rxn.GetNumReactantTemplates()):
if i in reacts[0]:
new_rxn.AddReactantTemplate(rxn.GetReactantTemplate(i))
else:
new_rxn.AddAgentTemplate(rxn.GetReactantTemplate(i))
return new_rxn
# clean-up the reaction smiles
def reassignReactionRoles(smi):
rxn = AllChem.ReactionFromSmarts(smi, useSmiles=True)
new_rxn = reassignRXNRoles(rxn)
if new_rxn is None:
return ''
smi_new = AllChem.ReactionToSmiles(new_rxn)
return smi_new