File: identifyReactants.py

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
links: PTS, VCS
area: main
in suites: buster
size: 123,688 kB
sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (433 lines) | stat: -rw-r--r-- 20,644 bytes
#
#  Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc.
#  All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: 
#
#     * Redistributions of source code must retain the above copyright 
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following 
#       disclaimer in the documentation and/or other materials provided 
#       with the distribution.
#     * Neither the name of Novartis Institutes for BioMedical Research Inc. 
#       nor the names of its contributors may be used to endorse or promote 
#       products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Nadine Schneider, July 2016



from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdqueries

from collections import defaultdict, Counter
import itertools
import numpy as np

from . import utils


class MoleculeDetails(object):
    
    __slots__ = ['detailFP','scaffoldFP','bitInfoDetailFP','bitInfoScaffoldFP','reactivity','bitReactivity','molecule']
    
    def _atomDetailInvariant(self, mol):
        mol.UpdatePropertyCache(False)
        num_atoms = mol.GetNumAtoms()
        Chem.GetSSSR(mol)
        rinfo = mol.GetRingInfo()
        invariants = [0]*num_atoms
        for i,a in enumerate(mol.GetAtoms()):
            descriptors=[]
            descriptors.append(a.GetAtomicNum())
            descriptors.append(a.GetTotalDegree())
            descriptors.append(a.GetTotalNumHs())
            descriptors.append(rinfo.IsAtomInRingOfSize(a.GetIdx(),6))
            descriptors.append(rinfo.IsAtomInRingOfSize(a.GetIdx(),5))
            descriptors.append(a.IsInRing())
            descriptors.append(a.GetIsAromatic())
            invariants[i]=hash(tuple(descriptors))& 0xffffffff
        return invariants

    def _atomScaffoldInvariant(self, mol):
        num_atoms = mol.GetNumAtoms()
        invariants = [0]*num_atoms
        for i,a in enumerate(mol.GetAtoms()):
            descriptors=[]
            descriptors.append(a.GetAtomicNum())
            invariants[i]=hash(tuple(descriptors))& 0xffffffff
        return invariants    
        
    def _createFP(self, mol, invariant, bitinfo, useBondTypes=True, radius=1):
        return AllChem.GetMorganFingerprint(mol=mol, radius=radius, invariants=invariant, useBondTypes=useBondTypes, bitInfo=bitinfo) 
    
    def _isHeteroAtom(self, a):
        return a.GetAtomicNum() not in (6, 1)

    def _isSp3OrAromaticCarbon(self, a):
        if a.GetAtomicNum() != 6:
            return False
        if a.GetIsAromatic():
            return True
        for b in a.GetBonds():
            if b.GetBondTypeAsDouble() > 1.5:
                return False
        return True

    def _calcReactivityAtom(self, a):
        # exclude sp3 carbons or uncharged single heavy atoms such as water molecules
        if self._isSp3OrAromaticCarbon(a) or (len(a.GetNeighbors())==0 and a.GetFormalCharge()==0):
            return 0
        # all other atoms have at least a reactivity of one
        reactivity=1
        b = a.GetBonds()
        # if it is a heteroatom or has an H (we already know it's not SP3 or aromatic) increase the reactivity 
        if self._isHeteroAtom(a) or a.GetTotalNumHs() > 0:
            reactivity += 1
        # slightly increase reactivity for atoms in aromatic rings compared to aliphatic rings
        if a.IsInRing():
            if a.GetIsAromatic():
                reactivity += 0.5
        # but prefer non-ring atoms
        else:
            reactivity += 1
        # increase reactivity of charged atoms
        if a.GetFormalCharge():
            reactivity += 2
        for bo in b:
            # look at the direct neighbors of the atom
            ni = bo.GetOtherAtom(a)
            # for non-single bonds increase the reactivity
            if bo.GetBondTypeAsDouble() > 1.5:
                reactivity += 1
                # if there are hydrogens attached, increase the reactivity
                if ni.GetTotalNumHs() > 0:
                    reactivity+=1
            # if it is a bond to a hetero atom further increase the reactivity
            if self._isHeteroAtom(ni):
                reactivity += 1
                # bonds between nitrogens and oxygen or between oxygen and oxygen or between nitrogen and nitrogen are more reactive
                if a.GetAtomicNum() in (7,8) and ni.GetAtomicNum() in (7,8):
                    reactivity += 2
                # if the neighbor is a Mg, Si, P, Pd, or Sn atom increase the reactivity 
                elif ni.GetAtomicNum() in (12,14,15,46,50):
                    reactivity += 1
        return reactivity 

    def _calcReactivityMolecule(self, mol):
        reactivityAtoms = [self._calcReactivityAtom(a) for a in mol.GetAtoms()]
        return reactivityAtoms

    def __init__(self, molecule, verbose=0):
        self.molecule= molecule
        self.bitInfoDetailFP={}
        self.detailFP = self._createFP(molecule, self._atomDetailInvariant(molecule), self.bitInfoDetailFP)
        self.bitInfoScaffoldFP={}
        self.scaffoldFP = self._createFP(molecule, self._atomScaffoldInvariant(molecule), self.bitInfoScaffoldFP, useBondTypes=False)
        reactivityAtoms = self._calcReactivityMolecule(molecule)
        reactivity = sum(reactivityAtoms)
        if Chem.MolToSmiles(molecule) in frequentReagents:
            reactivity*=0.8
        self.reactivity = reactivity
        
        
def _calcScore(reactantFP,productFP,bitInfoProd=None,output=False):
    if output:
        print("--- _calcScore ---")
    score=0
    dFP = productFP-reactantFP
    numRBits = float(utils.getNumPositiveCounts(reactantFP))
    if output > 2:
        print("num RBits: ",numRBits)
    numPBits = float(utils.getNumPositiveCounts(productFP))
    if output > 2:
        print("num PBits: ",numPBits)
    numUnmappedPBits = float(utils.getNumPositiveCounts(dFP))
    if output > 2:
        print("num UnmappedPBits: ",numUnmappedPBits)
    numUnmappedRBits = float(utils.getNumNegativeCounts(dFP))
    if output > 2:
        print("num UnmappedRBits: ",numUnmappedRBits)

    numUnmappedPAtoms=-1
    bitsUnmappedPAtoms=-1
    if bitInfoProd is not None:
        numUnmappedPAtoms,bitsUnmappedPAtoms = utils.getNumPositiveBitCountsOfRadius0(dFP,bitInfoProd)
        if output > 2:
            print("num UnmappedPAtoms: ", numUnmappedPAtoms)
    ratioMappedPBits = 1-(numUnmappedPBits/numPBits)
    ratioUnmappedRBits = numUnmappedRBits/numRBits
    score = max(ratioMappedPBits - ratioUnmappedRBits*ratioUnmappedRBits,0)

    if output > 1:
        print("score: ",score, "(",ratioMappedPBits,",",ratioUnmappedRBits*ratioUnmappedRBits,",",ratioUnmappedRBits,")")
        
    return [score,numUnmappedPBits,numUnmappedPAtoms,bitsUnmappedPAtoms]

# Set of frequent reagents derived from all patent reactions
frequentReagents = set(['CCN(CC)CC', '[Li+]', '[Na+]', 'O=C(O)CC(O)(CC(=O)O)C(=O)O', 'O=S(=O)(O)O', 'CN1CCCC1=O', 'CCN(C(C)C)C(C)C',\
 'c1ccncc1', '[K]', 'CC(C)(C)O', 'CCO', 'Cc1ccc(S(=O)(=O)O)cc1', 'ClC(Cl)(Cl)Cl', '[Na]', 'CC(C)(C)[O-]', 'O=C([O-])O', 'COCCOC', '[NH4+]',\
 'CC(C)OC(C)C', 'O=C([O-])[O-]', 'CC(=O)OC(C)=O', 'O=C=O', '[Cl-]', 'c1ccc(P(c2ccccc2)c2ccccc2)cc1', '[H-]', 'N#N', 'CN1CCOCC1',\
 'C1COCCO1', 'c1ccccc1', '[Cs+]', '[K+]', '[OH-]', 'CCCCCC', 'CCCCC', 'CN(C)C=O', 'C[O-]', 'Cc1ccccc1', 'C1CCC2=NCCCN2CC1', 'CO',\
 'CCCCO', 'O=C(O)C(F)(F)F', 'O=P([O-])([O-])[O-]', 'CCOC(C)=O', '[Mg+2]', 'C1CCCCC1', 'O', 'N', 'II', 'O=CO', 'CC(=O)N(C)C', 'CC(=O)O',\
 'CCOCC', 'CC(C)O', 'C[Si](C)(C)Cl', 'Cc1ccccc1C', 'CC(C)=O', 'CS(=O)(=O)O', 'CN(C)c1ccncc1', 'Cl', 'ClCCCl', 'O=S(Cl)Cl', 'ClC(Cl)Cl',\
 '[Li]CCCC', '[Pd]', '[H][H]', '[Br-]', 'CS(C)=O', 'COC(C)(C)C', 'O=S(=O)([O-])[O-]', 'CC(Cl)Cl', 'CC(=O)[O-]', 'CCCC[N+](CCCC)(CCCC)CCCC',\
 'ClCCl', 'CC#N', 'C1CCOC1', 'CCCCCCC'])


def _getBestCombination(rfps,pfps,output=False):

    if output:
        print("--- _getBestCombination ---")

    tests=[]
    numReactants=len(rfps)
    # generate first all reactant combinations
    for i in range(1,numReactants+1):
        for x in itertools.combinations(range(numReactants),i):
            temp=[]
            for j in x:
                # don't include frequent reagents
                if not rfps[j][1]:
                    numAtms = rfps[j][0].molecule.GetNumAtoms()
                    # not test single ions
                    if numAtms > 1:
                        # store the number of reactant atoms for later
                        temp.append((rfps[j][0].molecule.GetNumAtoms(),j))
                else:
                    if output > 3:
                        print("Frequent reagent found: ", j)
            if temp not in tests:
                tests.append(temp)
    # initalisation of the results
    maxScore=0
    maxDetailScore=0
    finalReacts=[[]]
    # get the product fingerprints
    productsDetailFP = utils.getSumFps([i.detailFP for i in pfps])
    productsScaffoldFP = utils.getSumFps([i.scaffoldFP for i in pfps])
    # get the number of atoms for the product
    numProductAtoms = 0
    for i in pfps:
        numProductAtoms += i.molecule.GetNumAtoms() 
    # get the bitinfo for the product FP
    productsDetailFPBitInfo={}
    productsScaffoldFPBitInfo={}
    for i in pfps:
        productsDetailFPBitInfo.update(i.bitInfoDetailFP)
        productsScaffoldFPBitInfo.update(i.bitInfoScaffoldFP)
    # set some initial values    
    numUnmappedPAtoms,bitsUnmappedPAtoms = utils.getNumPositiveBitCountsOfRadius0(productsScaffoldFP,productsScaffoldFPBitInfo)
    finalNumUnmappedProdAtoms=[[len(productsDetailFP.GetNonzeroElements()),\
                                len(productsScaffoldFP.GetNonzeroElements()),numUnmappedPAtoms,bitsUnmappedPAtoms]]

    for test in tests:
        if len(test) < 1:
            continue
        # get the number of involved reactant atoms
        numReactantAtoms = np.array(test)[:,0].sum()
        # ignore combinations including too many or too few atoms
        if numReactantAtoms > 5*numProductAtoms or numReactantAtoms < numProductAtoms*0.8:
            continue
            
        if output > 0:
            print("Combination: ",test)
            
        #build the combined reactant FPs
        reactantsDetailFP = utils.getSumFps([rfps[i[1]][0].detailFP for i in test])
        reactantsScaffoldFP = utils.getSumFps([rfps[i[1]][0].scaffoldFP for i in test])

        # get the scores for both FPs
        detailFPScore = _calcScore(reactantsDetailFP,productsDetailFP,bitInfoProd=productsDetailFPBitInfo,output=output)
        scaffoldFPScore = _calcScore(reactantsScaffoldFP,productsScaffoldFP,bitInfoProd=productsScaffoldFPBitInfo,output=output)
        # final score
        score = detailFPScore[0] + scaffoldFPScore[0]
            
        if output > 0:
            print(">>>> score: ", score)
            print(">>>> scores (detail, scaffold): ", detailFPScore[0], scaffoldFPScore[0])
            print(">>>> num unmapped productFP bits: ", detailFPScore[1], scaffoldFPScore[1], detailFPScore[2], scaffoldFPScore[2])

        if score > maxScore:
            maxScore=score
            maxDetailScore=detailFPScore[0]
            del finalReacts[:]
            del finalNumUnmappedProdAtoms[:]
            # set the final reactants
            finalReacts.append([i[1] for i in test])
            # for tracking the mapping of the product atoms include the number of unmapped detailedFP bits, the number of unmapped 
            # atoms based on the scaffold FP,  the number of unmapped scaffoldFP bits, and the unmapped scaffoldFP bits
            finalNumUnmappedProdAtoms.append([detailFPScore[1], scaffoldFPScore[2], scaffoldFPScore[1],scaffoldFPScore[-1]])
            if output > 0:
                print(" >> maxScore: ", maxScore)
                print(" >> Final reactants: ", finalReacts)
            # test for almost perfect matchings (e.g. oxidations, reduction etc.)
            if scaffoldFPScore[0] > 0.9999 and detailFPScore[0] > 0.8:
                return finalReacts, finalNumUnmappedProdAtoms
            # test for number of mapped product atoms e.g. to capture deprotections ealier 
            if len(finalNumUnmappedProdAtoms) > 0 and len(test) == 1:
                if finalNumUnmappedProdAtoms[0][1] == 0 and finalNumUnmappedProdAtoms[0][0] <= 3:
                    return finalReacts, finalNumUnmappedProdAtoms
        # include alternative solutions
        elif abs(score - maxScore) < 0.0000001 and score > 0.0:
            finalReacts.append([i[1] for i in test])
            finalNumUnmappedProdAtoms.append([detailFPScore[1], scaffoldFPScore[2], scaffoldFPScore[1],scaffoldFPScore[-1]])
            if output > 0:
                print(" >> Added alternative result")
                print(" >> Final reactants: ", finalReacts)
             
    return finalReacts, finalNumUnmappedProdAtoms

def _findMissingReactiveReactants(rfps, pfps, currentReactants, unmappedPAtoms, output=False):
    if output:
        print("--- _findMissingReactiveReactants ---")
    if not len(unmappedPAtoms):
        return currentReactants
    # if there are unmapped product bits find possible reactants for those
    else:
        finalReactants = []
        numReactants=len(rfps)
        # investigate all possible solutions of the scoring before
        for reacts,umPA in zip(currentReactants,unmappedPAtoms):
            # if there are unmapped product atoms find possible reactants for those
            finalReactants.append(reacts)
            if umPA[1] > 0:
                remainingReactants=set(range(numReactants)).difference(set(reacts))
                # sort the possible reactants by the reactivity
                remainingReactants = sorted(remainingReactants, key=lambda x: rfps[x].reactivity/float(rfps[x].molecule.GetNumAtoms()),\
                                            reverse=True)
                missingPAtoms = []
                # get the missing atoms and counts
                for bit,c in umPA[-1]:
                    for pbi in range(len(pfps)):
                        if bit in pfps[pbi].bitInfoScaffoldFP:
                            a = pfps[pbi].bitInfoScaffoldFP[bit][0]
                            missingPAtoms.extend([pfps[pbi].molecule.GetAtomWithIdx(a[0]).GetAtomicNum()]*c)
                missingPAtoms = Counter(missingPAtoms)
                if output >  0:
                    print(missingPAtoms)
                # build queries for the missing atoms
                queries=[(rdqueries.AtomNumEqualsQueryAtom(a),a) for a in missingPAtoms]
                maxFullfilledQueries=0
                maxReactivity=-1
                addReactants=[]
                # search for the most reactive reactants capturing all/most of the unmapped product atoms
                for r in remainingReactants:
                    if output > 0:
                        print(" >> Reactant", r, rfps[r].reactivity/float(rfps[r].molecule.GetNumAtoms()))
                    countFullfilledQueries=0
                    for q,a in queries:
                        if len(rfps[r].molecule.GetAtomsMatchingQuery(q)) >= missingPAtoms[a]:
                            countFullfilledQueries+=1
                    if output > 0:
                        print(" Max reactivity", maxReactivity)
                        print(" Max fullfilled queries", maxFullfilledQueries)
                    if countFullfilledQueries > maxFullfilledQueries:
                        maxFullfilledQueries = countFullfilledQueries
                        maxReactivity = rfps[r].reactivity/float(rfps[r].molecule.GetNumAtoms())
                        addReactants = [r]
                    elif maxFullfilledQueries and countFullfilledQueries == maxFullfilledQueries and \
                         rfps[r].reactivity/float(rfps[r].molecule.GetNumAtoms()) >= maxReactivity:
                        maxFullfilledQueries = countFullfilledQueries
                        addReactants.append(r)
                    if output > 0:
                        print(" Added reactants", addReactants)
                finalReactants[-1].extend(addReactants)
    if output > 0:
        print(" >> Final reactants", finalReactants)
    return finalReactants

def _detectObviousReagents(reactants, products):
    unchangedReacts=set()
    unchangedProds=set()
    for i,r in enumerate(reactants):
        for j,p in enumerate(products):
            if r==p:
                unchangedReacts.add(i)
                unchangedProds.add(j)
    return unchangedReacts,unchangedProds

def identifyReactants(reaction,output=False):
    rxn = AllChem.ChemicalReaction(reaction)
    AllChem.RemoveMappingNumbersFromReactions(rxn)
    if output:
        print("--- identifyReactants ---")
    reactants = rxn.GetReactants()
    products = rxn.GetProducts()
    ### Preprocessing
    uniqueReactants,reactantSmiles = utils.uniqueMolecules(reactants)
    uniqueProducts,productSmiles = utils.uniqueMolecules(products)
    # find molecules which do not change in the rxn
    unmodifiedReactants,unmodifiedProducts = _detectObviousReagents(reactantSmiles, productSmiles)
    if output:
        print("  >>> Found reagents in reactants:", unmodifiedReactants)
        print("  >>> Found reagents in products:", unmodifiedProducts)
    if len(products) == len(unmodifiedProducts):
        unmodifiedProducts=set()
    uniquePotentialReactants = [r for r in sorted(set(uniqueReactants.values()))]
    uniquePotentialProducts = [p for p in sorted(set(uniqueProducts.values())) if p not in unmodifiedProducts]
    
    ### Find the most probable reactants
    # only generate moleculeDetail objects for unique, potential reactants and products
    rfps = [MoleculeDetails(reactants[r]) for r in uniquePotentialReactants]
    pfps = [MoleculeDetails(products[p]) for p in uniquePotentialProducts]
    
    rfpsPrep = [(MoleculeDetails(reactants[r]),reactantSmiles[r] in frequentReagents) for r in uniquePotentialReactants]

    reacts, unmappedProdAtoms = _getBestCombination(rfpsPrep,pfps,output=output)
    # no reactants where found try again including the frequent reagents
    if np.array(reacts).shape == (1,0):
        rfpsPrep = [(MoleculeDetails(reactants[r]),0) for r in uniquePotentialReactants]
        reacts, unmappedProdAtoms = _getBestCombination(rfpsPrep,pfps,output=output)
    
    ### Postprocessing
    # identify missing reactants
    reacts = _findMissingReactiveReactants(rfps, pfps, reacts, unmappedProdAtoms, output=output)
    finalreacts = []
    for i in reacts:
        temp=[uniquePotentialReactants[j] for j in i]
        finalreacts.append(set(temp))
            
    return finalreacts, unmodifiedReactants, unmodifiedProducts

# reassign the reaction roles of a reaction
def reassignRXNRoles(rxn):
    utils.transferAgentsToReactants(rxn)
    reacts, rAgents, pAgents = identifyReactants(rxn)
    if len(reacts) < 1:
        return None
    new_rxn = AllChem.ChemicalReaction()
    for i in range(rxn.GetNumProductTemplates()):
        new_rxn.AddProductTemplate(rxn.GetProductTemplate(i))
    for i in range(rxn.GetNumReactantTemplates()):
        if i in reacts[0]:
            new_rxn.AddReactantTemplate(rxn.GetReactantTemplate(i))
        else:
            new_rxn.AddAgentTemplate(rxn.GetReactantTemplate(i))
    return new_rxn

# clean-up the reaction smiles
def reassignReactionRoles(smi):
    rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
    new_rxn = reassignRXNRoles(rxn)
    if new_rxn is None:
        return ''
    smi_new =  AllChem.ReactionToSmiles(new_rxn)
    return smi_new