File: indexing.py

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
links: PTS, VCS
area: main
in suites: buster
size: 123,688 kB
sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (530 lines) | stat: -rw-r--r-- 18,918 bytes
# Copyright (c) 2012, GlaxoSmithKline Research & Development Ltd.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of GlaxoSmithKline Research & Development Ltd.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Jameed Hussain, September 2012
from __future__ import print_function
import sys
import re
from rdkit import Chem
from optparse import OptionParser


def heavy_atom_count(smi):

  m = Chem.MolFromSmiles(smi)
  return m.GetNumAtoms()


def add_to_index(smi, attachments, cmpd_heavy):

  result = False
  core_size = heavy_atom_count(smi) - attachments

  if (use_ratio):
    core_ratio = float(core_size) / float(cmpd_heavy)
    if (core_ratio <= ratio):
      result = True
  else:
    if (core_size <= max_size):
      result = True

  return result


def get_symmetry_class(smi):
  symmetry = []

  m = Chem.MolFromSmiles(smi)
  symmetry_classes = Chem.CanonicalRankAtoms(m, breakTies=False)

  #get the symmetry class of the attachements points
  #Note: 1st star is the zero index,
  #2nd star is first index, etc
  for atom, symmetry_class in zip(m.GetAtoms(), symmetry_classes):
    if (atom.GetMass() == 0):
      symmetry.append(symmetry_class)

  return symmetry


def cansmirk(lhs, rhs, context):

  #cansmirk algorithm
  #1) cansmi the LHS.
  #2) For the LHS the 1st star will have label 1, 2nd star will have label 2 and so on
  #3) Do a symmetry check of lhs and rhs and use that to decide if the labels on
  #   RHS or/and context need to change.
  #4) For the rhs, if you have a choice (ie. two attachement points are symmetrically
  #   equivalent), always put the label with lower numerical value on the earlier
  #   attachement point on the cansmi-ed smiles

  #print "in: %s,%s" % (lhs,rhs)

  isotope_track = {}
  #if the star count of lhs/context/rhs is 1, single cut
  stars = lhs.count("*")

  if (stars > 1):
    #get the symmetry class of stars of lhs and rhs
    lhs_sym = get_symmetry_class(lhs)
    rhs_sym = get_symmetry_class(rhs)

  #deal with double cuts
  if (stars == 2):
    #simple cases
    #unsymmetric lhs and unsymmetric rhs
    if ((lhs_sym[0] != lhs_sym[1]) and (rhs_sym[0] != rhs_sym[1])):
      #get 1st and 2nd labels and store the new label for it in isotope_track
      #structure: isotope_track[old_label]=new_label (as strings)
      isotope_track = build_track_dictionary(lhs, stars)

      #switch labels using isotope track
      lhs = switch_labels_on_position(lhs)
      rhs = switch_labels(isotope_track, stars, rhs)
      context = switch_labels(isotope_track, stars, context)

    #symmetric lhs and symmetric rhs
    elif ((lhs_sym[0] == lhs_sym[1]) and (rhs_sym[0] == rhs_sym[1])):
      #the points are all equivalent so change labels on lhs and rhs based on position
      #labels on context don't need to change
      lhs = switch_labels_on_position(lhs)
      rhs = switch_labels_on_position(rhs)

    #more difficult cases..
    #symmetric lhs and unsymmetric rhs
    elif ((lhs_sym[0] == lhs_sym[1]) and (rhs_sym[0] != rhs_sym[1])):
      #switch labels lhs based on position
      lhs = switch_labels_on_position(lhs)
      #change labels on rhs based on position but need to record
      #the changes as need to appy them to the context
      isotope_track = build_track_dictionary(rhs, stars)
      rhs = switch_labels_on_position(rhs)
      context = switch_labels(isotope_track, stars, context)

    #unsymmetric lhs and symmetric rhs
    elif ((lhs_sym[0] != lhs_sym[1]) and (rhs_sym[0] == rhs_sym[1])):
      #change labels on lhs based on position but need to record
      #the changes as need to appy them to the context
      isotope_track = build_track_dictionary(lhs, stars)
      lhs = switch_labels_on_position(lhs)
      context = switch_labels(isotope_track, stars, context)
      #as rhs is symmetric, positions are equivalent so change labels on position
      rhs = switch_labels_on_position(rhs)

    #deal with triple cut
    #unwieldy code but most readable I can make it
  elif (stars == 3):
    #simple cases
    #completely symmetric lhs and completely symmetric rhs
    if (((lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and
         (lhs_sym[0] == lhs_sym[2])) and
        ((rhs_sym[0] == rhs_sym[1]) and (rhs_sym[1] == rhs_sym[2]) and (rhs_sym[0] == rhs_sym[2]))):
      #the points are all equivalent so change labels on lhs and rhs based on position
      #labels on context don't need to change
      lhs = switch_labels_on_position(lhs)
      rhs = switch_labels_on_position(rhs)

    #completely symmetric lhs and completely unsymmetric rhs
    elif (
      ((lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and (lhs_sym[0] == lhs_sym[2])) and
      ((rhs_sym[0] != rhs_sym[1]) and (rhs_sym[1] != rhs_sym[2]) and (rhs_sym[0] != rhs_sym[2]))):

      #alter lhs in usual way
      lhs = switch_labels_on_position(lhs)
      #change labels on rhs based on position but need to record
      #the changes as need to appy them to the context
      isotope_track = build_track_dictionary(rhs, stars)
      rhs = switch_labels_on_position(rhs)
      context = switch_labels(isotope_track, stars, context)

    #completely unsymmetric lhs and completely unsymmetric rhs
    elif (
      ((lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2])) and
      ((rhs_sym[0] != rhs_sym[1]) and (rhs_sym[1] != rhs_sym[2]) and (rhs_sym[0] != rhs_sym[2]))):

      #build the isotope track
      isotope_track = build_track_dictionary(lhs, stars)
      #alter lhs in usual way
      lhs = switch_labels_on_position(lhs)
      #change rhs and context based on isotope_track
      rhs = switch_labels(isotope_track, stars, rhs)
      context = switch_labels(isotope_track, stars, context)

    #completely unsymmetric lhs and completely symmetric rhs
    elif (
      ((lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2])) and
      ((rhs_sym[0] == rhs_sym[1]) and (rhs_sym[1] == rhs_sym[2]) and (rhs_sym[0] == rhs_sym[2]))):

      #build isotope trach on lhs
      isotope_track = build_track_dictionary(lhs, stars)
      #alter lhs in usual way
      lhs = switch_labels_on_position(lhs)
      #change labels on context
      context = switch_labels(isotope_track, stars, context)
      #all positions on rhs equivalent so add labels on position
      rhs = switch_labels_on_position(rhs)

    #more difficult cases, partial symmetry
    #completely unsymmetric on lhs and partial symmetry on rhs
    elif ((lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2])):
      #build the isotope track
      isotope_track = build_track_dictionary(lhs, stars)
      #alter lhs in usual way
      lhs = switch_labels_on_position(lhs)
      #change rhs and context based on isotope_track
      rhs = switch_labels(isotope_track, stars, rhs)
      context = switch_labels(isotope_track, stars, context)

      #tweak positions on rhs based on symmetry
      #rhs 1,2 equivalent
      if (rhs_sym[0] == rhs_sym[1]):
        #tweak rhs position 1 and 2 as they are symmetric
        rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 1, 2)

      #rhs 2,3 equivalent
      elif (rhs_sym[1] == rhs_sym[2]):
        #tweak rhs position 1 and 2 as they are symmetric
        rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 2, 3)

      #rhs 1,3 equivalent - try for larger set in future
      elif (rhs_sym[0] == rhs_sym[2]):
        #tweak rhs position 1 and 2 as they are symmetric
        rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 1, 3)

      #now we are left with things with partial symmetry on lhs and not completely symmetric or unsymmetric on rhs
    else:
      #lhs 1,2,3 equivalent and any sort of partial symmetry on rhs
      if ((lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and (lhs_sym[0] == lhs_sym[2])):

        #alter lhs in usual way
        lhs = switch_labels_on_position(lhs)
        #change labels on rhs based on position but need to record
        #the changes as need to appy them to the context
        isotope_track = build_track_dictionary(rhs, stars)
        rhs = switch_labels_on_position(rhs)
        context = switch_labels(isotope_track, stars, context)

      #now deal partial symmetry on lhs or rhs.
      #Cases where:
      #lhs 1,2 equivalent
      #lhs 2,3 equivalent
      #lhs 1,3 equivalent
      else:
        #build isotope track on lhs
        isotope_track = build_track_dictionary(lhs, stars)
        #alter lhs in usual way
        lhs = switch_labels_on_position(lhs)
        #change rhs and context based on isotope_track
        rhs = switch_labels(isotope_track, stars, rhs)
        context = switch_labels(isotope_track, stars, context)

        #tweak positions on rhs based on symmetry

        #lhs 1,2 equivalent
        if (lhs_sym[0] == lhs_sym[1]):
          #tweak rhs position 1 and 2 as they are symmetric on lhs
          rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 1, 2)

        #lhs 2,3 equivalent
        elif (lhs_sym[1] == lhs_sym[2]):
          #tweak rhs position 1 and 2 as they are symmetric on lhs
          rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 2, 3)

        #lhs 1,3 equivalent - try for larger set in future
        elif (lhs_sym[0] == lhs_sym[2]):
          #tweak rhs position 1 and 2 as they are symmetric on lhs
          rhs = switch_specific_labels_on_symmetry(rhs, rhs_sym, 1, 3)

  smirk = "%s>>%s" % (lhs, rhs)

  return smirk, context


def switch_specific_labels_on_symmetry(smi, symmetry_class, a, b):

  #check if a and b positions are symmetrically equivalent
  #if equivalent, swap labels if the lower numerical label is not on the
  #1st symmetrically equivalent attachment points in the smi

  if (symmetry_class[a - 1] == symmetry_class[b - 1]):
    #what are the labels on a and b

    matchObj = re.search(r'\[\*\:([123])\].*\[\*\:([123])\].*\[\*\:([123])\]', smi)
    if matchObj:
      #if the higher label comes first, fix
      if (int(matchObj.group(a)) > int(matchObj.group(b))):
        #if(int(matchObj.group(1)) > int(matchObj.group(2))):
        smi = re.sub(r'\[\*\:' + matchObj.group(a) + '\]', '[*:XX' + matchObj.group(b) + 'XX]', smi)
        smi = re.sub(r'\[\*\:' + matchObj.group(b) + '\]', '[*:XX' + matchObj.group(a) + 'XX]', smi)
        smi = re.sub('XX', '', smi)

  return smi


def switch_labels_on_position(smi):

  #move the labels in order of position
  smi = re.sub(r'\[\*\:[123]\]', '[*:XX1XX]', smi, 1)
  smi = re.sub(r'\[\*\:[123]\]', '[*:XX2XX]', smi, 1)
  smi = re.sub(r'\[\*\:[123]\]', '[*:XX3XX]', smi, 1)
  smi = re.sub('XX', '', smi)

  return smi


def switch_labels(track, stars, smi):

  #switch labels based on the input dictionary track
  if (stars > 1):
    #for k in track:
    #        print "old: %s, new: %s" % (k,track[k])

    if (track['1'] != '1'):
      smi = re.sub(r'\[\*\:1\]', '[*:XX' + track['1'] + 'XX]', smi)

    if (track['2'] != '2'):
      smi = re.sub(r'\[\*\:2\]', '[*:XX' + track['2'] + 'XX]', smi)

    if (stars == 3):
      if (track['3'] != '3'):
        smi = re.sub(r'\[\*\:3\]', '[*:XX' + track['3'] + 'XX]', smi)

    #now remove the XX
    smi = re.sub('XX', '', smi)

  return smi


def build_track_dictionary(smi, stars):

  isotope_track = {}

  #find 1st label, record it in isotope_track as key, with value being the
  #new label based on its position (1st star is 1, 2nd star 2 etc.)
  if (stars == 2):
    matchObj = re.search(r'\[\*\:([123])\].*\[\*\:([123])\]', smi)
    if matchObj:
      isotope_track[matchObj.group(1)] = '1'
      isotope_track[matchObj.group(2)] = '2'

  elif (stars == 3):
    matchObj = re.search(r'\[\*\:([123])\].*\[\*\:([123])\].*\[\*\:([123])\]', smi)
    if matchObj:
      isotope_track[matchObj.group(1)] = '1'
      isotope_track[matchObj.group(2)] = '2'
      isotope_track[matchObj.group(3)] = '3'

  return isotope_track


def index_hydrogen_change():
  #Algorithm details
  #have an index of common fragment(key) => fragments conected to it (values)
  #Need to add *-H to the values where appropriate - and its
  #appropriate when the key is what you would get if you chopped a H off a cmpd.
  #Therefore simply need to check if key with the * replaced with a H is
  #the same as any full smiles in the set
  #
  #Specific details:
  #1) Loop through keys of index
  #2) If key is the result of a single cut (so contains only 1 *) replace the * with H, and cansmi
  #3) If full smiles matches key in hash above, add *-H to that fragment index.

  for key in index:

    attachments = key.count('*')
    #print attachments

    if (attachments == 1):

      smi = key

      #simple method
      smi = re.sub(r'\[\*\:1\]', '[H]', smi)

      #now cansmi it
      temp = Chem.MolFromSmiles(smi)

      if (temp == None):
        sys.stderr.write('Error with key: %s, Added H: %s\n' % (key, smi))
      else:
        c_smi = Chem.MolToSmiles(temp, isomericSmiles=True)

        if (c_smi in smi_to_id):
          core = "[*:1][H]"
          id = smi_to_id[c_smi]

          value = "%s;t%s" % (id, core)
          #add to index
          index[key].append(value)


if __name__ == '__main__':

  #note max heavy atom count does not
  #include the attachement points (*)
  max_size = 10
  ratio = 0.3
  use_ratio = False

  index = {}
  smi_to_id = {}
  id_to_smi = {}

  id_to_heavy = {}

  #set up the command line options
  #parser = OptionParser()
  parser = OptionParser(description="Program to generate MMPs")
  parser.add_option(
    '-s', '--symmetric', default=False, action='store_true', dest='sym',
    help='Output symmetrically equivalent MMPs, i.e output both cmpd1,cmpd2, SMIRKS:A>>B and cmpd2,cmpd1, SMIRKS:B>>A')
  parser.add_option(
    '-m', '--maxsize', action='store', dest='maxsize', type='int',
    help='Maximum size of change (in heavy atoms) allowed in matched molecular pairs identified. DEFAULT=10. \
                      Note: This option overrides the ratio option if both are specified.')
  parser.add_option(
    '-r', '--ratio', action='store', dest='ratio', type='float',
    help='Maximum ratio of change allowed in matched molecular pairs identified. The ratio is: size of change / \
                      size of cmpd (in terms of heavy atoms). DEFAULT=0.3. Note: If this option is used with the maxsize option, the maxsize option will be used.')

  #parse the command line options
  (options, args) = parser.parse_args()

  #print options
  if (options.maxsize != None):
    max_size = options.maxsize
  elif (options.ratio != None):
    ratio = options.ratio
    if (ratio >= 1):
      print("Ratio specified: %s. Ratio needs to be less than 1.")
      sys.exit(1)
    use_ratio = True

  #read the STDIN
  for line in sys.stdin:

    line = line.rstrip()
    smi, id, core, context = line.split(',')

    #fill in dictionaries
    smi_to_id[smi] = id
    id_to_smi[id] = smi

    #if using the ratio option, check if heavy atom
    #of mol already calculated. If not, calculate and store
    cmpd_heavy = None
    if (use_ratio):
      if ((id in id_to_heavy) == False):
        id_to_heavy[id] = heavy_atom_count(smi)

      cmpd_heavy = id_to_heavy[id]

    #deal with cmpds that have not been fragmented
    if (len(core) == 0) and (len(context) == 0):
      continue

    #deal with single cuts
    if (len(core) == 0):
      side_chains = context.split('.')

      #minus 1 for the attachement pt
      if (add_to_index(side_chains[1], 1, cmpd_heavy) == True):
        context = side_chains[0]
        core = side_chains[1]

        value = "%s;t%s" % (id, core)

        #add the array if no key exists
        #add the context with id to index
        index.setdefault(context, []).append(value)

      #minus 1 for the attachement pt
      if (add_to_index(side_chains[0], 1, cmpd_heavy) == True):
        context = side_chains[1]
        core = side_chains[0]

        value = "%s;t%s" % (id, core)

        #add the array if no key exists
        #add the context with id to index
        index.setdefault(context, []).append(value)

    #double or triple cut
    else:

      attachments = core.count('*')

      if (add_to_index(core, attachments, cmpd_heavy) == True):
        value = "%s;t%s" % (id, core)

        #add the array if no key exists
        #add the context with id to index
        index.setdefault(context, []).append(value)

  #index the H change
  index_hydrogen_change()

  #Now index is ready

  #loop through the index
  for key in index:

    total = len(index[key])

    #check if have more than one value
    if (total == 1):
      continue

    for xa in range(total):

      for xb in range(xa, total):

        if (xa != xb):
          #now generate the pairs

          id_a, core_a = index[key][xa].split(";t")
          id_b, core_b = index[key][xb].split(";t")

          #make sure pairs are not same molecule
          if (id_a != id_b):

            #make sure LHS and RHS of SMIRKS are not the same
            if (core_a != core_b):

              smirks, context = cansmirk(core_a, core_b, key)
              print("%s,%s,%s,%s,%s,%s" %
                    (id_to_smi[id_a], id_to_smi[id_b], id_a, id_b, smirks, context))

              #deal with symmetry switch
              if (options.sym == True):
                smirks, context = cansmirk(core_b, core_a, key)
                print("%s,%s,%s,%s,%s,%s" %
                      (id_to_smi[id_b], id_to_smi[id_a], id_b, id_a, smirks, context))