# Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of GlaxoSmithKline Research & Development Ltd.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Jameed Hussain, May 2013
from __future__ import print_function
import sys
from optparse import OptionParser
from rdkit import Chem
from rdkit import DataStructs
from collections import defaultdict

#input format
#query_substructs,query_smiles,SMILES,ID,Tversky_sim

#algorithm
#read in query_substructs and smiles
#feed to atomcontrib function to return generalised_SMILES
#use Tanimoto to compare generalised_SMILES with query smiles to give fraggle similarity

from rdkit.Chem.Fraggle import FraggleSim

parser = OptionParser(
  description="Program to post-process Tversky search results as part of Fraggle",
  epilog="Format of input file: query_frag_smiles,query_smiles,query_id,retrieved_smi,retrieved_id,tversky_sim\t"
  "Output: SMILES,ID,QuerySMI,QueryID,Fraggle_Similarity,RDK5_Similarity")
parser.add_option(
  '-c', '--cutoff', action='store', dest='cutoff', type='float', default=0.7,
  help="Cutoff for fraggle similarity. Only results with similarity greater than the cutoff will be output. DEFAULT = 0.7")
parser.add_option('-p', '--pfp', action='store', dest='pfp', type='float', default=0.8,
                  help="Cutoff for partial fp similarity. DEFAULT = 0.8")

if __name__ == '__main__':
  #parse the command line options
  (options, args) = parser.parse_args()

  if ((options.cutoff >= 0) and (options.cutoff <= 1)):
    fraggle_cutoff = options.cutoff
  else:
    print("Fraggle cutoff must be in range 0-1")
    sys.exit(1)

  print("SMILES,ID,QuerySMI,QueryID,Fraggle_Similarity,RDK5_Similarity")

  #create some data structure to store results
  id_to_smi = {}
  day_sim = {}
  frag_sim = {}
  query_size = {}
  query_mols = {}

  #generate dummy mol object which generates empty fp
  emptyMol = Chem.MolFromSmiles('*')

  #read the STDIN
  for line in sys.stdin:
    line = line.rstrip()
    qSubs, qSmi, qID, inSmi, id_, tversky = line.split(",")

    #add query to id_to_smi
    id_to_smi[qID] = qSmi
    id_to_smi[id_] = inSmi

    #add query to data structures
    frag_sim.setdefault(qID, defaultdict(float))
    day_sim.setdefault(qID, {})

    if (qID not in query_size):
      qMol = Chem.MolFromSmiles(qSmi)
      if (qMol == None):
        sys.stderr.write("Can't generate mol for: %s\n" % (qSmi))
        continue
      query_mols[qID] = qMol
      query_size[qID] = qMol.GetNumAtoms()

    iMol = Chem.MolFromSmiles(inSmi)

    if (iMol == None):
      sys.stderr.write("Can't generate mol for: %s\n" % (inSmi))
      continue

    #discard based on atom size
    if (iMol.GetNumAtoms() < query_size[qID] - 3):
      #sys.stderr.write("Too small: %s\n" % (inSmi) )
      continue

    if (iMol.GetNumAtoms() > query_size[qID] + 4):
      #sys.stderr.write("Too large: %s\n" % (inSmi) )
      continue

    #print '>>>',id_
    rdkit_sim, fraggle_sim = FraggleSim.compute_fraggle_similarity_for_subs(
      iMol, query_mols[qID], qSmi, qSubs, options.pfp)
    day_sim[qID][id_] = rdkit_sim
    frag_sim[qID][id_] = max(frag_sim[qID][id_], fraggle_sim)

    #check if you have the fp for the modified query
    #and generate if need to

  #right, print out the results for the query
  #Format: SMILES,ID,QuerySMI,QueryID,Fraggle_Similarity,Daylight_Similarity
  for qID in frag_sim:
    for id_ in frag_sim[qID]:
      if (frag_sim[qID][id_] >= fraggle_cutoff):
        print("%s,%s,%s,%s,%s,%s" %
              (id_to_smi[id_], id_, id_to_smi[qID], qID, frag_sim[qID][id_], day_sim[qID][id_]))
