File: pyCIPLabelsValidation.py

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (106 lines) | stat: -rw-r--r-- 2,574 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# coding: utf-8
import os
import sys

from rdkit import Chem

EXPECTED_LABELS_OVERRIDES = {
  # p/m labels are filtered out on input, so they are not
  # included here

  # allene-likes
  'VS063': ['26R', '2S'],
  'VS118': [],
  'VS135': [],
  'VS154': ['6E', '7E'],
  'VS164': ['21R'],
  'VS231': [],

  # Chiralities not flagged by RDKit
  'VS132': [],
}


def split_label_string(labels):
  """
    Splits the string of expected labels and filters out helical
    labels, which are not supported by rdkit
    """
  return {label for label in labels.split() if label[-1] not in 'mpMP'}


def supplier(fname):
  """
    Read the smiles, name and expected labels from the input file.
    If we know of an override for the expected labels, we do the
    override here.
    """
  with open(fname) as f:
    for line in f:
      smiles, name, expected, _ = line.split('\t', 3)

      try:
        expected = set(EXPECTED_LABELS_OVERRIDES[name])
      except KeyError:
        expected = split_label_string(expected)

      mol = Chem.MolFromSmiles(smiles, sanitize=False)
      Chem.SanitizeMol(mol)
      Chem.SetBondStereoFromDirections(mol)

      yield mol, name, expected


def getLabels(mol):
  """
    Calculates and extracts the CIP labels for the mol
    """

  Chem.rdCIPLabeler.AssignCIPLabels(mol)

  labels = set()
  for atom in mol.GetAtoms():
    try:
      label = atom.GetProp('_CIPCode')
    except KeyError:
      continue
    else:
      atom_idx = atom.GetIdx() + 1
      labels.add(f'{atom_idx}{label}')

  for bond in mol.GetBonds():
    try:
      label = bond.GetProp('_CIPCode')
    except KeyError:
      continue
    else:
      begin_idx = bond.GetBeginAtomIdx() + 1
      end_idx = bond.GetEndAtomIdx() + 1
      labels.add(f'{begin_idx}{label}')
      labels.add(f'{end_idx}{label}')

  return labels


if __name__ == '__main__':
  # The structures used for the validation come from
  # https://github.com/CIPValidationSuite/ValidationSuite
  # at commit 28d0fe05073905e74a1ba5e06b3bd6298686f6af
  fname = 'compounds.smi'
  fpath = os.path.join(os.environ['RDBASE'], 'Code', 'GraphMol', 'test_data', fname)

  failed = []
  for mol, name, expected in supplier(fpath):
    actual = getLabels(mol)

    if actual == expected:
      print(f'{name}: PASSED')
    else:
      print(f'{name}: FAILED')
      print(f'    Expected: {sorted(expected)}')
      print(f'    Actual:   {sorted(actual)}')
      failed.append(name)
  print(f'Check finished: {len(failed)} molecules failed.')
  print(f'Failed: {", ".join(failed)}')

  sys.exit(len(failed) > 0)