File: molecule_fingerprint.h

package info (click to toggle)
indigo 1.2.3-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 28,256 kB
  • sloc: ansic: 309,316; cpp: 137,636; cs: 9,118; asm: 8,011; java: 7,195; sql: 6,697; xml: 4,352; python: 3,426; sh: 207; php: 56; makefile: 49
file content (182 lines) | stat: -rw-r--r-- 6,539 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/****************************************************************************
 * Copyright (C) 2009-2015 EPAM Systems
 * 
 * This file is part of Indigo toolkit.
 * 
 * This file may be distributed and/or modified under the terms of the
 * GNU General Public License version 3 as published by the Free Software
 * Foundation and appearing in the file LICENSE.GPL included in the
 * packaging of this file.
 * 
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 ***************************************************************************/

#ifndef __molecule_fingerprint__
#define __molecule_fingerprint__

#include "base_cpp/tlscont.h"
#include "base_cpp/obj.h"
#include "molecule/base_molecule.h"
#include "base_cpp/cancellation_handler.h"
#include "graph/subgraph_hash.h"

#include <unordered_map>
#include <limits.h>

#ifdef _WIN32
#pragma warning(push)
#pragma warning(disable:4251)
#endif

namespace indigo {

// Forward declaration only: within this header the type is referred to
// solely through a pointer (MoleculeFingerprintBuilder::_tau_super_structure),
// so its full definition is not needed here.
class TautomerSuperStructure;

// Fingerprint consists of 5 parts: EXT + ORD + ANY + TAU + SIM.
// EXT is always 3 bytes long, other parts' sizes are configured.
// ORD, ANY, and SIM parts are built up from fragments.
// Each fragment goes to:
//    SIM -- as long as it has no query atoms/bonds and is small enough
//    ORD -- as long as it has no query atoms/bonds
//    ANY (with bond types discarded) -- as long as it has no query atoms
//    ANY (with atom types discarded) -- as long as it has no query bonds
//    ANY (with atom and bond types discarded) -- always
// TAU part is built up from a 'supermolecule' having some added bonds,
//     and with all bond types discarded
// EXT part is built up from some element, isotope, and charge counters

// Size configuration of a fingerprint.
// `ext` switches the 3-byte EXT part on or off; the *_qwords fields give the
// length of the ORD, ANY, TAU and SIM parts in qwords (8 bytes each).
// Every accessor below reports a size in bytes.
struct MoleculeFingerprintParameters
{
   bool ext;
   int ord_qwords;
   int any_qwords;
   int tau_qwords;
   int sim_qwords;

   // --- sizes of the individual parts ---

   // 3 when the EXT part is enabled, 0 otherwise
   int fingerprintSizeExt () const
   {
      if (ext)
         return 3;
      return 0;
   }

   int fingerprintSizeOrd () const { return ord_qwords * 8; }
   int fingerprintSizeAny () const { return any_qwords * 8; }
   int fingerprintSizeTau () const { return tau_qwords * 8; }
   int fingerprintSizeSim () const { return sim_qwords * 8; }

   // --- combined sizes ---

   // EXT + ORD
   int fingerprintSizeExtOrd () const
   {
      return fingerprintSizeExt() + fingerprintSizeOrd();
   }

   // EXT + ORD + SIM
   int fingerprintSizeExtOrdSim () const
   {
      return fingerprintSizeExtOrd() + fingerprintSizeSim();
   }

   // All five parts together
   int fingerprintSize () const
   {
      const int total_qwords = ord_qwords + any_qwords + tau_qwords + sim_qwords;
      return fingerprintSizeExt() + total_qwords * 8;
   }
};

// Builds a fingerprint for one molecule using the sizes given in a
// MoleculeFingerprintParameters object.
//
// Judging by the public interface, the intended flow is: construct, adjust
// the public flags below, call process(), then read the result through
// get() / getOrd() / getSim() / getTau() / getAny().
//
// NOTE(review): the class keeps reference members (_mol, _parameters); they
// are presumably bound to the constructor arguments, in which case both
// objects must outlive the builder -- confirm in the .cpp.
//
// Only declarations live here; all member functions are defined elsewhere.
class DLLEXPORT MoleculeFingerprintBuilder
{
public:
   MoleculeFingerprintBuilder (BaseMolecule &mol, const MoleculeFingerprintParameters &parameters);
   ~MoleculeFingerprintBuilder ();

   // NOTE(review): presumably tells the builder to treat the molecule as a
   // query structure -- confirm against the implementation.
   bool query;

   bool skip_ord; // don't build 'ordinary' part of the fingerprint
   bool skip_sim; // don't build 'similarity' part of the fingerprint
   bool skip_tau; // don't build 'tautomer' part of the fingerprint
   bool skip_ext; // don't build 'extra' part of the fingerprint
   bool skip_ext_charge; // don't store information about charges in 'extra' part

   bool skip_any_atoms; // don't build 'any atoms' part of the fingerprint
   bool skip_any_bonds; // don't build 'any bonds' part of the fingerprint
   bool skip_any_atoms_bonds; // don't build 'any atoms, any bonds' part of the fingerprint

   // NOTE(review): presumably performs the actual fingerprint computation;
   // the body is not part of this header.
   void process ();

   // Accessors for the fingerprint bytes. get() hands out a read-only
   // pointer; the per-part accessors hand out writable pointers.
   // NOTE(review): presumably get() is the start of the whole fingerprint and
   // the others are the starts of the ORD / SIM / TAU / ANY parts -- confirm.
   const byte * get ();
   byte * getOrd ();
   byte * getSim ();
   byte * getTau ();
   byte * getAny ();
   
   // NOTE(review): presumably the number of set bits in the SIM part.
   int countBits_Sim ();

   // Function-pointer hook. By its signature it is given a molecule, the
   // vertex and edge index lists of a fragment, the use_atoms/use_bonds
   // flags and a hash value.
   // NOTE(review): when it is called and whether it may be null cannot be
   // seen from this header.
   void (*cb_fragment) (BaseMolecule &mol, const Array<int> &vertices, const Array<int> &edges,
                        bool use_atoms, bool use_bonds, dword hash);

   // NOTE(review): presumably configures the skip_* flags from a textual
   // fingerprint type; the accepted strings are defined in the implementation.
   void parseFingerprintType(const char *type, bool query);

   // NOTE(review): presumably consulted during process() to abort early;
   // ownership of the pointed-to handler is not visible here.
   CancellationHandler* cancellation;

   // Project macro (not defined in this header).
   // NOTE(review): presumably declares this class's error/exception type.
   DECL_ERROR;

protected:
   void _initHashCalculations (BaseMolecule &mol, const Filter &vfilter);

   // Static callbacks with an opaque `void *context` argument.
   // NOTE(review): context is presumably the builder instance -- confirm.
   static void _handleTree     (Graph &graph, const Array<int> &vertices, const Array<int> &edges, void *context);
   static bool _handleCycle    (Graph &graph, const Array<int> &vertices, const Array<int> &edges, void *context);

   // Integer codes for a single atom (vertex) / bond (edge) of `mol`.
   int _atomCode (BaseMolecule &mol, int vertex_idx);
   int _bondCode (BaseMolecule &mol, int edge_idx);

   static int _maximalSubgraphCriteriaValue (Graph &graph, const Array<int> &vertices, const Array<int> &edges, void *context);

   void _handleSubgraph (Graph &graph, const Array<int> &vertices, const Array<int> &edges);

   // Returns a dword for the fragment given by vertices/edges; use_atoms and
   // use_bonds select whether atom / bond types take part (compare the
   // "types discarded" variants described in the comment at the top of the file).
   // different_vertex_count is an output pointer.
   // NOTE(review): whether it may be null is not visible here.
   dword _canonicalizeFragment (BaseMolecule &mol, const Array<int> &vertices, const Array<int> &edges,
      bool use_atoms, bool use_bonds, int *different_vertex_count);

   // bits_to_set is passed by non-const reference, i.e. it is an in/out value.
   void _canonicalizeFragmentAndSetBits (BaseMolecule &mol, const Array<int> &vertices, const Array<int> &edges,
      bool use_atoms, bool use_bonds, int subgraph_type, dword &bits_to_set);

   void _makeFingerprint (BaseMolecule &mol);
   void _calcExtraBits (BaseMolecule &mol);

   void _setTauBits (const char *str, int nbits);
   void _setOrdBits (const char *str, int nbits);

   // Static helper operating on a caller-supplied buffer `fp` of `size`.
   // NOTE(review): presumably sets `nbits` bits derived from `hash`; the unit
   // of `size` (bytes vs. bits) must be checked in the implementation.
   static void _setBits (dword hash, byte *fp, int size, int nbits);
   
   void _calculateFragmentVertexDegree (BaseMolecule &mol, const Array<int> &vertices, const Array<int> &edges);
   int _calculateFragmentExternalConn (BaseMolecule &mol, const Array<int> &vertices, const Array<int> &edges);

   // Reference members: the builder does not own the molecule or parameters.
   BaseMolecule &_mol;
   const MoleculeFingerprintParameters &_parameters;

   // these parameters are indirectly passed to the callbacks
   TautomerSuperStructure *_tau_super_structure;
   bool _is_cycle;

   // Key type of _ord_hashes (see HashesMap below): a fragment hash paired
   // with a bits-per-fragment count, comparable with operator==.
   struct HashBits
   {
      HashBits (dword hash, int bits_per_fragment);
      bool operator== (const HashBits &right) const;

      dword hash;
      int bits_per_fragment;
   };
   // Hash functor supplied to std::unordered_map for HashBits keys.
   struct Hasher
   {
      size_t operator () (const HashBits &input) const;
   };

   void _addOrdHashBits (dword hash, int bits_per_fragment);

   Obj<SubgraphHash> subgraph_hash;

   // CP_DECL / TL_CP_DECL are project macros (not defined in this header;
   // "base_cpp/tlscont.h" is included above). Each TL_CP_DECL line names a
   // container type and a member name.
   // NOTE(review): presumably these declare pooled, thread-local-backed
   // members; their order may matter to the constructor -- do not reorder
   // without checking the .cpp.
   CP_DECL;
   TL_CP_DECL(Array<byte>, _total_fingerprint);
   TL_CP_DECL(Array<int>, _atom_codes);
   TL_CP_DECL(Array<int>, _bond_codes);
   TL_CP_DECL(Array<int>, _atom_codes_empty);
   TL_CP_DECL(Array<int>, _bond_codes_empty);
   TL_CP_DECL(Array<int>, _atom_hydrogens);
   TL_CP_DECL(Array<int>, _atom_charges);
   TL_CP_DECL(Array<int>, _vertex_connectivity);
   TL_CP_DECL(Array<int>, _fragment_vertex_degree);
   TL_CP_DECL(Array<int>, _bond_orders);

   // Map from HashBits to an int value.
   // NOTE(review): the meaning of the mapped int is not visible here.
   typedef std::unordered_map<HashBits, int, Hasher> HashesMap;
   TL_CP_DECL(HashesMap, _ord_hashes);

private:
   // Copy constructor is private so that outside code cannot copy a builder.
   MoleculeFingerprintBuilder (const MoleculeFingerprintBuilder &); // no implicit copy
};

}

#ifdef _WIN32
#pragma warning(pop)
#endif

#endif