File: finger3.cpp

package info (click to toggle)
openbabel 2.2.3-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 36,644 kB
  • ctags: 33,717
  • sloc: cpp: 242,528; ansic: 87,037; sh: 10,280; perl: 5,518; python: 5,156; pascal: 793; makefile: 747; cs: 392; xml: 97; ruby: 54; java: 23
file content (318 lines) | stat: -rw-r--r-- 10,300 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/**********************************************************************
finger3.cpp: Fingerprints based on list of SMARTS patterns
Copyright (C) 2005 Chris Morley
 
This file is part of the Open Babel project.
For more information, see <http://openbabel.sourceforge.net/>
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
***********************************************************************/

#include <openbabel/babelconfig.h>
#include <openbabel/mol.h>

#include <sstream>
#include <fstream>
#include <map>
#include <string>

#include <openbabel/fingerprint.h>

using namespace std;
namespace OpenBabel
{
/// \brief Fingerprint based on list of SMARTS patterns
class PatternFP  : public OBFingerprint
{
private:
  struct pattern
  {
    string smartsstring;
    OBSmartsPattern obsmarts;
    string description;
    int numbits;
    int numoccurrences;
    int bitindex;
  };
  vector<pattern> _pats;
  int _bitcount;

protected:
  string _patternsfile;

public:
  PatternFP(const char* ID, const char* filename=NULL, 
      bool IsDefault=false) : OBFingerprint(ID, IsDefault)
  {
    if(filename==NULL)
      _patternsfile="patterns.txt";
    else
      _patternsfile = filename;
  }

///////////////////////////////////////////////////////////////////////////// 
  virtual const char* Description()
  {
    static string desc;
    desc = "SMARTS patterns specified in the file " + _patternsfile
      + "\nPatternFP is definable";
    return (desc.c_str());
  }

//////////////////////////////////////////////////////////////////////////////
  //Each bit represents a single substructure; no need for confirmation when substructure searching
  virtual unsigned int Flags() { return FPT_UNIQUEBITS;}; 

///////////////////////////////////////////////////////////////////////////////
  virtual PatternFP* MakeInstance(const std::vector<std::string>& textlines)
  {
    return new PatternFP(textlines[1].c_str(),textlines[2].c_str());
  }

//////////////////////////////////////////////////////////////////////////////// 
  virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int foldbits) 
  {
    OBMol* pmol = dynamic_cast<OBMol*>(pOb);
    if(!pmol)
      return false;
    
    unsigned int n;
    //Read patterns file if it has not been done already
    if(_pats.empty())
      ReadPatternFile();

    //Make fp size the smallest power of two to contain the patterns
    n=Getbitsperint();
    while(n < _bitcount)
      n*=2;
    fp.resize(n/Getbitsperint());

    n=0; //bit position
    vector<pattern>::iterator ppat;
    for(ppat=_pats.begin();ppat!=_pats.end();++ppat)
    {
      if(ppat->numbits //ignore pattern if numbits==0
        && ppat->obsmarts.Match(*pmol, ppat->numoccurrences==0))//do single match if all that's needed
      {
        /* Set bits in the fingerprint depending on the number of matches in the molecule
           and the parameters, numbits and numoccurrences, in the pattern.
           The pattern will set or clear numbits bits in the fingerprint.
           They will be in numoccurrences+1 groups, each containing an approximately
           equal number of bits.
           The first group of bits will be set if numMatches > numoccurences;
           The second group will be set if numMatches > numoccurrences - 1;
           and so on.
           So with a pattern with numbits = 4 and numoccurences = 2,
           the groups would be 1, 1, and 2 bits.
           A molecule with
              1 match to the pattern would give 0011 
              2 matches to the pattern would give 0111 
              3 or more matches to the pattern would give 1111 
        */
        int numMatches = ppat->obsmarts.GetUMapList().size();       
        int num =  ppat->numbits, div = ppat->numoccurrences+1, ngrp;
 
        int i = n;
        while(num)
        {
          ngrp = (num -1)/div-- +1; //rounds up
          num -= ngrp;
          while(ngrp--)
            if (numMatches > div) {
              SetBit(fp,i);
            }
          i++;
        }
      }
      n += ppat->numbits;
    }

    if(foldbits)
      Fold(fp, foldbits);
    return true;
  }
  
  /////////////////////////////////////////////////////////////////////
  bool ReadPatternFile()
  {
    //Reads three types of file. See below
    ifstream ifs;
	  stringstream errorMsg; 

    if (OpenDatafile(ifs, _patternsfile).length() == 0)
    {
      errorMsg << "Cannot open " << _patternsfile << endl;
      obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
      return false;
    }

    string line;
    if(!getline(ifs, line)) //first line
      return false;
    bool smartsfirst = (Trim(line)=="#Comments after SMARTS");
    
    _bitcount=0;
    do
    {
      if(Trim(line).size()>0 && line[0]!='#')
      {
        pattern p;
        p.numbits=1; p.numoccurrences=0; //default values
        p.bitindex = _bitcount;
        istringstream ss(line);

        if(smartsfirst)
        {
          if(isdigit(line[0]))
          {
            if(!ParseRDKitFormat(ss, p))
              continue;
          }
          else
            //Original format, which looks like:
            //  SMARTS description
            ss >> p.smartsstring >> p.description;
        }
        else
        {
          // Christian Laggner's format:
          //  description: SMARTS [occurrences [numbits]]
          getline(ss, p.description, ':');
          ss >> p.smartsstring;
          ss >> p.numoccurrences >> p.numbits;
        }

        if(!p.obsmarts.Init(p.smartsstring))
        {
          obErrorLog.ThrowError(__FUNCTION__,
            "Faulty SMARTS: " + p.description + ' ' + p.smartsstring, obError);
          continue;
        }
        _pats.push_back(p);
        _bitcount += p.numbits;
      }
    }while(getline(ifs,line));
 
    if (ifs)
      ifs.close();
    return true;
  }

///////////////////////////////////////////////////////////////////////////////
  virtual string DescribeBits(const vector<unsigned int> fp, bool bSet=true)
  {
    //checkmol-type output with tab separated functional group names
    stringstream ss;
    vector<pattern>::iterator ppat;
    for(ppat=_pats.begin();ppat!=_pats.end();++ppat)
    {
      int n = ppat->bitindex;
      int num =  ppat->numbits, div = ppat->numoccurrences+1, ngrp;
      while(num) //for each group of bits
      {
        ngrp = (num + div -1)/div--; //rounds up
        num -= ngrp;
        if(GetBit(fp, n) == bSet)
        {
          ss << '\t' << ppat->description;
          if(div>0)
            ss << '*' << div+1;
          break; //ignore the bits signifying a smaller number of occurrences
        }
        n += ngrp;
      }
    }
    ss << endl;
    return ss.str();
  }

///////////////////////////////////////////////////////////////////////////////////
  bool ParseRDKitFormat(istringstream& ss, pattern& p) 
  {
    //rdkit format, e.g.
    //  14:('[S,s]-[S,s]',0), # S-S
    const int dum = 20; //an arbitrary number in case delimiters in ignore statements not found
    string number, comment;
    getline(ss, number, ':');
    ss.ignore(dum, '\'');
    getline(ss, p.smartsstring, '\'');
    if(p.smartsstring[0]=='?') //ignore patterns with SMARTS '?'
      p.smartsstring="[999]";//this seems to match nothing;  was return false;
    ss.ignore(dum,',');
    ss >> p.numoccurrences;
    ss.ignore(dum,'#');
    getline(ss, comment);

    //description is number + edited commment
    Trim(comment);
    string::size_type pos;
    pos = comment.find("FIX");
    if(pos==string::npos)
      pos = comment.find("*NOTE*");
    if(pos!=string::npos)
      comment.erase(pos);
    p.description = number + ": " + comment;
    return true;
  }


}; //class PatternFP

//***********************************************
//Make global instances. Each definition constructs a PatternFP, which
//passes its ID on to the OBFingerprint constructor.
//FP3 gives no file name and so uses the default, patterns.txt.
PatternFP FP3PatternFP("FP3");

//FP4 takes its patterns from SMARTS_InteLigand.txt.
PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt");
//***********************************************

/*! \class PatternFP
A bit is set when there is a match to one of a list
of SMARTS patterns in the datafile, which is specified in the constructor.
If no filename is given, the default filename is patterns.txt.
Fingerprints can be made by declaring a global variable, as in:

PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt");

Alternatively, an entry in plugindefines.txt like:

PatternFP
MACCS          #ID of this fingerprint type
MACCS.txt      #File containing the SMARTS patterns

defines a fingerprint without the need to recompile.

Three file formats are supported:
 - the preferred format (e.g. SMARTS_InteLigand.txt in FP4)
 - the original format (patterns.txt has an incomplete set of SMARTS patterns)
 - a format made by extracting from an RDKit file (MACCS.txt)
The last two require the first line to be:
#Comments after SMARTS

Lines starting with # are ignored.
For the preferred format each line is of the form:
description: SMARTS [occurrences [numbits]]
A bit is set in the fingerprint for each SMARTS pattern matched.
The optional integer parameters refine this behaviour; the most obvious uses are:
 - if <occurrences> is present and greater than its default value of 0, the bit
   is set only if the number of matches to the pattern is greater than <occurrences>.
 - if <occurrences> is 0 and <numbits> is greater than its default value of 1, then
   the fingerprint has <numbits> bits set if there is a match. This gives greater weight
   to the pattern for use in similarity measures like Tanimoto.
 - if the parameters are n-1 and n and the number of matches is m,
   a bit is set for each of the conditions m>=n, m>=n-1, ... , m>=1 that holds.
   This can be used to distinguish structures with many similar atoms like n-alkanes.
The use of other values for the parameters, which can be any non-negative integer, can give
other analogous behaviours. If numbits is 0 the pattern is ignored.
*/

}//namespace

//! \file finger3.cpp
//! \brief fingerprints based on list of SMARTS patterns