File: Utils.cpp

package info (click to toggle)
rdkit 201203-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 37,840 kB
  • sloc: cpp: 93,902; python: 51,897; java: 5,192; ansic: 3,497; xml: 2,499; sql: 1,641; yacc: 1,518; lex: 1,076; makefile: 325; fortran: 183; sh: 153; cs: 51
file content (221 lines) | stat: -rw-r--r-- 8,673 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// $Id: Utils.cpp 1625 2011-01-13 04:22:56Z glandrum $
//
// Copyright (c) 2002-2010  greg Landrum, Rational Discovery LLC
//
//  @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include "BitVects.h"
#include "BitVectUtils.h"
#include <RDGeneral/Invariant.h>
#include <iostream>

//! \brief Build an ExplicitBitVect holding the same bits as a SparseBitVect
/*!
   \param sbv  the sparse vector to copy from; it is dereferenced without a
               null check

   \return a pointer to a freshly allocated ExplicitBitVect that has the
           same number of bits as \c sbv and has every bit found in
           \c sbv's bit set switched on.  The object is created with
           \c new and is not released here (presumably the caller is
           expected to delete it).
*/
ExplicitBitVect *convertToExplicit(const SparseBitVect *sbv) {
  const unsigned int nBits = sbv->getNumBits();
  ExplicitBitVect *result = new ExplicitBitVect(nBits);

  // walk the set of on-bit indices and mirror each one in the result
  const IntSet *onBits = sbv->getBitSet();
  IntSetConstIter cursor = onBits->begin();
  while (cursor != onBits->end()) {
    result->setBit(*cursor);
    ++cursor;
  }
  return result;
}

// defined further down in this file
void a2b(const char *,char *);

//! \brief Construct a BitVect from the ASCII representation of a
//! Daylight fingerprint string
/*!
   \param sbv  the bit vector to fill.  It is cleared first; its size is
               not checked here, so it must accept setBit() for every
               decoded bit index.
   \param s    the ASCII fingerprint: data characters (4 characters encode
               3 bytes), then one pad marker ('1', '2' or '3'), then an
               optional single trailing newline.

   Throws a <tt>const char *</tt> message if the pad marker is missing
   (empty or newline-only input) or is not one of '1', '2', '3'.
*/
template <typename T>
void FromDaylightString(T &sbv,std::string s)
{
  sbv.clearBits();
  int length = s.length();

  // a single trailing newline is not part of the fingerprint
  if(length > 0 && s[length-1] == '\n') length -= 1;

  // at least the pad marker has to be present; without this check the
  // indexing below would happen at position -1
  if(length < 1) throw "ValueError bad daylight fingerprint string";

  // 4 bytes in the ascii correspond to 3 bytes in the binary
  //  plus there's one extra ascii byte for the pad marker
  length -= 1;
  int nBits = (3*length/4)*8;

  // the pad marker says how many bytes of the last group are filler
  switch(s[length]){
  case '1': nBits -= 16;break;
  case '2': nBits -= 8;break;
  case '3': break;
  default:  throw "ValueError bad daylight fingerprint string";
  }

  int i=0,nBitsDone=0;
  while(i < length && nBitsDone < nBits){
    // a2b always reads 4 characters.  Copy the group into a local buffer
    // so that a short final group never makes it read past the data;
    // missing positions are filled with '.', which decodes to 0 and only
    // contributes to bits at or beyond nBits.
    char group[4] = {'.','.','.','.'};
    for(int c=0;c<4 && i+c<length;c++){
      group[c] = s[i+c];
    }

    char bytes[3];
    a2b(group,bytes);
    for(int j=0;j<3 && nBitsDone < nBits;j++){
      // test the bits of this byte from most to least significant
      unsigned char query=0x80;
      for(int k=0;k<8;k++) {
        if(bytes[j]&query){
          sbv.setBit(nBitsDone);
        }
        query >>= 1;
        nBitsDone++;
      }
    }
    i += 4;
  }
}

// Explicit instantiations: the template body is defined in this .cpp file,
// so the SparseBitVect and ExplicitBitVect versions are generated here.
template void FromDaylightString(SparseBitVect &sbv,std::string s);
template void FromDaylightString(ExplicitBitVect &sbv,std::string s);

//! \brief Construct a BitVect from the ASCII representation of a
//! BitString
/*!
   \param sbv  the bit vector to fill; it is cleared first
   \param s    the bit string: a '1' at position i switches bit i on, any
               other character leaves bit i off.  It must not be longer
               than \c sbv (checked with PRECONDITION).

   If \c s is shorter than \c sbv the remaining bits stay cleared.
*/
template <typename T>
void FromBitString(T &sbv,const std::string &s)
{
  PRECONDITION(s.length()<=sbv.getNumBits(),"bad bitvect length");
  sbv.clearBits();
  // Walk the string, not the bit vector: the precondition allows s to be
  // shorter than sbv, and indexing s beyond its end is undefined.
  for(unsigned int i=0;i<s.length();++i){
    if(s[i]=='1') sbv.setBit(i);
  }
}

// Explicit instantiations: the template body is defined in this .cpp file,
// so the SparseBitVect and ExplicitBitVect versions are generated here.
template void FromBitString(SparseBitVect &sbv,const std::string &s);
template void FromBitString(ExplicitBitVect &sbv,const std::string &s);


//! converts 4 ascii bytes at a4 to 3 binary bytes
/*!
 Adapted from the Daylight contrib program ascii2bits.c.

 \param a4  points at 4 readable characters of a Daylight ASCII
            fingerprint
 \param b3  points at 3 writable bytes that receive the packed result

 Every input character stands for a 6-bit code:

     '.'        ->  0
     '+'        ->  1
     '0'..'9'   ->  2..11
     'A'..'Z'   -> 12..37
     'a'..'z'   -> 38..63

 The four codes are concatenated, first character in the most
 significant position:

     codes:  |000000|111111|222222|333333|
     b3:     |00000011|11112222|22333333|

 A character outside that alphabet is not rejected: it re-uses the code
 of the character before it (or 0 when it is the first of the four).
*/
void a2b(const char *a4, char *b3)
{
  // Survives across iterations on purpose, so that an unrecognised
  // character inherits the previous character's code.
  int code = 0;

  for (int pos = 0; pos < 4; ++pos) {
    const char ch = a4[pos];

    // Map the character to its 6-bit code.  The letter ranges rely on
    // 'A'..'Z' and 'a'..'z' being contiguous, as they are in ASCII.
    if (ch == '.') {
      code = 0;
    } else if (ch == '+') {
      code = 1;
    } else if (ch >= '0' && ch <= '9') {
      code = ch - '0' + 2;
    } else if (ch >= 'A' && ch <= 'Z') {
      code = ch - 'A' + 12;
    } else if (ch >= 'a' && ch <= 'z') {
      code = ch - 'a' + 38;
    }

    // Drop the 6 bits into the right place of the 24-bit output.
    switch (pos) {
      case 0:   // all 6 bits open the 1st byte
        b3[0] = static_cast<char>(code << 2);
        break;
      case 1:   // top 2 bits finish the 1st byte, low 4 open the 2nd
        b3[0] |= static_cast<char>(code >> 4);
        b3[1] = static_cast<char>(code << 4);
        break;
      case 2:   // top 4 bits finish the 2nd byte, low 2 open the 3rd
        b3[1] |= static_cast<char>(code >> 2);
        b3[2] = static_cast<char>(code << 6);
        break;
      default:  // all 6 bits finish the 3rd byte
        b3[2] |= static_cast<char>(code);
        break;
    }
  }
}


// Demo Data:
// 256 bits:
//.b7HEa..ccc+gWEIr89.8lV8gOF3aXFFR.+Ps.mZ6lg.2
//
// 00000010 01110010 01010011 01000010 01100000
// 00000000 10100010 10001010 00000001 10110010
// 00100100 00010100 11011100 10100010 11000000
// 00101011 00011000 01001010 10110001 10100100
// 01000101 10011010 00110100 01010001 01110100
// 00000000 01011011 11100000 00001100 10100101
// 00100011 00011011