File: fingerprint.R

package info (click to toggle)
r-cran-rcdk 3.8.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 904 kB
  • sloc: makefile: 14; sh: 13
file content (191 lines) | stat: -rwxr-xr-x 9,788 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#' Generate molecular fingerprints
#' 
#' `get.fingerprint` returns a `fingerprint` object representing molecular fingerprint of
#' the input molecule.
#' 
#' @param molecule A \code{jobjRef} object to an \code{IAtomContaine}
#' @param type The type of fingerprint. Possible values are:
#'   \itemize{
#' \item standard - Considers paths of a given length. The default is
#' but can be changed. These are hashed fingerprints, with a
#' default length of 1024
#' \item extended - Similar to the standard type, but takes rings and
#' atomic properties into account into account
#' \item graph - Similar to the standard type by simply considers connectivity
#' \item hybridization - Similar to the standard type, but only consider hybridization state
#' \item maccs - The popular 166 bit MACCS keys described by MDL
#' \item estate - 79 bit fingerprints corresponding to the E-State atom types described by Hall and Kier
#' \item pubchem - 881 bit fingerprints defined by PubChem
#' \item kr - 4860 bit fingerprint defined by Klekota and Roth
#' \item shortestpath - A fingerprint based on the shortest paths between pairs of atoms and takes into account ring systems, charges etc.
#' \item signature - A feature,count type of fingerprint, similar in nature to circular fingerprints, but based on the signature 
#' descriptor
#' \item circular - An implementation of the ECFP6 (default) fingerprint. Other circular types can be chosen by modifying the \code{circular.type} parameter.
#' \item substructure - Fingerprint based on list of SMARTS pattern. By default a set of functional groups is tested.
#' }
#' @param fp.mode The style of fingerprint. Specifying "`bit`" will return a binary fingerprint,
#' "`raw`" returns the the original representation (usually sequence of integers) and 
#' "`count`" returns the fingerprint as a sequence of counts.
#' @param depth The search depth. This argument is ignored for the
#' `pubchem`, `maccs`, `kr` and `estate` fingerprints
#' @param size The final length of the fingerprint. 
#' This argument is ignored for the `pubchem`, `maccs`, `kr`, `signature`, `circular` and 
#' `estate` fingerprints
#' @param substructure.pattern List of characters containing the SMARTS pattern to match. If the an empty list is provided (default) than the functional groups substructures (default in CDK) are used.
#' @param circular.type Name of the circular fingerprint type that should be computed given as string. Possible values are: 'ECFP0', 'ECFP2', 'ECFP4', 'ECFP6' (default), 'FCFP0', 'FCFP2', 'FCFP4' and 'FCFP6'.
#' @param verbose Verbose output if \code{TRUE}
#' @return an S4 object of class \code{\link{fingerprint-class}} or \code{\link{featvec-class}}, 
#' which can be manipulated with the fingerprint package.
#' @export
#' @author Rajarshi Guha (\email{rajarshi.guha@@gmail.com})
#' @examples 
#' ## get some molecules
#' sp <- get.smiles.parser()
#' smiles <- c('CCC', 'CCN', 'CCN(C)(C)', 'c1ccccc1Cc1ccccc1','C1CCC1CC(CN(C)(C))CC(=O)CC')
#' mols <- parse.smiles(smiles)
#' 
#' ## get a single fingerprint using the standard
#' ## (hashed, path based) fingerprinter
#' fp <- get.fingerprint(mols[[1]])
#' 
#' ## get MACCS keys for all the molecules
#' fps <- lapply(mols, get.fingerprint, type='maccs')
#' 
#' ## get Signature fingerprint
#' ## feature, count fingerprinter
#' fps <- lapply(mols, get.fingerprint, type='signature', fp.mode='raw')
#' ## get Substructure fingerprint for functional group fragments
#' fps <- lapply(mols, get.fingerprint, type='substructure')
#' 
#' ## get Substructure count fingerprint for user defined fragments
#' mol1 <- parse.smiles("c1ccccc1CCC")[[1]]
#' smarts <- c("c1ccccc1", "[CX4H3][#6]", "[CX2]#[CX2]")
#' fps <- get.fingerprint(mol1, type='substructure', fp.mode='count',
#'     substructure.pattern=smarts)
#' 
#' ## get ECFP0 count fingerprints 
#' mol2 <- parse.smiles("C1=CC=CC(=C1)CCCC2=CC=CC=C2")[[1]]
#' fps <- get.fingerprint(mol2, type='circular', fp.mode='count', circular.type='ECFP0')
get.fingerprint <- function(molecule, type = 'standard', fp.mode = 'bit', depth=6, size=1024, substructure.pattern=character(), circular.type = "ECFP6", verbose=FALSE) {
  if (is.null(attr(molecule, 'jclass'))) stop("Must supply an IAtomContainer or something coercable to it")
  if (attr(molecule, "jclass") != "org/openscience/cdk/interfaces/IAtomContainer") {
    ## try casting it
    molecule <- .jcast(molecule, "org/openscience/cdk/interfaces/IAtomContainer")
  }

  mode(size) <- 'integer'
  mode(depth) <- 'integer'
  
  # Determine integer ID for the circular fingerprint given its desired type.
  # This allows us to use also ECFP4, ... 
  if (type == 'circular') {
        circular.type.id <- switch(circular.type, 
             ECFP0 = 1, ECFP2 = 2, ECFP4 = 3, ECFP6 = 4,
             FCFP0 = 5, FCFP2 = 6, FCFP4 = 7, FCFP6 = 8,
             NULL)
        
        if (is.null(circular.type.id)) stop(paste('Invalid circular fingerprint type: ', circular.type))
        
        mode(circular.type.id) <- 'integer'
  }

  fingerprinter <-
    switch(type,
           standard = .jnew('org/openscience/cdk/fingerprint/Fingerprinter', size, depth),
           extended = .jnew('org/openscience/cdk/fingerprint/ExtendedFingerprinter', size, depth),
           graph = .jnew('org/openscience/cdk/fingerprint/GraphOnlyFingerprinter', size, depth),
           maccs = .jnew('org/openscience/cdk/fingerprint/MACCSFingerprinter'),
           pubchem = .jnew('org/openscience/cdk/fingerprint/PubchemFingerprinter', get.chem.object.builder()),
           estate = .jnew('org/openscience/cdk/fingerprint/EStateFingerprinter'),
           hybridization = .jnew('org/openscience/cdk/fingerprint/HybridizationFingerprinter', size, depth),
           lingo = .jnew('org/openscience/cdk/fingerprint/LingoFingerprinter', depth),
           kr = .jnew('org/openscience/cdk/fingerprint/KlekotaRothFingerprinter'),
           shortestpath = .jnew('org/openscience/cdk/fingerprint/ShortestPathFingerprinter', size),
           signature = .jnew('org/openscience/cdk/fingerprint/SignatureFingerprinter', depth),
           circular = .jnew('org/openscience/cdk/fingerprint/CircularFingerprinter', circular.type.id),
           substructure = 
               if (length(substructure.pattern) == 0) 
                   # Loads the default group substructures
                   { .jnew('org/openscience/cdk/fingerprint/SubstructureFingerprinter') }
               else
                   # Loads the substructures defined by the user
                   { .jnew('org/openscience/cdk/fingerprint/SubstructureFingerprinter', .jarray(substructure.pattern)) },
           )
  if (is.null(fingerprinter)) stop("Invalid fingerprint type specified")

  jfp <- NA
  if (fp.mode == 'bit') {
    jfp <- .jcall(fingerprinter,
                  "Lorg/openscience/cdk/fingerprint/IBitFingerprint;",
                  "getBitFingerprint", molecule, check=FALSE)
  } else if (fp.mode == 'raw') {
    jfp <- .jcall(fingerprinter,
                  "Ljava/util/Map;",
                  "getRawFingerprint", molecule, check=FALSE)
  } else if (fp.mode == 'count') {
    jfp <- .jcall(fingerprinter,
                  "Lorg/openscience/cdk/fingerprint/ICountFingerprint;",
                  "getCountFingerprint", molecule, check=FALSE)    
  }
  
  e <- .jgetEx()
  if (.jcheck(silent=TRUE)) { 
    if (verbose) print(e)
    return(NULL)
  }

  moltitle <- get.property(molecule, 'Title')
  if (is.na(moltitle)) moltitle <- ''

  if (fp.mode == 'bit') {
    bitset <- .jcall(jfp, "Ljava/util/BitSet;", "asBitSet")
    
    if (type == 'maccs') nbit <- 166
    else if (type == 'estate') nbit <- 79
    else if (type == 'pubchem') nbit <- 881
    else if (type == 'kr') nbit <- 4860
    else if (type == 'substructure') nbit <- .jcall(fingerprinter, "I", "getSize")
    else if (type == 'circular') nbit <- .jcall(fingerprinter, "I", "getSize")
    else nbit <- size
    
    bitset <- .jcall(bitset, "S", "toString")
    s <- gsub('[{}]','', bitset)
    s <- strsplit(s, split=',')[[1]]
    return(new("fingerprint", nbit=nbit, bits=as.numeric(s)+1, provider="CDK", name=moltitle))
  } else if (fp.mode == 'raw') {
    keySet <- .jcall(jfp, "Ljava/util/Set;", method="keySet")
    size <- .jcall(jfp, "I", method="size")
    if (size == 0) return(new('featvec', provider='CDK', name=moltitle))
    keyIter <- .jcall(keySet, "Ljava/util/Iterator;", method="iterator")
    keys <- list()
    for (i in 1:size) {

      tempkey <- .jcall(keyIter, "Ljava/lang/Object;", method="next")

      if ('jobjRef' %in% class(tempkey))  {
        tempkey <- .jsimplify(tempkey)
      }
      keys[[i]] <- tempkey
    
    }

    values <- list()
    for (i in 1:length(keys)) {
      tmp <- .jcall(jfp, "Ljava/lang/Object;", "get", .jcast(new(J("java/lang/String"),keys[[i]]),"java/lang/Object") )
      values[[i]] <- .jsimplify(tmp)
    }
    ## now make our features and create the featvec fp
    if (length(keys) != length(values))
      stop("Feature length did not match count length. This is a problem")
    features <- lapply(1:length(keys), function(i) new("feature", feature=keys[[i]], count=as.integer(values[[i]])))
    return(new("featvec", features=features, provider="CDK", name=moltitle))
  } else if (fp.mode == 'count') {
    fpsize <- .jcall(jfp, "J", "size")
    nbin <- .jcall(jfp, "I", "numOfPopulatedbins")
    ## get hash values
    hvals <- sapply(0:(nbin-1), function(i) .jcall(jfp, "I", "getHash", as.integer(i)) )
    cvals <- sapply(0:(nbin-1), function(i) .jcall(jfp, "I", "getCount", as.integer(i)) )
    features <- lapply(1:length(hvals), function(i) new("feature", feature=as.character(hvals[i]), count=as.integer(cvals[i])))
    return(new("featvec", features=features, provider="CDK", name=moltitle))
  }
}