File: fastcluster.py

package info (click to toggle)
rdkit 202009.4-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 129,624 kB
  • sloc: cpp: 288,030; python: 75,571; java: 6,999; ansic: 5,481; sql: 1,968; yacc: 1,842; lex: 1,254; makefile: 572; javascript: 461; xml: 229; fortran: 183; sh: 134; cs: 93
file content (67 lines) | stat: -rw-r--r-- 2,364 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#  Original Author: iwatobipen
#
#  This file is part of the RDKit.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the RDKit source tree.
"""
This script performs fast clustering of SMILES

The clustering method is repeated bisection, which behaves much like k-means.
To use this script, the user must first install bayon.
Input format: Tab separated SMILES strings (SMILES \t molID \n ...)
Please see more details in README.

"""

import argparse
import os
import pickle
import subprocess
import sys
import time

from rdkit import Chem
from rdkit.Chem import AllChem

def getArgParser():
    """Create the command-line argument parser.

    Returns:
        argparse.ArgumentParser: parser with the positional arguments
        ``input`` (input filename) and ``nclusters`` (integer number of
        clusters), plus the optional ``--output`` and ``--centroid``
        filenames (defaults ``clustered.tsv`` and ``centroid.tsv``).
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # would put the whole sentence into the usage line as the program name.
    parser = argparse.ArgumentParser(
        description="Fast clustering for chemoinformatics")
    parser.add_argument("input", help="filename of input file")
    # type=int rejects non-numeric cluster counts at parse time instead of
    # handing them on to the external clustering command.
    parser.add_argument("nclusters", metavar="N", type=int,
                        help="the number of clusters")
    parser.add_argument("--output",
                        help="filename of output, tab separated format",
                        default="clustered.tsv")
    parser.add_argument("--centroid", metavar="CENTROID",
                        help="filename of centroid information. tab separated format",
                        default="centroid.tsv")
    return parser

def smi2fp(molid, smiles):
    """Convert one molecule into a tab-separated feature row.

    The row starts with ``molid`` and is followed by one
    ``FP_<bit>`` / ``1.0`` column pair for every on bit of the molecule's
    Morgan bit-vector fingerprint, and ends with a newline.

    Args:
        molid: identifier string written as the first column of the row.
        smiles: SMILES string of the molecule.

    Returns:
        str: the complete row, including the trailing newline.

    Raises:
        ValueError: if ``smiles`` cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a message naming the molecule rather than with an opaque
        # argument error from the fingerprint call below.
        raise ValueError(
            "could not parse SMILES {!r} for molecule {!r}".format(smiles, molid))
    # Second argument (2) is the Morgan radius passed to RDKit.
    onbits = AllChem.GetMorganFingerprintAsBitVect(mol, 2).GetOnBits()
    features = "".join("\tFP_{}\t1.0".format(bit) for bit in onbits)
    return molid + features + "\n"


if __name__ == "__main__":
    parser = getArgParser()
    args = parser.parse_args()

    # Step 1: write one fingerprint feature row per input molecule to the
    # intermediate file fp.tsv (created in the current working directory).
    # NOTE(review): each line is unpacked as molID<TAB>SMILES, whereas the
    # module docstring describes the columns as SMILES<TAB>molID -- confirm
    # which order is intended.
    with open(args.input, "r") as inputf, open("fp.tsv", "w") as tempf:
        for line in inputf:
            line = line.rstrip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of failing
                # on the two-column unpack below.
                continue
            molid, smiles = line.split("\t")
            tempf.write(smi2fp(molid, smiles))

    # Step 2: run bayon. The command is passed as an argument list (no
    # shell), so filenames containing spaces or shell metacharacters are
    # handled safely; bayon's stdout is redirected into the output file here
    # instead of via a shell ">".
    # NOTE(review): "-p" presumably makes bayon emit the per-member score
    # that step 3 parses -- confirm against the bayon documentation.
    command = ["bayon", "-p", "-c", args.centroid,
               "-n", str(args.nclusters), "fp.tsv"]
    started = time.time()
    with open(args.output, "w") as outputf:
        try:
            res = subprocess.call(command, stdout=outputf)
        except OSError as err:
            # Raised when the bayon executable cannot be started at all.
            parser.exit(1, "Error running bayon: {}\n".format(err))
    # Replaces the shell "time" prefix of the former command line.
    sys.stderr.write("bayon finished in {:.2f} s\n".format(time.time() - started))

    # Check the exit status before touching the results, so a failed run
    # does not produce a parsed file from empty or partial output.
    if res != 0:
        parser.exit(1, "Error running bayon\n")

    # Step 3: flatten the bayon output. Each line is read as
    #   cluster_id <TAB> molid <TAB> point [<TAB> molid <TAB> point ...]
    # and rewritten as one "molid, point, CLS_ID_<cluster_id>" row per member.
    # splitext keeps the directory part and only drops the final extension,
    # so e.g. "./out/clustered.tsv" becomes "./out/clustered_parse.tsv".
    parsed_name = os.path.splitext(args.output)[0] + "_parse.tsv"
    with open(args.output, "r") as resultf, open(parsed_name, "w") as parsefile:
        for line in resultf:
            fields = line.rstrip().split("\t")
            cluster_id = fields[0]
            # Pair up (molid, point) columns; an unpaired trailing column is
            # ignored.
            for molid, point in zip(fields[1::2], fields[2::2]):
                parsefile.write(
                    "{}\t{}\tCLS_ID_{}\n".format(molid, point, cluster_id))