File: clustering.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2009-2020 Authors of CryptoMiniSat, see AUTHORS file
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.

import operator
import re
import pandas as pd
import pickle
import sklearn
import sklearn.svm
import sklearn.tree
import sklearn.cluster
from sklearn.preprocessing import StandardScaler
import argparse
import sys
import numpy as np
import sklearn.metrics
import time
import itertools
import math
import matplotlib.pyplot as plt
import sklearn.ensemble
import os
import helper
import decimal
try:
    from termcolor import cprint
except ImportError:
    termcolor_avail = False
else:
    termcolor_avail = True

# train_test_split moved from sklearn.cross_validation to
# sklearn.model_selection; the old module was removed in sklearn 0.20,
# and every 1.x release uses the new path
ver = sklearn.__version__.split(".")
if int(ver[0]) == 0 and int(ver[1]) < 20:
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split


CLUST_TYPE_FILES = 0
CLUST_TYPE_USEFULNESS = 1

def get_cluster_name(clust_type):
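    """Return the dataframe column name for the given clustering type."""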
    name = None
    if clust_type == CLUST_TYPE_FILES:
        name = "clust_f"
    elif clust_type == CLUST_TYPE_USEFULNESS:
        name = "clust_u"
    else:
        assert False

    return name

def get_clustering_label(df, feats_used: list, scaler, clustering):
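    """Predict cluster labels for a dataframe using an already-fitted
    scaler and clustering, after checking that every feature used during
    training is present."""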
    this_feats = list(df)
    for f in feats_used:
        if f not in this_feats:
            print("ERROR: Feature '%s' is in the training file but not in the current one!" % f)
            sys.exit(-1)

    df_clust = df[feats_used].astype(float).copy()
    df_clust = scaler.transform(df_clust)
    return clustering.predict(df_clust)


def dump_dataframe(df, fname: str):
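    """Pickle the dataframe to the given file."""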
    with open(fname, "wb") as f:
        pickle.dump(df, f)
    print("Dumped to file %s" % fname)


class Clustering:
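    """Fit a K-Means clustering on the sampled dataframe and emit the
    matching C++ code (clustering_imp.cpp) with hard-coded centers."""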
    def __init__(self, df, clust_type):
        self.df = df
        self.clust_type = clust_type

    def reformat_feats(self, feats):
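        """Translate dataframe column names into C++ member accesses on a
        SatZillaFeatures object, leaving a '{val}' placeholder for the
        variable name."""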
        feats_clean = []
        for feat in feats:
            assert "szfeat_cur.conflicts" not in feat
            c = str(feat)
            c = c.replace(".irred_", ".irred_cl_distrib.")
            c = c.replace(".red_", ".red_cl_distrib.")
            c = c.replace("szfeat_cur.", "{val}.")

            feats_clean.append(c)
        assert len(feats_clean) == len(feats)

        return feats_clean

    def create_code_for_cluster_centers(self, clust, scaler, sz_feats):
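        """Write {basedir}/clustering_imp.cpp: the cluster centers, the
        normalized distance function and the closest-center lookup."""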
        sz_feats_clean = self.reformat_feats(sz_feats)
        f = open("{basedir}/clustering_imp.cpp".format(basedir=options.basedir), 'w')

        helper.write_mit_header(f)
        f.write("""
#include "satzilla_features.h"
#include "clustering.h"
#include <cmath>

using namespace CMSat;

ClusteringImp::ClusteringImp() {{
    set_up_centers();
}}

ClusteringImp::~ClusteringImp() {{
}}

""".format(clusters=options.clusters))

        f.write("void ClusteringImp::set_up_centers() {\n")
        f.write("\n        centers.resize(%d);\n" % options.clusters)
        for i in range(options.clusters):
            f.write("\n        // Doing cluster center %d\n" % i)
            f.write("\n        centers[%d].resize(%d);\n" % (i, len(sz_feats_clean)) )
            f.write("\n        used_clusters.push_back(%d);\n" % i)
            for i2 in range(len(sz_feats_clean)):
                center = clust.cluster_centers_[i][i2]
                f.write("        centers[{num}][{feat}] = {center}L;\n".format(
                    num=i, feat=i2, center=center))

        f.write("    }\n")

        f.write("""
double sq(double x) {
    return x*x;
}

double ClusteringImp::norm_dist(const SatZillaFeatures& a, const std::vector<double>& center) const {
    double dist = 0;
    double tmp;
""")
        # emit one normalized-distance term per feature; a feature whose
        # printed scale rounds to zero would cause a division by zero in the
        # generated C++, so it is skipped
        for i, feat in enumerate(sz_feats_clean):
            printed_scale = "{scale:3.10f}".format(scale=scaler.scale_[i])
            if decimal.Decimal(printed_scale) == decimal.Decimal("0.0"):
                f.write("        // feature {feat} would have caused a division by zero, avoiding\n".format(
                    feat=feat.format(val="a")))
                continue

            f.write("        tmp = ((double){feat}-{mean:3.9f})/{printed_scale};\n".format(
                feat=feat.format(val="a"),
                mean=scaler.mean_[i],
                printed_scale=printed_scale))

            f.write("        dist += sq(tmp-center[{feat}]);\n\n".format(feat=i))

        f.write("""
    return dist;
}\n""")

        f.write("""
int ClusteringImp::which_is_closest(const SatZillaFeatures& p) const {
    double closest_dist = numeric_limits<double>::max();
    int closest = -1;
    for (int i: used_clusters) {
        double dist = norm_dist(p, centers[i]);
        if (dist < closest_dist) {
            closest_dist = dist;
            closest = i;
        }
    }
    return closest;
    }\n""")

    def select_features_files(self):
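        """Select the 'szfeat_cur' features, excluding the conflicts counter
        and any feature containing var/vcg/pnr/min/max/std."""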
        features = list(self.df)
        for f in list(features):
            if any(ext in f for ext in ["var", "vcg", "pnr", "min", "max", "std"]):
                features.remove(f)

        # features from dataframe
        feats_used = []
        for feat in features:
            if "szfeat_cur" in feat and "szfeat_cur.conflicts" not in feat:
                feats_used.append(feat)

        return feats_used

    def select_features_usefulness(self):
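        """Read the ordered list of best features from
        options.best_features_fname."""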
        feats_used = helper.get_features(options.best_features_fname)

        return feats_used

    def cluster(self):
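        """Select features, optionally scale them, fit K-Means and generate
        the C++ code. Returns (feats_used, scaler, clust)."""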
        if self.clust_type == CLUST_TYPE_FILES:
            self.feats_used = self.select_features_files()
        elif self.clust_type == CLUST_TYPE_USEFULNESS:
            self.feats_used = self.select_features_usefulness()
        else:
            assert False

        print("Features used: ", self.feats_used)
        print("Number of features used: ", len(self.feats_used))

        if options.check_row_data:
            helper.check_too_large_or_nan_values(self.df, self.feats_used)
        else:
            helper.check_too_large_or_nan_values(self.df.sample(100), self.feats_used)

        # fit to slice that only includes CNF features
        df_clust = self.df[self.feats_used].astype(float).copy()
        if options.scale:
            self.scaler = StandardScaler()
            self.scaler.fit(df_clust)
            if options.verbose:
                print("Scaler:")
                print(" -- ", self.scaler.mean_)
                print(" -- ", self.scaler.scale_)

            if options.verbose:
                df_clust_back = df_clust.copy()
            df_clust = self.scaler.transform(df_clust)
        else:
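            # identity "scaler": mean 0 and scale 1, so the code generation
            # below can treat the scaled and unscaled cases uniformly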
            class ScalerNone:
                def __init__(self):
                    self.mean_ = [0.0 for n in range(df_clust.shape[1])]
                    self.scale_ = [1.0 for n in range(df_clust.shape[1])]
            self.scaler = ScalerNone()

        # test scaler's code generation
        if options.scale and options.verbose:
            # we rely on this later in code generation
            # for scaler.mean_
            # for scaler.scale_
            # for cluster.cluster_centers_
            for i in range(df_clust_back.shape[1]):
                assert df_clust_back.columns[i] == self.feats_used[i]

            # checking that scaler works as expected
            for feat in range(df_clust_back.shape[1]):
                df_clust_back[df_clust_back.columns[feat]
                              ] -= self.scaler.mean_[feat]
                df_clust_back[df_clust_back.columns[feat]
                              ] /= self.scaler.scale_[feat]

            # scaler.transform() returned a numpy array above, so slice it
            # instead of calling .head()
            print(df_clust_back.head() - df_clust[:5])
            print(df_clust_back.head())
            print(df_clust[:5])

        self.clust = sklearn.cluster.KMeans(n_clusters=options.clusters, random_state=prng)
        self.clust.fit(df_clust)
        self.df["clust"] = self.clust.labels_

        # print information about the clusters
        if options.verbose:
            print(self.feats_used)
            print(self.clust.labels_)
            print(self.clust.cluster_centers_)
            print(self.clust.get_params())

        # code for cluster centers
        self.create_code_for_cluster_centers(
            self.clust, self.scaler, self.feats_used)

        return self.feats_used, self.scaler, self.clust


if __name__ == "__main__":
    usage = "usage: %(prog)s [options] file.pandas"
    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument("--seed", default=None, type=int,
                        dest="seed", help="Seed of PRNG")
    parser.add_argument("fnames", nargs='+', metavar='FILES',
                        help="Files to parse, first file is the one to base data off of")
    parser.add_argument("--verbose", "-v", action="store_true", default=False,
                        dest="verbose", help="Print more output")
    parser.add_argument("--printfeat", action="store_true", default=False,
                        dest="print_features", help="Print features")
    parser.add_argument("--check", action="store_true", default=False,
                        dest="check_row_data", help="Check row data for NaN or float overflow")
    parser.add_argument("--bestfeatfile",
                        default="../../scripts/crystal/best_features_usefulness_clust.txt",
                        type=str,
                        dest="best_features_fname",
                        help="Name and position of best features file that lists the best features in order")

    # clustering
    parser.add_argument("--clusters", default=4, type=int,
                        dest="clusters", help="How many clusters to use")
    parser.add_argument("--scale", default=False, action="store_true",
                        dest="scale", help="Scale clustering")
    parser.add_argument("--nocomputed", default=True, action="store_false",
                        dest="computed", help="Don't add computed features")
    parser.add_argument("--samples", default=1000,
                        dest="samples_per_file", help="Samples per file")

    # number of configs to generate
    parser.add_argument("--confs", default="2-2", type=str,
                        dest="confs", help="Configs to generate")

    # code
    parser.add_argument("--basedir", type=str,
                        dest="basedir", help="The base directory of where the CryptoMiniSat source code is")

    options = parser.parse_args()
    prng = np.random.RandomState(options.seed)

    if options.basedir is None:
        print("ERROR: you must set --basedir")
        sys.exit(-1)

    if options.fnames is None or len(options.fnames) == 0:
        print("ERROR: You must give the pandas file!")
        sys.exit(-1)

    fnames = [f for f in options.fnames if "clust" not in f]
    print("Will add clustering to files: ")
    for f in fnames:
        print("->", f)

    if options.clusters <= 0:
        print("ERROR: You must give a '--clusters' option that is greater than 0")
        sys.exit(-1)

    samples = None
    for f in fnames:
        print("===-- Sampling file %s --" % f)
        df = pd.read_pickle(f)
        print("options.samples_per_file:", options.samples_per_file)
        print("df.shape:", df.shape)
        new_samples = df.sample(options.samples_per_file, replace=True,
                                random_state=prng)
        # DataFrame.append() was removed in pandas 2.0; accumulate with
        # pd.concat() instead
        if samples is None:
            samples = new_samples
        else:
            samples = pd.concat([new_samples, samples])
        print("samples.shape:", samples.shape)
        del df

    if options.computed:
        helper.cldata_add_computed_features(samples, options.verbose)

    # clustering_setup[clust_type] is a (feats_used, scaler, clust) tuple
    clustering_setup = [None, None]
    # usefulness-based clustering (CLUST_TYPE_USEFULNESS) is currently
    # disabled: only file-based clustering is generated
    todo = [CLUST_TYPE_FILES]
    for clust_type in todo:
        c = Clustering(samples, clust_type)
        clustering_setup[clust_type] = c.cluster()
        name = get_cluster_name(clust_type)
        if termcolor_avail:
            cprint("===-- K-Means clustering **created** type: %s --" % name,
                   "green", "on_grey")
        else:
            print("===-- K-Means clustering **created** type: %s --" % name)

    del samples

    for f in fnames:
        print("===-- Clustering file %s" % (f))
        df_orig = pd.read_pickle(f)
        df = df_orig.copy()
        if options.computed:
            helper.cldata_add_computed_features(df, options.verbose)

        for clust_type in todo:
            name = get_cluster_name(clust_type)
            print("===-- type: %s --" % name)
            df_orig[name] = get_clustering_label(df, *clustering_setup[clust_type])
            print("Distribution: \n%s" % df_orig[name].value_counts())

        cleanname = re.sub(r'\.dat$', '', f)
        dump_dataframe(df_orig, cleanname+"-clustered.dat")
        del df
        del df_orig