File: pandas_data.py

package info (click to toggle)
dyda 1.41.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 106,148 kB
  • sloc: python: 19,978; makefile: 189; sh: 11
file content (502 lines) | stat: -rw-r--r-- 14,211 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame, Series
from dyda_utils import tools
from dyda_utils import lab_tools
from dyda_utils import data


def _cond_wd(x):
    """Return the weekday number of a date-like object.

    @param x: object exposing a ``weekday()`` method
              (e.g. datetime.date or pandas.Timestamp, for which
              Monday is 0 and Sunday is 6)
    @return the value of ``x.weekday()``

    """
    weekday = x.weekday()
    return weekday


def get_wd_series(date_series, fm='%Y-%m-%d'):
    """Parse a series of date strings and derive the weekday of each entry.

    @param date_series: Raw string series

    Keyword arguments:
    fm -- format of the input date (default: '%Y-%m-%d')

    @return tuple (dates, wd): the result of pd.to_datetime on the input
            and the series obtained by applying _cond_wd to every date

    """
    parsed = pd.to_datetime(date_series, format=fm)
    weekdays = parsed.apply(_cond_wd)
    return parsed, weekdays


def is_empty_df(df):
    """Tell whether a DataFrame holds no data.

    @param df: DataFrame (or any object exposing ``empty``)
    @return the value of ``df.empty``

    """
    has_no_data = df.empty
    return has_no_data


def rebin_df(df, nbins):
    """Rebin a DataFrame by averaging rows inside quantile bins of its index.

    @param df: input DataFrame
    @param nbins: quantile specification forwarded to pd.qcut
                  (e.g. the number of bins)
    @return DataFrame holding the mean of every bin

    """
    index_bins = pd.qcut(df.index, nbins)
    binned = df.groupby(index_bins)
    return binned.mean()


def sel_df_row(df, row_index):
    """Select row(s) of a DataFrame by integer position.

    @param df: input DataFrame
    @param row_index: positional indexer handed to ``df.iloc``
    @return the result of ``df.iloc[row_index]``

    """
    selected = df.iloc[row_index]
    return selected


def df_to_lab_anno(df_lab):
    """Turn a DataFrame built by lab_anno_to_df back into a list of
    per-row annotation dicts.

    @param df_lab: DataFrame with one annotation per row
    @return list of dicts, one per row, keyed by column name

    """
    records = df_lab.to_dict(orient="records")
    return records


def transpose_anno(lab_result):
    """Transpose annotations from a list of dicts to a dict of lists.

    The keys of the output come from the annotation template returned by
    lab_tools._lab_annotation_dic(), with "labinfo" left out. Every output
    list has one entry per annotation; annotations that lack a key
    contribute None at that position.

    @param lab_result: result dict; must pass
                       lab_tools.if_result_match_lab_format and is
                       expected to carry an "annotations" list of dicts
    @return dict mapping each template key to its list of values,
            or None (after printing a warning) when lab_result does not
            match the lab format

    """

    if not lab_tools.if_result_match_lab_format(lab_result):
        print("[pandas_data] WARNING: No lab format result found")
        return None

    annotations = lab_result["annotations"]
    anno_t = {}
    for key in lab_tools._lab_annotation_dic():
        if key == "labinfo":
            continue
        # dict.get yields the stored value when the key exists and None
        # otherwise, which is exactly what the per-key scan used to build
        anno_t[key] = [anno.get(key) for anno in annotations]

    return anno_t


def lab_anno_to_df(lab_result):
    """Build a pandas.DataFrame from the annotations of one lab result.

    The annotations are first transposed with transpose_anno, so every
    annotation key becomes a column.
    (original author's timing note: 0.00093 on gc to run one result)

    @param lab_result: result dict accepted by transpose_anno
    @return DataFrame of annotations, or None when transpose_anno
            returns None

    """

    transposed = transpose_anno(lab_result)
    return None if transposed is None else pd.DataFrame.from_dict(transposed)


def create_anno_df_and_concat(lab_results_list, debug=True):
    """Create one DataFrame out of a list of lab results.

    Each result is converted with lab_anno_to_df; results that yield no
    DataFrame (None) or an empty one are skipped. The remaining frames are
    concatenated with pd.concat, keyed by the result's "filename" when it
    is a non-empty string and by the result's position in the list
    (as a string) otherwise.

    Original author's timing notes:
        0.00422 seconds on gc2 to concat three results
        Note: the performance can be bad if annotations are big
        > 100ms for AIKEA results

    @param lab_results_list: list of lab result dicts

    Keyword arguments:
        debug -- True to print a warning for every skipped result
                 (default: True)

    @return concatenated DataFrame, or None (after printing a warning)
            when there is nothing to concatenate

    """

    df_list = []
    df_key_list = []
    for i, result in enumerate(lab_results_list):
        df = lab_anno_to_df(result)
        # lab_anno_to_df returns None for results that are not in lab
        # format; treat that the same way as an empty DataFrame
        if df is None or is_empty_df(df):
            if debug:
                print('[pandas_data] WARNING: empty DataFrame detected')
            continue

        df_list.append(df)

        filename = result["filename"]
        if isinstance(filename, str) and len(filename) > 0:
            df_key_list.append(filename)
        else:
            df_key_list.append(str(i))

    # must be checked after the loop: pd.concat raises on an empty list
    if len(df_list) == 0:
        print('[pandas_data] WARNING: no DataFrame concated')
        return None

    return pd.concat(df_list, keys=df_key_list)


def group_df(df, groups, comp_rule='mean'):
    """Group a DataFrame and aggregate every group.

    @param df: input DataFrame
    @groups: key or list of keys used for groupby (e.g. ['label', 'id']);
             a single str or int is wrapped into a list

    Keyword arguments:
        comp_rule -- 'mean' to aggregate with the mean of numeric columns,
                     'off' to return the un-aggregated groupby object,
                     anything else to aggregate with the sum
                     (default: 'mean')

    @return aggregated DataFrame (or groupby object for 'off');
            None when groups is not a str, int or list
    """

    if isinstance(groups, (str, int)):
        groups = [groups]
    elif not isinstance(groups, list):
        print('[pandas_data] input groups is not str, int or list.')
        return None

    grouped = df.groupby(groups)
    if comp_rule == 'off':
        return grouped
    if comp_rule == 'mean':
        return grouped.mean(numeric_only=True)
    return grouped.sum()


def select_item_from_target_values(
        df, groups, sel_name, target, filter_rule='max', comp_rule='mean'):
    """ Select item value from DataFrame based on the max (or min) value of
    the aggregated target.

    The DataFrame is grouped with group_df, the extreme value of the
    target column among the groups is located, and the sel_name component
    of the first group reaching that value is returned.

    @param df: input DataFrame
    @groups: key or list of keys used for groupby (e.g. ['label', 'id'])
    @sel_name: the group key whose value you want to select
               (label, track_id, etc); must be one of groups
    @target: the column which will be aggregated (e.g. confidence)

    Keyword arguments:
        filter_rule -- max to select max in mean, else to select min in mean
        comp_rule -- forwarded to group_df: mean to compare the mean,
                     else to compare sum

    @return tuple (selected_value, filter_value)

    Original author's timing notes:
    0.00226 seconds on gc2 to group and get mean
    0.00014 seconds on gc2 to cal max
    0.00057 seconds on gc2 to extra target

    """

    # group_df accepts a bare key as well; normalise it here so that
    # groups.index() below always works on a list
    if isinstance(groups, (str, int)):
        groups = [groups]

    mean_df = group_df(
        df, groups, comp_rule=comp_rule
    )

    if filter_rule == 'max':
        filter_value = mean_df[target].max()
    else:
        filter_value = mean_df[target].min()

    sel_index = groups.index(sel_name)
    first_match = mean_df.loc[
        mean_df[target] == filter_value].index.values[0]

    # Grouping by several keys yields a MultiIndex whose entries are
    # tuples; grouping by a single key yields a flat index whose entries
    # are the labels themselves and must not be subscripted.
    if isinstance(mean_df.index, pd.MultiIndex):
        selected_value = first_match[sel_index]
    else:
        selected_value = first_match

    return selected_value, filter_value


def norm_df(raw_df, exclude=None):
    """Normalize pandas DataFrame

    Every normalized column is transformed to
    (value - column mean) / (column max - column min).

    @param raw_df: raw input dataframe

    Keyword arguments:
    exclude -- a column label or a list of column labels to be excluded
               from the normalization; excluded columns are returned
               unchanged, placed before the normalized ones
               (default: None)

    @return normalized DataFrame

    """

    if exclude is None:
        return (raw_df - raw_df.mean()) / (raw_df.max() - raw_df.min())

    # accept a single label as well as a collection of labels
    if isinstance(exclude, str) or not hasattr(exclude, '__iter__'):
        exclude = [exclude]
    else:
        exclude = list(exclude)

    excluded = raw_df[exclude]
    _r = raw_df.drop(exclude, axis=1)
    _r = (_r - _r.mean()) / (_r.max() - _r.min())
    # The two frames share the row index but, by construction, no column,
    # so they are put side by side (pd.merge would find no join key).
    return pd.concat([excluded, _r], axis=1)


def filter_df_col(df, key, filter_value):
    """Keep only the rows whose column equals a given value

    @param df: input DataFrame
    @param key: name of the column to compare
    @param filter_value: value the column must be equal to
                         (should match key type)
    @return DataFrame holding the matching rows
    """
    matches = df[key] == filter_value
    return df.loc[matches]


def export_df_csv(df, csvfile="./df.csv"):
    """Write a DataFrame to a csv file

    @param df: DataFrame to be written

    Keyword arguments:
    csvfile -- output file name (default: ./df.csv)

    @return True once the file has been written

    """

    df.to_csv(path_or_buf=csvfile)
    return True


def conv_csv_df(csvfile, target=0):
    """Convert csv file to dataframe

    The first column of the file is used as the index and parsed as
    dates when possible; the first row is used as the header.

    @param csvfile: file name of the csv to be read

    Keyword arguments:

    target   -- target column (default: 0); currently unused, kept for
                backward compatibility

    @return pandas DataFrame

    """
    # DataFrame.from_csv no longer exists in pandas; this read_csv call is
    # the replacement documented by pandas for its default behaviour.
    return pd.read_csv(csvfile, index_col=0, parse_dates=True)


def conv_to_df(array, ffields=None, target=None):
    """Build a pandas.DataFrame out of an array

    @param array: input array to be converted

    Keyword arguments:
    ffields -- json file of the fields, parsed with data.parse_json and
               used as column names (default: None)
    target  -- if ffields is specified, integer position of the field
               to be renamed to 'target' (default: None)

    @return pandas DataFrame

    """
    if ffields is None:
        return pd.DataFrame(array)

    fields = data.parse_json(ffields)
    if isinstance(target, int):
        print('[pandas_data] Converting field from %s to target'
              % fields[target])
        fields[target] = 'target'
    return pd.DataFrame(array, columns=fields)


def df_header(df):
    """Print and return the column names of a DataFrame

    @param df: input DataFrame
    @return list of the column names

    """

    columns = df.columns.values.tolist()
    print('[pandas_data] DataFrame header:')
    print(columns)
    return columns


def read_json_to_df(fname, orient='columns', np=False):
    """Read json file as pandas DataFrame

    @param fname: input filename

    Keyword arguments:
    orient -- split/records/index/columns/values (default: 'columns')
    np     -- true to direct decoding to numpy arrays (default: False).
              Forwarded to pd.read_json as ``numpy`` only when true,
              because recent pandas releases no longer accept that
              keyword; requesting it there raises TypeError.
    @return pandas DataFrame, or None when tools.check_exist(fname)
            is false

    """
    if not tools.check_exist(fname):
        return None

    read_kwargs = {'orient': orient}
    if np:
        # only pass the legacy keyword when explicitly requested so the
        # default call keeps working on every pandas version
        read_kwargs['numpy'] = np
    return pd.read_json(fname, **read_kwargs)


def read_jsons_to_df(flist, orient='columns', np=False):
    """Read several json files and stack them into one pandas DataFrame

    Every file is read with read_json_to_df and the results are handed
    to pd.concat.

    @param flist: input file list

    Keyword arguments:
    orient -- split/records/index/columns/values (default: 'columns')
    np     -- true to direct decoding to numpy arrays (default: False)
    @return concatenated pandas DataFrame

    """
    frames = [read_json_to_df(f, orient=orient, np=np) for f in flist]
    return pd.concat(frames)


def write_df_json(self, df, fname='df.json'):
    """Write pandas.DataFrame to json output

    @param self: unused placeholder; this is a module-level function,
                 but the first positional argument is still required
                 (any value, e.g. None)
    @param df: DataFrame to be written

    Keyword arguments:
    fname -- output file name (default: df.json)

    """

    df.to_json(path_or_buf=fname)
    message = '[pandas_data] DataFrame is written to %s' % fname
    print(message)


def conv_to_np(array):
    """Convert DataFrame, Series or list to np.ndarray

    @param array: DataFrame, Series, list, or an object accepted by
                  tools.is_np
    @return np.ndarray for the supported inputs; any other input is
            returned unchanged after a warning is printed

    """

    if isinstance(array, (DataFrame, Series)):
        # as_matrix() no longer exists in pandas; .values is the
        # equivalent accessor available in old and new releases alike
        return array.values

    if isinstance(array, list):
        return np.array(array)

    if tools.is_np(array):
        return array

    print("[pandas_data] WARNING: the type of input array is not correct!")
    print(type(array))
    return array


def conv_csv_svmft(csvfile, target=0, ftype=float, classify=True):
    """Read a csv file and write it out in SVM format

    The file is read with tools.read_csv, split into the target column
    and the remaining columns, and handed to write_svmft (which writes
    to its default output file).

    @param csvfile: file name of the csv to be read

    Keyword arguments:

    target   -- target column (default: 0)
    ftype    -- convert data to the type (default: float)
    classify -- true convert target to int type (default: True)

    """
    df = conv_to_df(tools.read_csv(csvfile, ftype=ftype))

    feature_df = df.drop(df.columns[[target]], axis=1)
    features = conv_to_np(feature_df)
    target_values = conv_to_np(df[target])

    write_svmft(target_values, features, classify=classify)


def write_svmft(target, data, classify=True,
                fname='./data.svmft'):
    """Output data with the format libsvm/wusvm accepts

    One line is written per sample: the target followed by
    ``<1-based feature index>:<value>`` pairs, separated by spaces
    (each line ends with a space before the newline).

    @param target: array of the target (1D)
    @param data: array of the data (multi-dimensional)

    Keyword arguments:
    classify -- true convert target to int type (default: True)
    fname    -- output file name (default: ./data.svmft)

    Raises ValueError when target and data differ in length.

    """

    # The ``data`` parameter shadows the imported ``data`` module inside
    # this function, so the module's length helper cannot be reached here;
    # the lengths are compared directly instead.
    length = len(target)
    if len(data) != length:
        raise ValueError(
            'target and data must have the same length (%d != %d)'
            % (length, len(data)))

    if classify:
        target = conv_to_np(target)
        target = target.astype(int)

    with open(fname, 'w') as outf:
        for i in range(length):
            output = [str(target[i])]
            output.extend(
                str(j) + ':' + str(value)
                for j, value in enumerate(data[i], start=1))
            output.append('\n')
            outf.write(' '.join(output))


def append_data_to_df(df, row_name, column_name, to_append):
    """Function-style wrapper around ``df.loc[row, column] = value``

    @param df: DataFrame to be modified in place
    @param row_name: row label of the cell
    @param column_name: column label of the cell
    @param to_append: value to store in the cell
    @return the same DataFrame object that was passed in

    """
    df.loc[row_name, column_name] = to_append
    return df


def _integer_generator():
    """
        Endless counter: yields 0, 1, 2, ... without ever stopping.
        It is used to produce default column names for the DataFrame
        built by _record_time_in_df().
    """
    current = -1
    while True:
        current += 1
        yield current


def _record_time_in_df(event_name):
    """
       Generator that keeps the state used by record_time_in_df():
       the DataFrame being filled, the timer generator obtained from
       tools._record_time(), and the counter that supplies default
       column names.

       Protocol: the first resume runs up to a bare ``yield`` (which
       produces None). Every later ``send(column_name)`` stores the next
       value of the timer generator in row ``event_name`` under
       ``column_name`` (or under the next counter value when None is
       sent) and yields the DataFrame.

       @param event_name: row label used for every recorded value
    """
    df = pd.DataFrame()
    # NOTE(review): presumably each next() on this generator yields the
    # time elapsed since the previous one - confirm in tools._record_time
    gen = tools._record_time()
    col = _integer_generator()
    # advance the timer generator once before any value is recorded
    next(gen)
    # first pause: produces None and waits for the first column name
    column_name = yield

    while True:
        if column_name is None:
            # no name supplied by the caller: use the running counter
            column_name = next(col)
        # record one timer value, hand the DataFrame back, then wait
        # for the column name of the next record
        column_name = (yield append_data_to_df(df, event_name,
                                               column_name,
                                               to_append=next(gen)))


def record_time_in_df(action=None,
                      gen=_record_time_in_df('event'),
                      **kwargs):
    """
       Record timings in a DataFrame across successive calls.

       The first call starts the recorder and returns None. Every later
       call appends one value to the DataFrame and returns it, so calling
       three times gives the timing of first-to-second and
       second-to-third call.

       Example:
               record_time_in_df()
               code1
               record_time_in_df() ==> output                  0
                                               event  code1_time
               code2
               record_time_in_df() ==> output                  0           1
                                               event  code1_time  code2_time

       The default recorder is shared by all calls that do not pass gen,
       and its row name is event. A separate recorder with its own row
       name is created with action='start' (which returns the recorder,
       already started) and is then passed back through gen.

       Example:
               recorder1 = record_time_in_df(action='start',
                                             event_name='re1')
               code1
               record_time_in_df(gen=recorder1)  ==> output               0
                                                            re1  code1_time

       Column names default to 0, 1, 2, ... A column name can be chosen
       for a record by passing column_name.

       Example:
              record_time_in_df()
              code
              record_time_in_df(column_name='run') ==> output              run
                                                              event  code_time

       Note: The very first call on the default recorder can not set
             column_name, or we will get error (a value can not be sent
             to a generator that has not been started).

       Keyword arguments:
       action      -- 'start' to create and return a new recorder
       gen         -- recorder to use (default: the shared one)
       event_name  -- row name of a new recorder (with action='start')
       column_name -- column name for this record

    """

    if action != 'start':
        return gen.send(kwargs.get('column_name', None))

    recorder = _record_time_in_df(kwargs.get('event_name', 'event'))
    # run the new recorder up to its first pause so it is ready for send()
    next(recorder)
    return recorder


def drop_continuous_duplicates(df):
    '''
        Remove rows that repeat the row directly above them, keeping the
        first row of every run. Rows are compared by position, so the
        index labels may be anything. If you want to remove all
        duplicates, you should use df.drop_duplicates().

        example:
        df >>    a b
              1  1 1
              2  1 1
              3  2 2
              4  2 2
              5  1 1

        drop_continuous_duplicates(df) >>    a b
                                          1  1 1
                                          3  2 2
                                          5  1 1

        @param df: input DataFrame
        @return DataFrame without the consecutive duplicates; an empty
                input gives an empty result
    '''
    keep = []
    previous = None
    for _, row in df.iterrows():
        current = row.tolist()
        # the first row is always kept; later rows only when they differ
        # from the row right before them
        keep.append(previous is None or current != previous)
        previous = current
    return df[np.array(keep, dtype=bool)]


def is_pandas_df(test_object):
    """Tell whether an object is a pandas DataFrame

    @param test_object: object to be checked
    @return True when test_object is a DataFrame instance, else False

    """
    is_frame = isinstance(test_object, DataFrame)
    return is_frame