File: list_update.py

package info (click to toggle)
orange3 3.40.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,912 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (51 lines) | stat: -rw-r--r-- 1,564 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import json

import Orange

# Datasets that are described from a remote location instead of a local
# '.tab' file.  Each entry is a (name, location) pair; the script section
# at the bottom of this file hands both values to data_info().
external_datasets = [
    (
        "iris_url",
        "https://raw.githubusercontent.com/biolab/orange3/master/Orange/datasets/iris.tab",
    ),
]


def data_info(name, location):
    """Build a summary dictionary for the dataset found at `location`.

    The dataset is loaded with ``Orange.data.Table(location)``.

    Args:
        name: Label stored under the 'name' key of the result.
        location: Value passed to ``Orange.data.Table``; also stored under
            the 'location' key of the result.

    Returns:
        A dict with the keys:

        - 'name', 'location': the arguments, unchanged.
        - 'rows': ``len()`` of the loaded table.
        - 'features': a dict with the number of 'discrete' and
          'continuous' attributes and the number of 'meta' variables.
        - 'missing': ``data.has_missing()`` coerced to ``bool``.
        - 'target': a dict where 'type' is 'discrete' or 'continuous'
          when the domain reports a discrete / continuous class, a list
          of those labels (one per class variable) when there is more
          than one class variable, and False otherwise; 'values' is the
          number of class values for a discrete class, else None.
    """
    table = Orange.data.Table(location)
    dom = table.domain

    # Tally attribute kinds in one pass; adding the boolean flags to an
    # integer counter is the same arithmetic as summing them.
    discrete_count = 0
    continuous_count = 0
    for variable in dom.attributes:
        discrete_count += variable.is_discrete
        continuous_count += variable.is_continuous

    # 'values' is only meaningful for a single discrete class variable.
    target_values = None
    if dom.has_discrete_class:
        target_type = 'discrete'
        target_values = len(dom.class_var.values)
    elif dom.has_continuous_class:
        target_type = 'continuous'
    elif len(dom.class_vars) > 1:
        # Multi-target: describe every class variable individually.
        target_type = []
        for variable in dom.class_vars:
            if variable.is_discrete:
                target_type.append('discrete')
            else:
                target_type.append('continuous')
    else:
        target_type = False

    feature_summary = {
        'discrete': discrete_count,
        'continuous': continuous_count,
        'meta': len(dom.metas),
    }
    target_summary = {
        'type': target_type,
        'values': target_values,
    }
    return {
        'name': name,
        'location': location,
        'rows': len(table),
        'features': feature_summary,
        'missing': bool(table.has_missing()),
        'target': target_summary,
    }

if __name__ == "__main__":
    # Describe the external datasets first, so that a local '.tab' file
    # with the same base name replaces the external entry.
    info = {
        label: data_info(label, source)
        for label, source in external_datasets
    }

    # Add every regular '.tab' file in the current working directory,
    # keyed by its file name without the extension.
    for entry in os.listdir('.'):
        stem, suffix = os.path.splitext(entry)
        if suffix == '.tab' and os.path.isfile(entry):
            info[stem] = data_info(stem, entry)

    # Keys are sorted on output, so the written file does not depend on
    # the order in which the directory entries were listed.
    with open('datasets.info', 'w') as out:
        json.dump(info, out, indent=4, sort_keys=True)