File: utils.py

package info (click to toggle)
pydataverse 0.3.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,168 kB
  • sloc: python: 4,862; sh: 61; makefile: 13
file content (669 lines) | stat: -rw-r--r-- 17,804 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
"""Helper functions."""

import csv
import json
import os
import pickle

from jsonschema import validate


# Dataset metadata column names whose CSV cell values are JSON-encoded
# strings (compound or multi-valued Dataverse metadata fields).
# `read_csv_as_dicts()` runs each of these columns through `json.loads()`
# when importing a CSV file.
CSV_JSON_COLS = [
    "otherId",
    "series",
    "author",
    "dsDescription",
    "subject",
    "keyword",
    "topicClassification",
    "language",
    "grantNumber",
    "dateOfCollection",
    "kindOfData",
    "dataSources",
    "otherReferences",
    "contributor",
    "relatedDatasets",
    "relatedMaterial",
    "datasetContact",
    "distributor",
    "producer",
    "publication",
    "software",
    "timePeriodCovered",
    "geographicUnit",
    "geographicBoundingBox",
    "geographicCoverage",
    "socialScienceNotes",
    "unitOfAnalysis",
    "universe",
    "targetSampleActualSize",
    "categories",
]


def read_file(filename, mode="r", encoding="utf-8"):
    """Return the full contents of a text file as a string.

    Parameters
    ----------
    filename : str
        Filename with full path.
    mode : str
        Read mode of file. Defaults to `r`. See more at
        https://docs.python.org/3/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    str
        File contents as a single string.

    """
    for arg in (filename, mode, encoding):
        assert isinstance(arg, str)

    with open(filename, mode, encoding=encoding) as handle:
        content = handle.read()

    assert isinstance(content, str)
    return content


def write_file(filename, data, mode="w", encoding="utf-8"):
    """Write string data to a file.

    Parameters
    ----------
    filename : str
        Filename with full path.
    data : str
        Data to be stored.
    mode : str
        Write mode of file. Defaults to `w`. See more at
        https://docs.python.org/3/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    """
    for arg in (filename, data, mode, encoding):
        assert isinstance(arg, str)

    with open(filename, mode, encoding=encoding) as handle:
        handle.write(data)


def read_json(filename: str, mode: str = "r", encoding: str = "utf-8") -> dict:
    """Deserialize a JSON file into a Python data structure.

    See more about the json module at
    https://docs.python.org/3/library/json.html

    Parameters
    ----------
    filename : str
        Filename with full path.
    mode : str
        Read mode of file. Defaults to `r`. See more at
        https://docs.python.org/3/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    dict
        Parsed JSON content.

    """
    # TODO: add kwargs
    with open(filename, mode, encoding=encoding) as handle:
        return json.load(handle)


def write_json(filename, data, mode="w", encoding="utf-8"):
    """Serialize data to a JSON file.

    Parameters
    ----------
    filename : str
        Filename with full path.
    data : dict
        Data to be written in the JSON file.
    mode : str
        Write mode of file. Defaults to `w`. See more at
        https://docs.python.org/3/library/functions.html#open
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    """
    with open(filename, mode, encoding=encoding) as handle:
        # indent=2 keeps the output human-readable/diff-friendly
        json.dump(data, handle, indent=2)


def read_pickle(filename):
    """Load a :class:`dict` from a pickle file.

    See more at `pickle <https://docs.python.org/3/library/pickle.html>`_.

    NOTE(review): unpickling can execute arbitrary code — only use this on
    files from a trusted source.

    Parameters
    ----------
    filename : str
        Full filename with path of file.

    Returns
    -------
    dict
        Data object.

    """
    assert isinstance(filename, str)

    with open(filename, "rb") as handle:
        data = pickle.load(handle)

    assert isinstance(data, dict)
    return data


def write_pickle(filename, data):
    """Store a :class:`dict` in a pickle file.

    See more at `pickle <https://docs.python.org/3/library/pickle.html>`_.

    Parameters
    ----------
    filename : str
        Full filename with path of file.
    data : dict
        Data to write in pickle file.

    """
    assert isinstance(filename, str)
    assert isinstance(data, dict)

    with open(filename, "wb") as handle:
        pickle.dump(data, handle)


def read_csv(filename, newline="", delimiter=",", quotechar='"', encoding="utf-8"):
    """Read in a CSV file.

    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    filename : str
        Full filename with path of file.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    Returns
    -------
    list
        List of rows; each row is a list of cell strings. Rows are fully
        read before the file is closed, so the result can be iterated
        (the previous version returned the live reader of an already
        closed file, which raised on iteration).

    """
    assert isinstance(filename, str)
    assert isinstance(newline, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)
    assert isinstance(encoding, str)

    with open(filename, newline=newline, encoding=encoding) as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
        # Materialize all rows while the file handle is still open.
        # The old `assert isinstance(csv_reader, csv.reader)` was also
        # invalid: `csv.reader` is a factory function, not a type, so the
        # check raised TypeError on every call.
        return list(csv_reader)


def write_csv(
    data, filename, newline="", delimiter=",", quotechar='"', encoding="utf-8"
):
    """Write rows of data to a CSV file.

    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    data : list
        List of rows; each row is passed to ``csv.writer``'s row writer,
        so it should be a sequence of cell values (note: passing a dict
        writes only its keys).
    filename : str
        Full filename with path of file.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.

    """
    for arg in (filename, newline, delimiter, quotechar, encoding):
        assert isinstance(arg, str)
    assert isinstance(data, list)

    with open(filename, "w", newline=newline, encoding=encoding) as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=delimiter, quotechar=quotechar)
        csv_writer.writerows(data)


def read_csv_as_dicts(
    filename,
    newline="",
    delimiter=",",
    quotechar='"',
    encoding="utf-8",
    remove_prefix=True,
    prefix="dv.",
    json_cols=CSV_JSON_COLS,
    false_values=["FALSE"],
    true_values=["TRUE"],
):
    """Read in CSV file into a list of :class:`dict`.

    This offers an easy import functionality of your data from CSV files.
    See more at
    `csv <https://docs.python.org/3/library/csv.html>`_.

    CSV file structure:
    1) The header row contains the column names.
    2) A row contains one dataset
    3) A column contains one specific attribute.

    Recommendation: Name the column name the way you want the attribute to be
    named later in your Dataverse object. See the
    `pyDataverse templates <https://github.com/GDCC/pyDataverse_templates>`_
    for this. The created :class:`dict` can later be used for the `set()`
    function to create Dataverse objects.

    Parameters
    ----------
    filename : str
        Filename with full path.
    newline : str
        Newline character.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.
    encoding : str
        Character encoding of file. Defaults to 'utf-8'.
    remove_prefix : bool
        Strip ``prefix`` from column names. Defaults to ``True``.
    prefix : str
        Column-name prefix to strip. Defaults to ``'dv.'``.
    json_cols : list
        Column names whose cell values are parsed with ``json.loads()``.
    false_values : list
        Cell values converted to ``False``.
    true_values : list
        Cell values converted to ``True``.

    Returns
    -------
    list
        List with one :class:`dict` each row. The keys of a :class:`dict` are
        named after the column names.

    """
    assert isinstance(filename, str)
    assert isinstance(newline, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)
    assert isinstance(encoding, str)

    with open(filename, "r", newline=newline, encoding=encoding) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter, quotechar=quotechar)
        data = [dict(row) for row in reader]

    # Convert boolean-like cell values.
    # BUG FIX: the previous version assigned False and then immediately
    # overwrote it with True, so cells matching `false_values` were
    # imported as True.
    data_tmp = []
    for ds in data:
        ds_tmp = {}
        for key, val in ds.items():
            if val in false_values:
                ds_tmp[key] = False
            elif val in true_values:
                ds_tmp[key] = True
            else:
                ds_tmp[key] = val
        data_tmp.append(ds_tmp)
    data = data_tmp

    # Optionally strip the column-name prefix (e.g. "dv.title" -> "title").
    if remove_prefix:
        data_tmp = []
        for ds in data:
            ds_tmp = {}
            for key, val in ds.items():
                if key.startswith(prefix):
                    ds_tmp[key[len(prefix) :]] = val
                else:
                    ds_tmp[key] = val
            data_tmp.append(ds_tmp)
        data = data_tmp

    # Parse JSON-encoded cells for the configured columns.
    if len(json_cols) > 0:
        data_tmp = []
        for ds in data:
            ds_tmp = {}
            for key, val in ds.items():
                if key in json_cols:
                    ds_tmp[key] = json.loads(val)
                else:
                    ds_tmp[key] = val
            data_tmp.append(ds_tmp)
        data = data_tmp

    return data


def write_dicts_as_csv(data, fieldnames, filename, delimiter=",", quotechar='"'):
    """Write a list of :class:`dict` to a CSV file.

    This offers an easy export functionality of your data to a CSV file.
    See more at `csv <https://docs.python.org/3/library/csv.html>`_.

    Parameters
    ----------
    data : list
        List of :class:`dict` with columns as keys, to be written in the
        CSV file. Values that are dicts or lists are JSON-encoded.
    fieldnames : list
        Sequence of keys that identify the order of the columns.
    filename : str
        Filename with full path.
    delimiter : str
        Cell delimiter of CSV file. Defaults to ','.
    quotechar : str
        Quote-character of CSV file. Defaults to '"'.

    """
    # BUG FIX: was `assert isinstance(data, str)`, which rejected the
    # documented list-of-dicts input.
    assert isinstance(data, list)
    assert isinstance(fieldnames, list)
    assert isinstance(filename, str)
    assert isinstance(delimiter, str)
    assert isinstance(quotechar, str)

    with open(filename, "w", newline="") as csvfile:
        # BUG FIX: delimiter/quotechar were accepted but never forwarded
        # to the DictWriter.
        writer = csv.DictWriter(
            csvfile, fieldnames=fieldnames, delimiter=delimiter, quotechar=quotechar
        )
        writer.writeheader()

        for d in data:
            # Build a converted copy instead of mutating the caller's dict.
            # BUG FIX: was `json.dump(val)` (writes to a file object and
            # raised TypeError); `json.dumps` returns the encoded string.
            row = {
                key: json.dumps(val) if isinstance(val, (dict, list)) else val
                for key, val in d.items()
            }
            writer.writerow(row)


def clean_string(string):
    """Clean a string.

    Trims leading/trailing whitespace and collapses runs of spaces into a
    single space.

    Parameters
    ----------
    string : str
        String to be cleaned.

    Returns
    -------
    str
        Cleaned string.

    """
    assert isinstance(string, str)

    clean_str = string.strip()
    # BUG FIX: a single `replace("  ", " ")` pass only halves longer runs
    # (e.g. four spaces became two); loop until fully collapsed.
    while "  " in clean_str:
        clean_str = clean_str.replace("  ", " ")

    assert isinstance(clean_str, str)
    return clean_str


def validate_data(data: dict, filename_schema: str, file_format: str = "json") -> bool:
    """Validate data against a schema.

    Parameters
    ----------
    data : dict
        Data to be validated.
    filename_schema : str
        Filename with full path of the schema file.
    file_format : str
        File format to be validated. Only ``"json"`` is implemented.

    Returns
    -------
    bool
        `True` if data was validated, `False` if not.

    """
    assert isinstance(data, dict)
    assert isinstance(filename_schema, str)
    assert isinstance(file_format, str)

    # Guard clauses for unsupported formats; JSON is the only implemented one.
    if file_format == "xml":
        print("INFO: Not implemented yet.")
        return False
    if file_format != "json":
        print("WARNING: No valid format passed.")
        return False

    schema = read_json(filename_schema)
    # jsonschema.validate raises ValidationError on failure.
    validate(instance=data, schema=schema)
    return True


def create_dataverse_url(base_url, identifier):
    """Creates URL of Dataverse.

    Example: https://data.aussda.at/dataverse/autnes

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Can either be a dataverse id (long), a dataverse alias (more
        robust), or the special value ``:root``.

    Returns
    -------
    str
        URL of the dataverse

    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)

    # Strip a trailing slash so the joined URL has exactly one separator.
    url = f"{base_url.rstrip('/')}/dataverse/{identifier}"

    assert isinstance(url, str)
    return url


def create_dataset_url(base_url, identifier, is_pid):
    """Creates URL of Dataset.

    Example: https://data.aussda.at/dataset.xhtml?persistentId=doi:10.11587/CCESLK

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Identifier of the dataset. Can be dataset id or persistent
        identifier of the dataset (e. g. doi).
    is_pid : bool
        ``True`` to use persistent identifier. ``False``, if not.

    Returns
    -------
    str
        URL of the dataset

    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)
    assert isinstance(is_pid, bool)

    base_url = base_url.rstrip("/")
    if is_pid:
        url = "{0}/dataset.xhtml?persistentId={1}".format(base_url, identifier)
    else:
        # BUG FIX: query string was "?id{1}" — missing "=" produced an
        # invalid URL like ".../dataset.xhtml?id42".
        url = "{0}/dataset.xhtml?id={1}".format(base_url, identifier)
    assert isinstance(url, str)
    return url


def create_datafile_url(base_url, identifier, is_filepid):
    """Creates URL of Datafile.

    Example
    - File ID: https://data.aussda.at/file.xhtml?persistentId=doi:10.11587/CCESLK/5RH5GK

    Parameters
    ----------
    base_url : str
        Base URL of Dataverse instance
    identifier : str
        Identifier of the datafile. Can be datafile id or persistent
        identifier of the datafile (e. g. doi).
    is_filepid : bool
        ``True`` to use persistent identifier. ``False``, if not.

    Returns
    -------
    str
        URL of the datafile

    """
    assert isinstance(base_url, str)
    assert isinstance(identifier, str)
    # Consistency fix: validate the flag like the sibling
    # `create_dataset_url` validates `is_pid`.
    assert isinstance(is_filepid, bool)

    base_url = base_url.rstrip("/")
    if is_filepid:
        url = "{0}/file.xhtml?persistentId={1}".format(base_url, identifier)
    else:
        url = "{0}/file.xhtml?fileId={1}".format(base_url, identifier)
    assert isinstance(url, str)
    return url


def dataverse_tree_walker(
    data: list,
    dv_keys: list = ["dataverse_id", "dataverse_alias"],
    ds_keys: list = ["dataset_id", "pid"],
    df_keys: list = ["datafile_id", "filename", "pid", "label"],
) -> tuple:
    """Walk through a Dataverse tree by get_children().

    Recursively walk through the tree structure returned by ``get_children()``
    and extract the keys needed.

    Parameters
    ----------
    data : list or dict
        Tree data structure returned by ``get_children()``.
    dv_keys : list
        List of keys to be extracted from each Dataverse element.
    ds_keys : list
        List of keys to be extracted from each Dataset element.
    df_keys : list
        List of keys to be extracted from each Datafile element.

    Returns
    -------
    tuple
        (List of Dataverse, List of Datasets, List of Datafiles)
    """
    dataverses = []
    datasets = []
    datafiles = []

    if isinstance(data, list):
        for elem in data:
            # BUG FIX: recursive calls previously dropped the caller's
            # key lists, so custom keys only applied to the top level.
            dv, ds, df = dataverse_tree_walker(
                elem, dv_keys=dv_keys, ds_keys=ds_keys, df_keys=df_keys
            )
            dataverses += dv
            datasets += ds
            datafiles += df
    elif isinstance(data, dict):
        if data["type"] == "dataverse":
            dv_tmp = {}
            for key in dv_keys:
                if key in data:
                    dv_tmp[key] = data[key]
            dataverses.append(dv_tmp)
        elif data["type"] == "dataset":
            ds_tmp = {}
            for key in ds_keys:
                if key in data:
                    ds_tmp[key] = data[key]
            datasets.append(ds_tmp)
        elif data["type"] == "datafile":
            df_tmp = {}
            for key in df_keys:
                if key in data:
                    df_tmp[key] = data[key]
            datafiles.append(df_tmp)
        if "children" in data:
            if len(data["children"]) > 0:
                # Same fix: propagate the key lists into child subtrees.
                dv, ds, df = dataverse_tree_walker(
                    data["children"], dv_keys=dv_keys, ds_keys=ds_keys, df_keys=df_keys
                )
                dataverses += dv
                datasets += ds
                datafiles += df
    return dataverses, datasets, datafiles


def save_tree_data(
    dataverses: list,
    datasets: list,
    datafiles: list,
    filename_dv: str = "dataverses.json",
    filename_ds: str = "datasets.json",
    filename_df: str = "datafiles.json",
    filename_md: str = "metadata.json",
) -> None:
    """Save lists from data returned by ``dataverse_tree_walker``.

    Writes the Dataverse, Dataset and Datafile lists to separate JSON
    files (only when non-empty), plus a metadata JSON file with the counts,
    and prints a summary.

    Parameters
    ----------
    dataverses : list
        List of Dataverse :class:`dict`.
    datasets : list
        List of Dataset :class:`dict`.
    datafiles : list
        List of Datafile :class:`dict`.
    filename_dv : str
        Filename with full path for the Dataverse JSON file.
    filename_ds : str
        Filename with full path for the Dataset JSON file.
    filename_df : str
        Filename with full path for the Datafile JSON file.
    filename_md : str
        Filename with full path for the metadata JSON file.
    """
    # Remove stale output files before writing fresh ones
    # (the metadata file is simply overwritten below).
    for fname in (filename_dv, filename_ds, filename_df):
        if os.path.isfile(fname):
            os.remove(fname)

    if dataverses:
        write_json(filename_dv, dataverses)
    if datasets:
        write_json(filename_ds, datasets)
    if datafiles:
        write_json(filename_df, datafiles)

    counts = {
        "dataverses": len(dataverses),
        "datasets": len(datasets),
        "datafiles": len(datafiles),
    }
    write_json(filename_md, counts)

    print(f"- Dataverses: {counts['dataverses']}")
    print(f"- Datasets: {counts['datasets']}")
    print(f"- Datafiles: {counts['datafiles']}")