File: specs.py

package info (click to toggle)
python-bioframe 0.4.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,000 kB
  • sloc: python: 5,860; makefile: 38; sh: 13
file content (152 lines) | stat: -rw-r--r-- 4,421 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import collections
import pandas as pd
import numpy as np

# Public names exported by ``from <module> import *``. The underscore-prefixed
# helpers defined in this module are intentionally not listed.
__all__ = [
    "update_default_colnames",
    "is_chrom_dtype",
]

# Module-level settings. ``_rc["colnames"]`` maps the three roles
# ("chrom", "start", "end") to the column names used by default; it is read by
# ``_get_default_colnames`` and modified by ``update_default_colnames``.
_rc = {"colnames": {"chrom": "chrom", "start": "start", "end": "end"}}


def _get_default_colnames():
    """
    Return the column names currently used as defaults.

    These defaults can be changed with :func:`update_default_colnames`.

    Returns
    -------
    colnames : triplet (str, str, str)
        The default names of the chrom, start and end columns, in that order.

    """
    defaults = _rc["colnames"]
    return tuple(defaults[role] for role in ("chrom", "start", "end"))


class update_default_colnames:
    """
    Change the default chrom/start/end column names.

    The new names are applied as soon as the object is created. When the
    object is used as a context manager, the names that were in effect before
    its creation are restored on exit.

    Parameters
    ----------
    new_colnames : dict or iterable of 3 names
        Either a mapping with any of the keys "chrom", "start", "end" (other
        keys are ignored), or an iterable yielding exactly three names in the
        order chrom, start, end.

    Raises
    ------
    ValueError
        If `new_colnames` is neither a mapping nor an iterable of exactly
        three names. A bare string is rejected as well, because unpacking it
        would silently produce single-character column names.

    """

    def __init__(self, new_colnames):
        # Imported locally so the block does not rely on ``collections.abc``
        # having been loaded as a side effect of a plain ``import collections``.
        from collections.abc import Iterable, Mapping

        # Snapshot of the current defaults, restored by ``__exit__``.
        self._old_colnames = dict(_rc["colnames"])

        # Mapping has to be tested before Iterable: every dict is also an
        # Iterable, so testing Iterable first makes the mapping branch
        # unreachable and unpacks the dict's *keys* as column names.
        if isinstance(new_colnames, Mapping):
            _rc["colnames"].update(
                {
                    k: v
                    for k, v in new_colnames.items()
                    if k in ("chrom", "start", "end")
                }
            )
        elif isinstance(new_colnames, Iterable) and not isinstance(
            new_colnames, (str, bytes)
        ):
            # Materialize first so that generators and other unsized iterables
            # can be length-checked.
            names = tuple(new_colnames)
            if len(names) != 3:
                raise ValueError(
                    "Please, specify new columns using a list of "
                    "3 strings or a dict!"
                )
            (
                _rc["colnames"]["chrom"],
                _rc["colnames"]["start"],
                _rc["colnames"]["end"],
            ) = names
        else:
            raise ValueError(
                "Please, specify new columns using a list of " "3 strings or a dict!"
            )

    def __enter__(self):
        # The defaults were already updated in ``__init__``; nothing to do here.
        return self

    def __exit__(self, *args):
        # Put back the defaults captured at construction time. Returns None,
        # so an exception raised inside the ``with`` body is not suppressed.
        _rc["colnames"] = self._old_colnames

def _verify_columns(df, colnames, unique_cols=False, return_as_bool=False):
    """
    Raises ValueError if columns with colnames are not present in dataframe df.

    Parameters
    ----------
    df: pandas.DataFrame

    colnames: list of column names

    return_as_bool : bool
        If True, returns as a boolean instead of raising errors. Default False.

    """

    if not type(df) is pd.core.frame.DataFrame:
        if return_as_bool:
            return False
        raise ValueError("df is not a dataframe")

    if unique_cols:
        if len(set(colnames)) < len(colnames):
            raise ValueError("column names must be unique")

    if not set(colnames).issubset(df.columns):
        if return_as_bool:
            return False
        raise ValueError(
            ", ".join(set(colnames).difference(set(df.columns)))
            + " not in keys of df.columns"
        )
    if return_as_bool:
        return True


def _verify_column_dtypes(df, cols=None, return_as_bool=False):
    """
    Validate the dtypes of the chrom, start and end columns of `df`.

    The chrom column must satisfy :func:`is_chrom_dtype`; the start and end
    columns must have an integer dtype. Columns are checked in the order
    chrom, start, end and the first failure is the one reported.

    Parameters
    ----------
    df : pandas.DataFrame

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. When None, the current default column names are
        used.

    return_as_bool : bool
        If true, returns as a boolean instead of raising errors. Default False.

    Raises
    ------
    ValueError
        If the columns cannot be verified to exist in `df`.
    TypeError
        If one of the columns has an invalid dtype.

    """
    if cols is None:
        chrom_col, start_col, end_col = _get_default_colnames()
    else:
        chrom_col, start_col, end_col = cols
    wanted = [chrom_col, start_col, end_col]

    columns_ok = _verify_columns(df, wanted, return_as_bool=True)
    if not columns_ok:
        if return_as_bool:
            return False
        raise ValueError("could not verify columns")

    chrom_dtype, start_dtype, end_dtype = df.dtypes[wanted]

    # Find the first failing check, if any; later checks are not evaluated
    # once an earlier one has failed.
    problem = None
    if not is_chrom_dtype(chrom_dtype):
        problem = "invalid df['chrom'] dtype, must be object, string, or categorical"
    elif not pd.api.types.is_integer_dtype(start_dtype):
        problem = "invalid df['start'] dtype, must be integer"
    elif not pd.api.types.is_integer_dtype(end_dtype):
        problem = "invalid df['end'] dtype, must be integer"

    if problem is None:
        return True if return_as_bool else None
    if return_as_bool:
        return False
    raise TypeError(problem)


def is_chrom_dtype(chrom_dtype):
    """
    Returns True if dtype is any of the allowed bioframe chrom dtypes, False otherwise.

    Allowed dtypes are string, object and categorical.

    Parameters
    ----------
    chrom_dtype : dtype or array-like
        A dtype object, a dtype name such as "category", or an object that
        exposes a ``dtype`` attribute (e.g. a pandas.Series).

    Returns
    -------
    numpy.bool_

    """
    # ``pd.api.types.is_categorical_dtype`` is deprecated in recent pandas
    # releases and warns on every call, so the categorical test is done with
    # isinstance instead. Array-likes and the "category" alias are still
    # accepted, as the deprecated helper accepted them.
    dtype = getattr(chrom_dtype, "dtype", chrom_dtype)
    is_categorical = isinstance(dtype, pd.CategoricalDtype) or (
        isinstance(dtype, str) and dtype == "category"
    )
    return np.any(
        [
            pd.api.types.is_string_dtype(chrom_dtype),
            pd.api.types.is_object_dtype(chrom_dtype),
            is_categorical,
        ]
    )