File: aggregate_pandas.py

package info (click to toggle)
python-numpy-groupies 0.10.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 476 kB
  • sloc: python: 2,346; makefile: 12
file content (78 lines) | stat: -rw-r--r-- 2,326 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from functools import partial

import numpy as np
import pandas as pd

from .aggregate_numpy import _aggregate_base
from .utils import (
    aggregate_common_doc,
    allnan,
    anynan,
    check_dtype,
    funcs_no_separate_nan,
)


def _wrapper(group_idx, a, size, fill_value, func="sum", dtype=None, ddof=0, **kwargs):
    """Aggregate ``a`` per group with pandas and return the result as an array.

    The inputs are placed in a two-column ``DataFrame`` (``"group_idx"`` and
    ``"a"``), grouped by ``"group_idx"`` and reduced with ``func``.

    Parameters
    ----------
    group_idx
        Group labels; also used as positions when writing into the output.
    a
        Values to aggregate.
    size
        Length of the output array for non-cumulative functions.
    fill_value
        Value stored at output positions that received no group result.
    func
        Name or callable handed to pandas' ``aggregate``.
    dtype
        Requested output dtype; resolved through ``check_dtype``.
    ddof
        Forwarded to pandas only when the function name is ``"var"`` or
        ``"std"``.
    **kwargs
        Accepted for signature compatibility; not forwarded to pandas.

    Returns
    -------
    For function names starting with ``"cum"``, the first column of the pandas
    result as-is. Otherwise an array of length ``size`` pre-filled with
    ``fill_value`` and overwritten at the group labels with the group results.
    """
    funcname = func if not callable(func) else func.__name__

    # Only ``ddof`` (for var/std) ever reaches pandas; whatever the caller put
    # into ``**kwargs`` is deliberately left out.
    agg_kwargs = {"ddof": ddof} if funcname in ("var", "std") else {}

    frame = pd.DataFrame({"group_idx": group_idx, "a": a})
    if func == "sort":
        # NOTE(review): no aggregation is applied on this path, so ``grouped``
        # is a GroupBy object here - confirm the code below copes with that.
        grouped = frame.groupby("group_idx", sort=True)
    else:
        by_group = frame.groupby("group_idx", sort=False)
        grouped = by_group.aggregate(func, **agg_kwargs)

    dtype = check_dtype(dtype, getattr(func, "__name__", funcname), a, size)

    # Cumulative functions are returned straight from pandas, without being
    # scattered into a ``size``-long output.
    if funcname.startswith("cum"):
        return grouped.values[:, 0]

    out = np.full(size, fill_value, dtype=dtype)
    # Silence NumPy "invalid value" warnings raised while casting the results
    # into ``out`` (presumably NaN going into a non-float dtype - TODO confirm).
    with np.errstate(invalid="ignore"):
        out[grouped.index] = grouped.values[:, 0]
    return out


# Function names that are forwarded unchanged as the ``func`` argument of
# ``_wrapper``.
_supported_funcs = [
    "sum",
    "prod",
    "all",
    "any",
    "min",
    "max",
    "mean",
    "var",
    "std",
    "first",
    "last",
    "cumsum",
    "cumprod",
    "cummax",
    "cummin",
]

# Maps each public aggregation name to the callable implementing it.
_impl_dict = {
    # Plain names: the key doubles as the function name given to ``_wrapper``.
    **{name: partial(_wrapper, func=name) for name in _supported_funcs},
    # "nan"-prefixed names are bound to the same function name as their plain
    # counterpart, except for functions listed in ``funcs_no_separate_nan``.
    **{
        "nan" + name: partial(_wrapper, func=name)
        for name in _supported_funcs
        if name not in funcs_no_separate_nan
    },
    # Names whose implementation is a helper callable or a differently named
    # function.
    "allnan": partial(_wrapper, func=allnan),
    "anynan": partial(_wrapper, func=anynan),
    "len": partial(_wrapper, func="count"),
    "nanlen": partial(_wrapper, func="count"),
    "argmax": partial(_wrapper, func="idxmax"),
    "argmin": partial(_wrapper, func="idxmin"),
    "nanargmax": partial(_wrapper, func="idxmax"),
    "nanargmin": partial(_wrapper, func="idxmin"),
    # Fallback entry: ``_wrapper`` itself, with ``func`` left unbound.
    "generic": _wrapper,
}


def aggregate(group_idx, a, func="sum", size=None, fill_value=0, order="C", dtype=None, axis=None, **kwargs):
    # Thin front-end: all the work happens in ``_aggregate_base``; this
    # function only plugs in the pandas implementation table and flags the
    # call as coming from the pandas backend.
    options = {
        "func": func,
        "size": size,
        "fill_value": fill_value,
        "order": order,
        "dtype": dtype,
        "axis": axis,
        "_impl_dict": _impl_dict,
        "is_pandas": True,
    }
    return _aggregate_base(group_idx, a, **options, **kwargs)


# Attach the user-facing docstring: a backend-specific preamble followed by
# the documentation text shared via ``aggregate_common_doc``.
aggregate.__doc__ = (
    """
    This is the pandas implementation of aggregate. It makes use of
    `pandas`'s groupby machinery and is mainly used for reference
    and benchmarking.
    """
    + aggregate_common_doc
)