File: hash_functions.py

package info (click to toggle)
pandas 2.2.3%2Bdfsg-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 66,784 kB
  • sloc: python: 422,228; ansic: 9,190; sh: 270; xml: 102; makefile: 83
file content (89 lines) | stat: -rw-r--r-- 2,393 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import numpy as np

import pandas as pd


class UniqueForLargePyObjectInts:
    """Benchmark ``pd.unique`` on an object-dtype array of large Python ints."""

    def setup(self):
        """Create 5000 distinct ints, each a multiple of 2**32, as objects."""
        # NOTE(review): the low 32 bits of every value are zero, presumably to
        # stress hashing of Python ints that differ only in their high bits.
        shifted = [n << 32 for n in range(5000)]
        self.arr = np.array(shifted, dtype=object)

    def time_unique(self):
        """Time ``pd.unique`` over the prepared array."""
        pd.unique(self.arr)


class Float64GroupIndex:
    """Benchmark ``groupby`` keyed by float64 values derived from timestamps."""

    # GH28303
    def setup(self):
        """Build a frame of 10**6 evenly spaced timestamps and its float keys."""
        self.df = pd.date_range(
            start="1/1/2018", end="1/2/2018", periods=10**6
        ).to_frame()
        # Ask for int64 explicitly: the builtin ``int`` resolves to a
        # platform-dependent NumPy width, and datetime data only converts to
        # 64-bit integers.  Dividing by 10**9 and rounding yields float64
        # keys (whole seconds), so many consecutive rows share a key.
        self.group_index = np.round(self.df.index.astype("int64") / 10**9)

    def time_groupby(self):
        """Time a grouped ``last()`` using the float64 keys."""
        self.df.groupby(self.group_index).last()


class UniqueAndFactorizeArange:
    """Benchmark ``pd.unique`` and ``pd.factorize`` on offset float64 ranges."""

    params = range(4, 16)
    param_names = ["exponent"]

    def setup(self, exponent):
        """Prepare 10**4 consecutive floats starting at 10**exponent.

        Every value appears 100 times in a row, so the stored array has
        10**6 elements.
        """
        offset = 10**exponent
        base = np.arange(10**4, dtype="float64")
        shifted = base + offset
        self.a2 = shifted.repeat(100)

    def time_factorize(self, exponent):
        """Time ``pd.factorize`` on the repeated values."""
        pd.factorize(self.a2)

    def time_unique(self, exponent):
        """Time ``pd.unique`` on the repeated values."""
        pd.unique(self.a2)


class Unique:
    """Benchmark ``pd.unique`` on nullable (masked) numeric Series."""

    params = ["Int64", "Float64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Create one Series with repeated values and one without.

        ``ser`` repeats a block of 100_003 entries (including ``pd.NA``)
        three times; ``ser_unique`` holds 300_000 distinct values plus a
        single ``pd.NA``.
        """
        self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype)
        self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype)

    # The timing methods take ``dtype`` so their signature agrees with
    # ``param_names`` and ``setup``; the value itself is not used here.
    def time_unique_with_duplicates(self, dtype):
        """Time ``pd.unique`` on the Series containing duplicates."""
        pd.unique(self.ser)

    def time_unique(self, dtype):
        """Time ``pd.unique`` on the Series whose values are all distinct."""
        pd.unique(self.ser_unique)


class NumericSeriesIndexing:
    """Benchmark label slicing on a Series with a sorted numeric index."""

    params = [
        (np.int64, np.uint64, np.float64),
        (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6),
    ]
    param_names = ["dtype", "N"]

    def setup(self, dtype, N):
        """Build a Series of ``N`` rows whose index repeats the label 54.

        The labels are 0..54, then 54 again, then 55..N-2, all cast to
        ``dtype``, so the index is ascending but not unique.
        """
        # Assemble the labels from NumPy ranges rather than Python lists so
        # setup does not materialise millions of Python ints for large N.
        vals = np.concatenate(
            [
                np.arange(55, dtype=dtype),
                np.array([54], dtype=dtype),
                np.arange(55, N - 1, dtype=dtype),
            ]
        )
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, dtype, N):
        """Time a label-based slice up to 800.

        ``dtype`` and ``N`` are unused; they mirror ``param_names``.
        """
        # trigger building of mapping
        self.data.loc[:800]


class NumericSeriesIndexingShuffled:
    """Benchmark label slicing on a Series with a shuffled numeric index."""

    params = [
        (np.int64, np.uint64, np.float64),
        (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6),
    ]
    param_names = ["dtype", "N"]

    def setup(self, dtype, N):
        """Build a Series of ``N`` rows with shuffled, non-unique labels.

        The labels are 0..54, then 54 again, then 55..N-2, cast to ``dtype``
        and shuffled in place with NumPy's global random state.
        """
        # Assemble the labels from NumPy ranges rather than Python lists so
        # setup does not materialise millions of Python ints for large N.
        vals = np.concatenate(
            [
                np.arange(55, dtype=dtype),
                np.array([54], dtype=dtype),
                np.arange(55, N - 1, dtype=dtype),
            ]
        )
        np.random.shuffle(vals)
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, dtype, N):
        """Time a label-based slice up to 800.

        ``dtype`` and ``N`` are unused; they mirror ``param_names``.
        """
        # trigger building of mapping
        self.data.loc[:800]