File: summary.py

package info (click to toggle)
nvidia-cuda-toolkit 12.4.1-3
  • links: PTS, VCS
  • area: non-free
  • in suites: forky, sid
  • size: 18,505,836 kB
  • sloc: ansic: 203,477; cpp: 64,769; python: 34,699; javascript: 22,006; xml: 13,410; makefile: 3,085; sh: 2,343; perl: 352
file content (93 lines) | stat: -rw-r--r-- 3,257 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import pandas as pd

from collections import namedtuple

# Lightweight record with two fields, 'filename' and 'stats_df'.
# NOTE(review): presumably pairs an input file name with the statistics
# dataframe computed from it - confirm against the callers.
StatsInfo = namedtuple("StatsInfo", ["filename", "stats_df"])


def format_columns(df):
    """Format a predefined set of statistical columns.

    - The quantile labels ("25%"/0.25, "50%"/0.5, "75%"/0.75) are renamed to
      "Q1", "Median" and "Q3".
    - Column names and index level names are title-cased with str.title(),
      e.g. "std" becomes "Std" and "kernel name" becomes "Kernel Name".
    - The statistical columns are ordered according to a predefined sequence.
    - Any remaining columns that are not part of the predefined set are
      appended at the end, in their existing order.

    Index level names that are not strings (typically None, for an unnamed
    index) are kept as they are instead of raising a TypeError.

    A new dataframe is returned; 'df' itself is not modified. A KeyError is
    raised if one of the predefined statistical columns is missing.
    """

    def title_if_str(name):
        # str.title() only accepts strings; an unnamed index level has the
        # name None, which must pass through unchanged.
        return name.title() if isinstance(name, str) else name

    formatted_df = df.rename(
        {
            "25%": "Q1",
            "50%": "Median",
            "75%": "Q3",
            0.25: "Q1",
            0.5: "Median",
            0.75: "Q3",
        },
        axis="columns",
    )

    formatted_df.columns = formatted_df.columns.str.title()
    formatted_df = formatted_df.rename_axis(index=title_if_str)

    stats_cols = ["Count", "Mean", "Std", "Min", "Q1", "Median", "Q3", "Max", "Sum"]
    other_columns = [col for col in formatted_df.columns if col not in stats_cols]

    return formatted_df[stats_cols + other_columns]


def aggregate_stats_dfs(dfs):
    """Combine several per-input statistical dataframes into a single one.

    Each input is expected to come out of 'format_columns', i.e. to carry
    the "Count", "Mean", "Std", "Min", "Q1", "Median", "Q3", "Max" and "Sum"
    columns. Rows that share an index label are merged as follows:

    - "Count" and "Sum" are added up, and "Mean" is recomputed as Sum / Count.
    - "Min" and "Max" are the extremes over the merged rows.
    - "Std", "Q1", "Median" and "Q3" cannot be recovered exactly from the
      summaries, so they are approximated (mean of Std, min of Q1, median
      of Median, max of Q3) and labelled with an "(approx)" suffix.

    All values are rounded to one decimal place.
    """
    combined = pd.concat(dfs)
    grouped = combined.groupby(combined.index)

    total_sum = grouped["Sum"].sum()
    total_count = grouped["Count"].sum()

    # (output column, source column, reduction) for the columns that sit
    # between "Mean" and "Sum" in the result.
    reductions = (
        ("Std (approx)", "Std", "mean"),
        ("Min", "Min", "min"),
        ("Q1 (approx)", "Q1", "min"),
        ("Median (approx)", "Median", "median"),
        ("Q3 (approx)", "Q3", "max"),
        ("Max", "Max", "max"),
    )

    columns = {"Count": total_count, "Mean": total_sum / total_count}
    for out_name, source, how in reductions:
        columns[out_name] = getattr(grouped[source], how)()
    columns["Sum"] = total_sum

    result = pd.DataFrame(columns, index=grouped.groups.keys())
    result = result.round(1)
    # Building the frame from the group keys drops the index name.
    result.index.name = combined.index.name

    return result


def describe_duration(series_groupby):
    """Compute per-group descriptive statistics, including the sum.

    Produces the same statistics as the pandas describe() function (count,
    mean, std, min, quartiles, max) plus a 'sum' column for the grouped
    series passed in, then hands the result to 'format_columns' for
    renaming and ordering.
    """
    # Quartiles come back as a series indexed by (group, quantile); move the
    # quantile level into columns and give them describe()-style labels.
    quartiles = series_groupby.quantile([0.25, 0.5, 0.75]).unstack()
    quartiles.columns = ["25%", "50%", "75%"]

    basic_stats = series_groupby.agg(["min", "max", "count", "std", "mean", "sum"])

    # Join the two tables on their shared group index.
    combined = pd.merge(basic_stats, quartiles, left_index=True, right_index=True)
    return format_columns(combined)