File: generate_random_data.py

package info (click to toggle)
python-pycm 4.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,048 kB
  • sloc: python: 5,178; sh: 8; makefile: 6
file content (188 lines) | stat: -rw-r--r-- 7,133 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# -*- coding: utf-8 -*-
"""This file contains a function to generate a random confusion matrix."""
import numpy as np
from enum import Enum
from itertools import product

from .cm import ConfusionMatrix
from .params import BENCHMARK_REPORT_TEMPLATE
from .params import BENCHMARK_CLASS_SIZES, BENCHMARK_POPULATION_SIZES


class ClassDistributionScenario(Enum):
    """
    Supported class-distribution scenarios for random confusion matrix generation.

    - UNIFORM: every class gets the same share of the population.
    - MAJORITY_CLASS: a single class is over-represented, the remaining classes share the rest equally.
    - MINORITY_CLASS: a single class is under-represented, the remaining classes share the rest equally.
    """

    # Each member's value is the lowercase string form of its name.
    UNIFORM = "uniform"
    MAJORITY_CLASS = "majority_class"
    MINORITY_CLASS = "minority_class"


def _generate_class_percentages(num_classes, scenario):
    """
    Generate class percentages based on the given scenario.

    :params num_classes: number of classes.
    :type num_classes: int
    :params scenario: the scenario to generate percentages for.
    :type scenario: scenario
    :return: list of percentages for each class.
    """
    if num_classes < 2:
        raise ValueError("Number of classes must be at least 2.")
    if scenario == ClassDistributionScenario.UNIFORM:
        # Equal percentage for all classes
        raw_ratio_list = [1] * num_classes
    elif scenario == ClassDistributionScenario.MAJORITY_CLASS:
        raw_ratio_list = [5] + [1] * (num_classes - 1)
    elif scenario == ClassDistributionScenario.MINORITY_CLASS:
        raw_ratio_list = [0.2] + [1] * (num_classes - 1)
    else:
        raise ValueError("Invalid scenario")

    return list(100 * np.array(raw_ratio_list) / np.sum(raw_ratio_list))


def _calculate_class_counts(class_percentages, total_population):
    """
    Calculate the number of samples for each class based on percentages and total population.

    :param class_percentages: dict of percentages for each class (sum should be 100)
    :type class_percentages: dict
    :param total_population: total number of samples
    :type total_population: int
    :return: dictionary of sample counts for each class
    """
    classes = list(class_percentages.keys())
    percentages = np.array(list(class_percentages.values()), dtype=float)

    if len(classes) < 2:
        raise ValueError("Number of classes must be at least 2.")

    normalized_percentages = percentages / percentages.sum()
    class_counts = (normalized_percentages * total_population).astype(int)

    remainder = total_population - class_counts.sum()
    if remainder > 0:
        class_counts[np.argmax(normalized_percentages)] += remainder

    return dict(zip(classes, class_counts.astype(int).tolist()))


def generate_confusion_matrix(class_percentages, total_population, seed=None):
    """
    Generate a random confusion matrix with given class percentages and total population.

    For every actual class, the class's sample count is spread over the
    predicted classes using probabilities drawn from a Dirichlet distribution
    whose parameter for the correct class is 10 times larger than the others.

    :param class_percentages: dict, list or tuple of percentages for each class (sum should be 100);
        for a list or tuple, the class labels are the positional indices
    :type class_percentages: dict or list or tuple
    :param total_population: total number of samples in the confusion matrix
    :type total_population: int
    :param seed: random seed for reproducibility
    :type seed: int or None
    :return: confusion matrix as a dictionary (actual class -> predicted class -> count)
    :raises ValueError: if total_population is not positive or fewer than 2 classes are given
    """
    if total_population <= 0:
        raise ValueError("Total population must be positive.")
    if isinstance(class_percentages, (list, tuple)):
        class_percentages = dict(enumerate(class_percentages))

    class_labels = list(class_percentages.keys())
    num_classes = len(class_labels)

    if num_classes < 2:
        raise ValueError("Number of classes must be at least 2.")

    # Use a private legacy RandomState instead of np.random.seed(): it yields
    # the same stream for a given seed, but does not reset the process-wide
    # NumPy random state that the caller may rely on.
    rng = np.random.RandomState(seed)

    class_counts = _calculate_class_counts(class_percentages, total_population)
    confusion_matrix = {
        actual: {pred: 0 for pred in class_labels} for actual in class_labels
    }

    for actual_idx, actual in enumerate(class_labels):
        count = class_counts[actual]
        if count == 0:
            # Nothing to distribute; the row stays all zeros.
            continue

        dirichlet_params = np.ones(num_classes)
        dirichlet_params[actual_idx] *= 10  # Bias toward correct class

        probs = rng.dirichlet(dirichlet_params)
        # Truncation can only undershoot, so the remainder is never negative.
        predicted_counts = (probs * count).astype(int)

        remainder = count - predicted_counts.sum()
        if remainder > 0:
            # Assign the truncated leftovers to the most probable prediction.
            predicted_counts[np.argmax(probs)] += remainder

        confusion_matrix[actual] = dict(zip(class_labels, predicted_counts.tolist()))

    return confusion_matrix


def generate_confusion_matrix_with_scenario(
        num_classes,
        total_population,
        scenario=ClassDistributionScenario.UNIFORM,
        seed=None):
    """
    Generate a random confusion matrix for a class-distribution scenario.

    A scenario given as a string is upper-cased and looked up among the
    ClassDistributionScenario member names before the class percentages
    are generated.

    :param num_classes: number of classes
    :type num_classes: int
    :param total_population: total number of samples
    :type total_population: int
    :param scenario: the scenario to generate the confusion matrix for
    :type scenario: ClassDistributionScenario or str
    :param seed: random seed for reproducibility
    :type seed: int or None
    :return: confusion matrix as a dictionary
    :raises ValueError: if a string scenario does not match any known scenario
    """
    resolved_scenario = scenario
    if isinstance(resolved_scenario, str):
        try:
            resolved_scenario = ClassDistributionScenario[resolved_scenario.upper()]
        except KeyError:
            valid_values = [member.value for member in ClassDistributionScenario]
            raise ValueError(
                "Invalid scenario. Must be one of {0}.".format(valid_values))

    percentages = _generate_class_percentages(num_classes, resolved_scenario)
    return generate_confusion_matrix(
        class_percentages=percentages,
        total_population=total_population,
        seed=seed,
    )


def run_report_benchmark(seed=None, digits=10):
    """
    Benchmark the generation of some confusion matrices and print the report.

    One report is printed for every combination of benchmark population size,
    benchmark class size and class-distribution scenario. The timings are read
    from the `timings` mapping of the created ConfusionMatrix object.

    :param seed: random seed for reproducibility
    :type seed: int or None
    :param digits: number of digits to round the timings to
    :type digits: int
    :return: None
    """
    scenario_values = [member.value for member in ClassDistributionScenario]
    timing_keys = (
        "matrix_creation",
        "class_statistics",
        "overall_statistics",
        "total",
    )
    benchmark_grid = product(
        BENCHMARK_POPULATION_SIZES,
        BENCHMARK_CLASS_SIZES,
        scenario_values,
    )

    for population, class_count, scenario in benchmark_grid:
        matrix = generate_confusion_matrix_with_scenario(
            num_classes=class_count,
            total_population=population,
            scenario=scenario,
            seed=seed,
        )
        cm = ConfusionMatrix(matrix=matrix)
        # Round every reported timing to the requested number of digits.
        rounded = {
            key: round(cm.timings.get(key, None), digits)
            for key in timing_keys
        }
        report = BENCHMARK_REPORT_TEMPLATE.format(
            num_classes=class_count,
            total_population=population,
            scenario=scenario,
            timing_matrix_creation=rounded["matrix_creation"],
            timing_class_statistics=rounded["class_statistics"],
            timing_overall_statistics=rounded["overall_statistics"],
            timing_total=rounded["total"],
        )
        print(report)