File: train.py

# mypy: ignore-errors

import argparse
import json
import warnings

import pandas as pd  # type: ignore[import-untyped]

from torch._inductor.autoheuristic.autoheuristic_utils import (
    CHOICE_COL,
    get_metadata_str_from_log,
)


# TODO (AlnisM): Fix these warnings
warnings.filterwarnings(
    "ignore",
    message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
)
warnings.filterwarnings(
    "ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns.",
)


class AHTrain:
    """
    Base class for AutoHeuristic training.
    """

    def __init__(self) -> None:
        self.parser = argparse.ArgumentParser()
        self.add_base_arguments()
        self.args = None

    def add_base_arguments(self):
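        """Register the command line arguments shared by all training scripts."""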
        self.parser.add_argument(
            "dataset",
            type=str,
            help="Path to text file containing data collected with AutoHeuristic.",
        )
        self.parser.add_argument(
            "--nrows",
            type=int,
            default=None,
            help="Only read first n rows of the dataset.",
        )
        self.parser.add_argument(
            "--heuristic-name",
            type=str,
            default="learned_heuristic",
            help="Name of the heuristic to be generated.",
        )
        self.parser.add_argument(
            "--data",
            nargs=2,
            action="append",
            metavar=("TYPE", "PATH"),
            help="Specify name of datasets and file paths to be evaluated.",
        )
        self.parser.add_argument(
            "--save-dot",
            action="store_true",
            help="Export heuristic to graphviz dot.",
        )
        self.parser.add_argument(
            "--ranking",
            type=int,
            default=None,
            help="""
                Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice.
                The argument is the number of choices the heuristic will provide.
            """,
        )

    def parse_args(self):
        return self.parser.parse_args()

    def parse_log(self, log_path, nrows=None):
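        """
        Read an AutoHeuristic log and return the dataframe together with its
        metadata, the feature names, the categorical feature names, and the
        set of choices that occur in the data.
        """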
        (df, metadata) = self.deserialize_data(log_path)
        numerical_features = metadata["numerical_features"]
        categorical_features = metadata["categorical_features"]
        choices = df[CHOICE_COL].unique().tolist()
        features = numerical_features + categorical_features
        if nrows is not None:
            df = df.head(nrows)
        df = self.filter_df(df)
        return (df, metadata, features, categorical_features, choices)

    def generate_heuristic(self):
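        """Parse the command line arguments and run the training pipeline."""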
        self.args = self.parse_args()
        self.main(
            self.args.dataset,
            self.args.data,
            self.args.nrows,
            self.args.heuristic_name,
            self.args.save_dot,
            self.args.ranking is not None,
        )

    def filter_df(self, df):
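        """Hook for subclasses to filter out rows; the base class keeps all rows."""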
        return df

    def add_new_features(self, results):
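        """
        Hook for subclasses to derive additional features. Returns the
        (possibly augmented) dataframe and the names of the new features.
        """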
        return (results, [])

    def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
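        """Load the datasets passed via --data and add them to `datasets`."""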
        if other_datasets:
            for name, path in other_datasets:
                (df_other, choices, _, _, _) = self.get_df(
                    path, cat_feature2cats=cat_feature2cats, apply_filters=False
                )
                datasets[name] = df_other

    def handle_categorical_features(
        self, cat_feature2cats, categorical_features, results
    ):
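        """
        One-hot encode the categorical features and remember which dummy
        column corresponds to which (feature, value) pair.
        """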
        # Fix the set of categories here: if we later create another df for
        # testing purposes and that df does not contain all categories of a
        # categorical feature, pd.get_dummies would not create columns for the
        # missing categories.
        if not cat_feature2cats:
            cat_feature2cats = {}
        for cat_feature in categorical_features:
            if cat_feature in cat_feature2cats:
                categories = cat_feature2cats[cat_feature]
            else:
                categories = results[cat_feature].unique()
                cat_feature2cats[cat_feature] = categories
            results[cat_feature] = pd.Categorical(
                results[cat_feature], categories=categories
            )

        dummy_col_2_col_val = {}
        for col in categorical_features:
            unique_vals = results[col].unique()
            for val in unique_vals:
                dummy_col_2_col_val[f"{col}_{val}"] = (col, val)
        # one-hot encode categorical features
        results = pd.get_dummies(results, columns=categorical_features)
        return (results, cat_feature2cats, dummy_col_2_col_val)

    def gen_precondition(self, opt_name, shared_memory, device_capa):
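        """
        Generate the check_precondition() method of the heuristic, which
        matches on the heuristic name, shared memory size, and device
        capability seen during training.
        """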
        return f"""    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
        return (
            metadata.name == self.get_name()
            and metadata.shared_memory == {shared_memory}
            and str(metadata.device_capa) == "{device_capa}"
        )"""

    def codegen_boilerplate(
        self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
    ):
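        """Hook for subclasses: generate the boilerplate of the heuristic class."""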
        pass

    def gen_predict_fn_def(self):
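        """Hook for subclasses: generate the signature of the predict function."""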
        pass

    def write_heuristic_to_file(self, lines, heuristic_name):
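        """Write the generated heuristic into torch/_inductor/autoheuristic/artifacts."""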
        output_file = (
            f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
        )
        path = f"{output_file}"
        with open(path, "w") as f:
            f.write("\n".join(lines) + "\n")

    def deserialize_data(self, log_path):
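        """
        Split an AutoHeuristic log into its two parts: the JSON metadata on
        the first line and the CSV data on the remaining lines.
        """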
        json_string = get_metadata_str_from_log(log_path)
        metadata = self.deserialize_metadata(json_string)

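        # The first line contains the metadata, so skip it when reading the CSV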
        df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
        return (df, metadata)

    def deserialize_metadata(self, json_string):
        return json.loads(json_string)


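# Example invocation (the log path is hypothetical):
#   python train.py path/to/autoheuristic_log.txt --heuristic-name my_heuristic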
if __name__ == "__main__":
    train = AHTrain()
    train.generate_heuristic()