1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
|
# mypy: ignore-errors
import argparse
import json
import warnings
import pandas as pd # type: ignore[import-untyped]
from torch._inductor.autoheuristic.autoheuristic_utils import (
CHOICE_COL,
get_metadata_str_from_log,
)
# TODO (AlnisM): Fix these warnings
# Until the TODO above is resolved, warnings whose text starts with one of
# these messages are ignored process-wide. Filters are registered in the
# order listed.
_SILENCED_WARNING_MESSAGES = (
    "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    "DataFrameGroupBy.apply operated on the grouping columns.",
)
for _silenced_message in _SILENCED_WARNING_MESSAGES:
    warnings.filterwarnings("ignore", message=_silenced_message)
class AHTrain:
    """
    Base class for AutoHeuristic training.

    Provides the command-line parser plus the log-loading and feature-encoding
    helpers. ``generate_heuristic`` forwards to ``self.main`` and
    ``add_real_datasets`` forwards to ``self.get_df``; neither method is defined
    in this class, so they have to be supplied by a subclass.
    """

    def __init__(self) -> None:
        """Build the argument parser and register the base arguments."""
        self.parser = argparse.ArgumentParser()
        self.add_base_arguments()
        # Filled in by generate_heuristic() once the command line is parsed.
        self.args = None

    def add_base_arguments(self):
        """Register the command-line arguments handled by this base class."""
        self.parser.add_argument(
            "dataset",
            type=str,
            help="Path to text file containing data collected with AutoHeuristic.",
        )
        self.parser.add_argument(
            "--nrows",
            type=int,
            default=None,
            help="Only read first n rows of the dataset.",
        )
        self.parser.add_argument(
            "--heuristic-name",
            type=str,
            default="learned_heuristic",
            help="Name of the heuristic to be generated.",
        )
        # Repeatable: every occurrence appends one [TYPE, PATH] pair.
        self.parser.add_argument(
            "--data",
            nargs=2,
            action="append",
            metavar=("TYPE", "PATH"),
            help="Specify name of datasets and file paths to be evaluated.",
        )
        self.parser.add_argument(
            "--save-dot",
            action="store_true",
            help="Export heuristic to graphviz dot.",
        )
        self.parser.add_argument(
            "--ranking",
            type=int,
            default=None,
            help="""
Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice.
The argument is the number of choices the heuristic will provide.
""",
        )

    def parse_args(self):
        """Parse the process command line and return the resulting namespace."""
        return self.parser.parse_args()

    def parse_log(self, log_path, nrows=None):
        """
        Load a log file and derive the feature and choice lists from it.

        Args:
            log_path: Path of the log to read.
            nrows: When not None, keep only the first ``nrows`` rows.

        Returns:
            Tuple ``(df, metadata, features, categorical_features, choices)``
            where ``features`` is the numerical features followed by the
            categorical ones.
        """
        (df, metadata) = self.deserialize_data(log_path)
        numerical_features = metadata["numerical_features"]
        categorical_features = metadata["categorical_features"]
        # Choices are collected from the complete frame, i.e. before the
        # nrows truncation and before filter_df() are applied.
        choices = df[CHOICE_COL].unique().tolist()
        features = numerical_features + categorical_features
        if nrows is not None:
            df = df.head(nrows)
        df = self.filter_df(df)
        return (df, metadata, features, categorical_features, choices)

    def generate_heuristic(self):
        """Parse the command line and hand the arguments to ``self.main``."""
        self.args = self.parse_args()
        self.main(
            self.args.dataset,
            self.args.data,
            self.args.nrows,
            self.args.heuristic_name,
            self.args.save_dot,
            # Only whether --ranking was given is forwarded, not its value.
            self.args.ranking is not None,
        )

    def filter_df(self, df):
        """Row-filtering hook; the base implementation returns ``df`` unchanged."""
        return df

    def add_new_features(self, results):
        """
        Feature-engineering hook.

        Returns:
            Tuple ``(results, [])``; the base implementation adds nothing.
        """
        return (results, [])

    def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
        """
        Load every ``(name, path)`` pair and store its frame in ``datasets``.

        Args:
            datasets: Dict updated in place with ``name -> DataFrame``.
            other_datasets: Iterable of ``(name, path)`` pairs, or a falsy value
                to do nothing.
            cat_feature2cats: Category mapping forwarded to ``self.get_df``.
        """
        if not other_datasets:
            return
        for name, path in other_datasets:
            # Only the first element of the 5-tuple from get_df is needed.
            (df_other, _, _, _, _) = self.get_df(
                path, cat_feature2cats=cat_feature2cats, apply_filters=False
            )
            datasets[name] = df_other

    def handle_categorical_features(
        self, cat_feature2cats, categorical_features, results
    ):
        """
        One-hot encode the categorical columns of ``results``.

        Args:
            cat_feature2cats: Optional dict ``feature -> categories``. Features
                missing from it get the values found in ``results``.
            categorical_features: Names of the columns to encode.
            results: DataFrame to encode; its categorical columns are converted
                to ``pd.Categorical`` in place before encoding.

        Returns:
            Tuple ``(encoded, cat_feature2cats, dummy_col_2_col_val)`` where
            ``dummy_col_2_col_val`` maps each dummy column name
            ``"<feature>_<value>"`` to ``(feature, value)``.
        """
        # Doing this here because if we create another df for testing purposes
        # and that other df does not contain all categories for a categorical feature,
        # pd.dummies will not create columns for the missing categories
        if not cat_feature2cats:
            cat_feature2cats = {}
        for cat_feature in categorical_features:
            if cat_feature in cat_feature2cats:
                categories = cat_feature2cats[cat_feature]
            else:
                categories = results[cat_feature].unique()
                cat_feature2cats[cat_feature] = categories
            results[cat_feature] = pd.Categorical(
                results[cat_feature], categories=categories
            )

        # Build the reverse mapping from the declared categories rather than
        # from the values observed in this frame: get_dummies emits one column
        # per declared category, so every emitted column needs an entry here,
        # including categories that do not occur in `results`.
        dummy_col_2_col_val = {
            f"{col}_{val}": (col, val)
            for col in categorical_features
            for val in cat_feature2cats[col]
        }

        # one-hot encode categorical features
        results = pd.get_dummies(results, columns=categorical_features)
        return (results, cat_feature2cats, dummy_col_2_col_val)

    def gen_precondition(self, opt_name, shared_memory, device_capa):
        """
        Return source text for a ``check_precondition`` method.

        ``shared_memory`` and ``device_capa`` are interpolated into the
        template; ``opt_name`` is accepted but not used by it.
        """
        # NOTE(review): the template's continuation lines start at column 0 as
        # written here - confirm the emitted snippet is indented the way its
        # consumer expects.
        return f""" def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
return (
metadata.name == self.get_name()
and metadata.shared_memory == {shared_memory}
and str(metadata.device_capa) == "{device_capa}"
)"""

    def codegen_boilerplate(
        self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
    ):
        """Code-generation hook; the base implementation does nothing."""
        pass

    def gen_predict_fn_def(self):
        """Code-generation hook; the base implementation does nothing."""
        pass

    def write_heuristic_to_file(self, lines, heuristic_name):
        """
        Write ``lines`` joined by newlines, plus a trailing newline, to the
        artifact file for ``heuristic_name``.

        The destination is a relative path, so it is resolved against the
        current working directory.
        """
        output_file = (
            f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
        )
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(lines) + "\n")

    def deserialize_data(self, log_path):
        """
        Read a log file into ``(DataFrame, metadata)``.

        The metadata string comes from ``get_metadata_str_from_log``; the rest
        of the file is read as CSV with the first line skipped and malformed
        rows dropped.
        """
        json_string = get_metadata_str_from_log(log_path)
        metadata = self.deserialize_metadata(json_string)
        # Presumably the skipped first line is the metadata line - confirm
        # against get_metadata_str_from_log.
        df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
        return (df, metadata)

    def deserialize_metadata(self, json_string):
        """Decode the JSON metadata string into Python objects."""
        return json.loads(json_string)
if __name__ == "__main__":
    # Script entry point: build the base trainer and run the full
    # parse-arguments-then-train flow.
    AHTrain().generate_heuristic()
|