Skip to content

Commit

Permalink
cleanup the stats functions
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Sep 7, 2024
1 parent ee78752 commit fc57d3e
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 33 deletions.
12 changes: 6 additions & 6 deletions docs/benchmarks/ebm-benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@
" max_samples = 1000000000000\n",
" min_features = 1\n",
" max_features = 1000000000000\n",
" if task.meta[\"n_rows\"] < min_samples:\n",
" if task.meta['n_samples'] < min_samples:\n",
" return []\n",
" if max_samples < task.meta[\"n_rows\"]:\n",
" if max_samples < task.meta['n_samples']:\n",
" return []\n",
" if task.meta[\"n_cols\"] < min_features:\n",
" if task.meta['n_features'] < min_features:\n",
" return []\n",
" if max_features < task.meta[\"n_cols\"]:\n",
" if max_features < task.meta['n_features']:\n",
" return []\n",
"\n",
" \n",
Expand Down Expand Up @@ -100,7 +100,7 @@
" except NameError:\n",
" duplicates = set()\n",
" global_duplicates = duplicates\n",
" key = (task.name, task.meta[\"n_rows\"], task.meta[\"n_cols\"])\n",
" key = (task.name, task.meta['n_samples'], task.meta['n_features'])\n",
" if key in duplicates:\n",
" print(f\"Excluding duplicate: {key}\")\n",
" return []\n",
Expand Down Expand Up @@ -247,7 +247,7 @@
" knn_params[\"n_jobs\"] = -1\n",
" aplr_params[\"m\"] = 3000\n",
"\n",
" if 3000 < trial.task.meta[\"n_cols\"]:\n",
" if 3000 < trial.task.meta['n_features']:\n",
" # TODO: EBMs can crash for now with too many interactions, so limit it until we have better fix\n",
" # Bioresponse with 1776 features works. Santander_transaction_value with 4991 features does not.\n",
" ebm_params[\"interactions\"] = 0\n",
Expand Down
2 changes: 1 addition & 1 deletion python/powerlift/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Yes, we run this for InterpretML on as many docker containers we can run in para

```python
def trial_filter(task):
if task.problem == "binary" and task.meta["n_rows"] <= 10000:
if task.problem == "binary" and task.meta["n_samples"] <= 10000:
return ["rf", "svm"]
return []

Expand Down
4 changes: 2 additions & 2 deletions python/powerlift/powerlift/bench/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ def run(
trials,
reverse=True,
key=lambda x: (1 if x[1].meta["n_classes"] < 3 else x[1].meta["n_classes"])
* x[1].meta["n_cols"]
* x[1].meta["n_rows"],
* x[1].meta["n_features"]
* x[1].meta["n_samples"],
)
trials = np.array(trials, dtype=object)
n_fastest = int(len(trials) * 0.25)
Expand Down
13 changes: 4 additions & 9 deletions python/powerlift/powerlift/bench/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ def get_assets(self, task_id: int):
def iter_available_tasks(
self, include_measures: bool = False
) -> Iterable[Mapping[str, object]]:
# WARNING: obsolete
self.check_allowed()
task_orms = self._session.query(db.Task)
for task_orm in task_orms:
Expand Down Expand Up @@ -1170,11 +1171,9 @@ def retrieve_openml(

if problem == "regression":
regression_stats(y, meta)
is_classification = False
elif problem in ["binary", "multiclass"]:
class_stats(y, meta)
is_classification = True
data_stats(X, y, is_classification, categorical_mask, meta)
data_stats(X, categorical_mask, meta)

supervised = SupervisedDataset(X, y, meta)
if cache_dir is not None:
Expand Down Expand Up @@ -1337,11 +1336,9 @@ def retrieve_catboost_50k(
}
if problem == "regression":
regression_stats(y, meta)
is_classification = False
elif problem in ["binary", "multiclass"]:
class_stats(y, meta)
is_classification = True
data_stats(X, y, is_classification, categorical_mask, meta)
data_stats(X, categorical_mask, meta)

supervised = SupervisedDataset(X, y, meta)
if cache_dir is not None:
Expand Down Expand Up @@ -1414,11 +1411,9 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
}
if problem == "regression":
regression_stats(y, meta)
is_classification = False
elif problem in ["binary", "multiclass"]:
class_stats(y, meta)
is_classification = True
data_stats(X, y, is_classification, categorical_mask, meta)
data_stats(X, categorical_mask, meta)

supervised = SupervisedDataset(X, y, meta)
if cache_dir is not None:
Expand Down
22 changes: 8 additions & 14 deletions python/powerlift/powerlift/measures/task_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def class_stats(y: pd.Series, meta):
labels_min_cnt = np.min(labels_unique[1])
labels_max_cnt = np.max(labels_unique[1])

meta["n_classes"] = int(len(labels_unique))
meta["class_normalized_entropy"] = float(entropy(labels, normalized=True))
meta["num_classes"] = int(len(labels_unique))
meta["min_class_count"] = int(labels_min_cnt)
meta["max_class_count"] = int(labels_max_cnt)
meta["avg_class_count"] = float(np.average(labels_unique[1]))
Expand All @@ -78,19 +78,19 @@ def regression_stats(y: pd.Series, meta):
labels_avg = np.average(labels)
labels_max = max(labels)
labels_min = min(labels)
meta["n_classes"] = 0
meta["response_min_val"] = float(labels_min)
meta["response_avg_val"] = float(labels_avg)
meta["response_max_val"] = float(labels_max)


def data_stats(
X: pd.DataFrame, y, is_classification, categorical_mask: Iterable[bool], meta
X: pd.DataFrame, categorical_mask: Iterable[bool], meta
):
"""Computes data statistics on instances.
Args:
X (pd.DataFrame): Instances.
y (pd.DataFrame or pd.Series): outputs
categorical_mask (Iterable[bool]): Boolean mask on which columns are categorical.
Returns:
Expand All @@ -113,16 +113,10 @@ def data_stats(
avg_prop_special_values += prop_special_values
avg_prop_special_values /= X.shape[1]

n_classes = 0
if is_classification and y is not None:
n_classes = len(np.unique(y))
prop_cat_features = float(sum([int(x) for x in categorical_mask]))
prop_cat_features /= len(categorical_mask)

prop_cat_cols = float(sum([int(x) for x in categorical_mask]))
prop_cat_cols /= len(categorical_mask)

meta["n_rows"] = int(X.shape[0])
meta["n_cols"] = int(X.shape[1])
meta["n_classes"] = int(n_classes)
meta["prop_cat_cols"] = float(prop_cat_cols)
meta["row_col_ratio"] = float(X.shape[0]) / float(X.shape[1])
meta["n_samples"] = int(X.shape[0])
meta["n_features"] = int(X.shape[1])
meta["prop_cat_features"] = float(prop_cat_features)
meta["avg_prop_special_values"] = float(avg_prop_special_values)
2 changes: 1 addition & 1 deletion python/powerlift/tests/powerlift/bench/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def _err_handler(e):


def _trials(task):
if task.problem == "binary" and task.meta["n_rows"] <= 10000:
if task.problem == "binary" and task.meta["n_samples"] <= 10000:
return ["rf", "svm"]
return []

Expand Down

0 comments on commit fc57d3e

Please sign in to comment.