Move ImageNet-R, ImageNet-C, and other example tasks #85

Merged · 9 commits · Jul 17, 2022
6 changes: 5 additions & 1 deletion shifthappens/tasks/__init__.py
@@ -1,3 +1,7 @@
"""Utility methods and classes for the benchmarkk's tasks and the individual tasks."""
"""Utility methods and classes for the benchmark's tasks and the individual tasks."""

+from shifthappens.tasks import imagenet_c  # noqa: F401
+from shifthappens.tasks import imagenet_r  # noqa: F401
+from shifthappens.tasks import raccoons_ood  # noqa: F401

from .base import Task # noqa: F401
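
The new imports above exist only for their side effects: importing each task module presumably lets the task register itself with the benchmark, which is why flake8's unused-import warning is silenced with `# noqa: F401`. A minimal sketch of that registration pattern (every name here is illustrative, not the actual shifthappens API):

```python
# Hypothetical registration-on-import pattern; `_registry` and
# `register_task` are illustrative names, not shifthappens internals.
_registry = {}

def register_task(name):
    """Record a task class in the global registry under `name`."""
    def wrapper(cls):
        _registry[name] = cls
        return cls
    return wrapper

@register_task("imagenet_r")
class ImageNetR:
    pass

# Merely importing the module that defines ImageNetR populates the
# registry, so the package imports it even though the name is unused.
```
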
15 changes: 15 additions & 0 deletions shifthappens/tasks/imagenet_c/README.rst
@@ -0,0 +1,15 @@
Example for a Shift Happens task on ImageNet-C
==============================================
Evaluate the classification accuracy on a single corruption type of the ImageNet-C
dataset [1]. Each corruption type comes in 5 severity levels. The raw images
(before corruption) in this dataset come from the ImageNet validation set.

While the dataset is ImageNet-C, the task's definition differs from the usual
evaluation paradigm:

- we allow the model to access the unlabeled test set separately for every corruption
- we allow it to make its predictions based on a batch of samples coming from the same corruption type (a sketch follows the reference below).

1. Benchmarking Neural Network Robustness to Common Corruptions and Perturbations.
Dan Hendrycks and Thomas Dietterich. 2019.
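
A minimal sketch of the evaluation loop the two bullet points describe: accuracy is computed separately for each corruption-and-severity split, and the model may adapt on whole batches drawn from a single split. The loader and model interfaces here are assumptions, not the actual shifthappens API:

```python
# Illustrative per-corruption evaluation; `model.predict` and the
# loader layout are assumptions, not the real shifthappens interfaces.
import numpy as np

def evaluate_per_corruption(model, corruption_loaders):
    """corruption_loaders maps (corruption, severity) to an iterable of
    (images, labels) batches; every batch comes from a single split."""
    accuracies = {}
    for split, loader in corruption_loaders.items():
        correct, total = 0, 0
        for images, labels in loader:
            preds = model.predict(images)  # may adapt to the whole batch
            correct += int(np.sum(preds == np.asarray(labels)))
            total += len(labels)
        accuracies[split] = correct / total
    return accuracies
```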

Empty file.
@@ -453,9 +453,3 @@ def _evaluate(self, model: sh_models.Model) -> TaskResult:
mce=np.mean(mces).item(),
summary_metrics={Metric.Robustness: "accuracy"},
)


-if __name__ == "__main__":
-    from shifthappens.models.torchvision import ResNet18
-
-    sh_benchmark.evaluate_model(ResNet18(device="cpu", max_batch_size=128), "test_data")
8 changes: 8 additions & 0 deletions shifthappens/tasks/imagenet_r/README.rst
@@ -0,0 +1,8 @@
Example for a Shift Happens task on ImageNet-R
==============================================

Measures the classification accuracy on ImageNet-R [1], a dataset containing different renditions of 200 ImageNet classes (30,000 samples in total); a scoring sketch follows the reference below.

1. The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization.
Dan Hendrycks, Steven Basart, Norman Mu, Saurav Kadavath, Frank Wang, Evan Dorundo, Rahul Desai,
Tyler Zhu, Samyak Parajuli, Mike Guo, Dawn Song, Jacob Steinhardt and Justin Gilmer. 2021.
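
Since ImageNet-R covers only 200 of the 1,000 ImageNet classes, a common way to score a standard 1000-way classifier is to restrict its logits to that subset before taking the argmax. A minimal sketch under that assumption; the masking strategy and names are illustrative, not necessarily what the task implements:

```python
# Illustrative subset-masked accuracy; `imagenet_r_class_indices` is
# assumed to hold the 200 ImageNet class ids present in ImageNet-R.
import numpy as np

def imagenet_r_accuracy(logits, labels, imagenet_r_class_indices):
    """logits: (N, 1000) array; labels: (N,) indices into the 200-class subset."""
    sub_logits = logits[:, imagenet_r_class_indices]  # keep the 200 columns
    preds = sub_logits.argmax(axis=1)
    return float(np.mean(preds == labels))
```
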
Empty file.
@@ -36,7 +36,7 @@ class ImageNetR(Task):
(
"imagenet-r.tar",
"https://people.eecs.berkeley.edu/~hendrycks/imagenet-r.tar",
"A61312130A589D0CA1A8FCA1F2BD3337",
"a61312130a589d0ca1a8fca1f2bd3337",
)
]
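
The lowercased checksum above matters if the download helper compares digests by string equality: Python's `hashlib.md5(...).hexdigest()` always returns lowercase hex, so an uppercase expected value can never match. A minimal sketch of a case-safe check; `verify_md5` is illustrative, not the repo's actual validator:

```python
# Illustrative case-safe MD5 check; hexdigest() is always lowercase,
# so the expected value is normalized before comparison.
import hashlib

def verify_md5(path, expected_md5):
    with open(path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return digest == expected_md5.lower()
```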

@@ -83,9 +83,3 @@ def _evaluate(self, model: sh_models.Model) -> TaskResult:
return TaskResult(
accuracy=accuracy, summary_metrics={Metric.Robustness: "accuracy"}
)


-if __name__ == "__main__":
-    from shifthappens.models.torchvision import ResNet18
-
-    sh_benchmark.evaluate_model(ResNet18(device="cpu", max_batch_size=128), "data")
10 changes: 10 additions & 0 deletions shifthappens/tasks/raccoons_ood/README.rst
@@ -0,0 +1,10 @@
Example for a Shift Happens task on Raccoons dataset
====================================================
This task evaluates models' out-of-distribution (OOD) detection on 200 raccoon images.
Raccoons do not appear among the ImageNet classes, so the task uses the models'
confidences (maximal predicted class probability) on the ImageNet validation set and
the raccoon images (ImageNet samples treated as class 1 and raccoons as class 0) to
measure the AUROC and the FPR at a TPR of 0.95.

The original dataset was collected by Dat Tran for an object detection task
and can be found at https://github.com/datitran/raccoon_dataset.
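
A minimal sketch of how the two metrics named above can be computed from max-softmax confidences, using scikit-learn for the ROC curve; the function and its inputs are assumptions, not the task's actual code:

```python
# Illustrative AUROC / FPR@95%TPR computation; confidences are assumed
# to be max-softmax scores for ImageNet (class 1) and raccoons (class 0).
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

def ood_metrics(confidences_in, confidences_out):
    scores = np.concatenate([confidences_in, confidences_out])
    labels = np.concatenate([np.ones(len(confidences_in)),
                             np.zeros(len(confidences_out))])
    auroc = roc_auc_score(labels, scores)
    fpr, tpr, _ = roc_curve(labels, scores)
    # smallest false-positive rate among thresholds reaching TPR >= 0.95
    fpr_at_95_tpr = float(np.min(fpr[tpr >= 0.95]))
    return auroc, fpr_at_95_tpr
```
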
Empty file.
@@ -114,12 +114,3 @@ def _evaluate(self, model: sh_models.Model) -> TaskResult:
Metric.Robustness: "accuracy", # remove for pure OOD detection
},
)


-if __name__ == "__main__":
-    from shifthappens.models.torchvision import ResNet18
-
-    sh_benchmark.evaluate_model(
-        ResNet18(device="cpu", max_batch_size=128),
-        "data",
-    )