implement CI
antoine-galataud committed Apr 17, 2024
1 parent 94add97 commit 1b0f768
Showing 2 changed files with 140 additions and 16 deletions.
123 changes: 108 additions & 15 deletions hopes/ope/estimators.py
@@ -123,11 +123,89 @@ def check_parameters(self) -> None:
if not np.allclose(np.sum(array, axis=1), np.ones(array.shape[0], dtype=float)):
raise ValueError(f"The {name} must sum to 1 on each sample.")

def estimate_policy_value_with_confidence_interval(
self,
num_samples: int = 1000,
significance_level: float = 0.05,
) -> dict[str, float]:
"""Estimate the confidence interval of the policy value.
This method uses bootstrapping to estimate the confidence interval of the policy value. The input data is
sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`.
Example:
.. code-block:: python
ipw = InverseProbabilityWeighting()
ipw.set_parameters(
target_policy_action_probabilities=target_policy_action_probabilities,
behavior_policy_action_probabilities=behavior_policy_action_probabilities,
rewards=rewards,
)
metrics = ipw.estimate_policy_value_with_confidence_interval(
num_samples=1000, significance_level=0.05
)
print(metrics)
Should output:
.. code-block:: python
{
"lower_bound": 0.2,
"upper_bound": 4.0,
"mean": 3.2,
"std": 0.4,
}
:param num_samples: the number of bootstrap samples to use.
:param significance_level: the significance level of the confidence interval.
:return: a dictionary containing the confidence interval of the policy value. The keys are:
- "lower_bound": the lower bound of the policy value, given the significance level.
- "upper_bound": the upper bound of the policy value, given the significance level.
- "mean": the mean of the policy value.
- "std": the standard deviation of the policy value.
"""
weighted_rewards = self.estimate_weighted_rewards()
assert (
weighted_rewards is not None and len(weighted_rewards) > 0
), "The weighted rewards must not be empty."

weighted_rewards = weighted_rewards.reshape(-1)
boot_samples = []
for _ in np.arange(num_samples):
    boot_samples.append(
        # resample the weighted rewards with replacement, then average each resample
        np.mean(np.random.choice(weighted_rewards, size=weighted_rewards.shape[0]))
    )

lower_bound = np.quantile(boot_samples, significance_level / 2)
upper_bound = np.quantile(boot_samples, 1 - significance_level / 2)

return {
"lower_bound": lower_bound,
"upper_bound": upper_bound,
"mean": np.mean(boot_samples),
"std": np.std(boot_samples),
}

@abstractmethod
def estimate_weighted_rewards(self) -> np.ndarray:
"""Estimate the weighted rewards.
This method should be overridden by subclasses to implement the specific estimator.
:return: the weighted rewards.
"""
pass

@abstractmethod
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy.
This method should be overridden by subclasses to implement the specific estimator.
This method should be overridden by subclasses to implement the specific estimator. The typical implementation
should call :meth:`estimate_weighted_rewards` to compute the weighted rewards, then compute the policy value.
:return: the estimated value of the target policy.
"""
@@ -161,15 +239,20 @@ def __init__(self) -> None:
self.importance_weights: np.ndarray | None = None

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the IPW estimator."""
def estimate_weighted_rewards(self) -> np.ndarray:
"""Estimate the weighted rewards using the IPW estimator."""
self.importance_weights = None
self.check_parameters()

self.importance_weights = (
self.target_policy_action_probabilities / self.behavior_policy_action_probabilities
)
return np.mean(self.importance_weights * self.rewards.reshape(-1, 1))
return self.importance_weights * self.rewards.reshape(-1, 1)

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the IPW estimator."""
return np.mean(self.estimate_weighted_rewards())
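For context on the importance-weight ratio computed in estimate_weighted_rewards above, a small hand-worked IPW example (toy numbers, not taken from the repository):

    import numpy as np

    # probabilities of the logged actions under each policy (toy values)
    target_probs = np.array([[0.8], [0.1], [0.5]])
    behavior_probs = np.array([[0.4], [0.5], [0.5]])
    rewards = np.array([1.0, 0.0, 2.0])

    weights = target_probs / behavior_probs          # [[2.0], [0.2], [1.0]]
    weighted_rewards = weights * rewards.reshape(-1, 1)
    print(np.mean(weighted_rewards))                 # IPW estimate: (2.0 + 0.0 + 2.0) / 3 ≈ 1.33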


class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting):
@@ -192,16 +275,20 @@ def __init__(self) -> None:
super().__init__()

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
    """Estimate the value of the target policy using the SNIPW estimator.

    This essentially normalizes the importance weights to avoid high variance.
    """
    super().estimate_policy_value()

    return np.sum(self.importance_weights * self.rewards.reshape(-1, 1)) / np.sum(
        self.importance_weights
    )

@override(BaseEstimator)
def estimate_weighted_rewards(self) -> np.ndarray:
    """Estimate the weighted rewards using the SNIPW estimator."""
    super().estimate_weighted_rewards()

    weighted_rewards = self.importance_weights * self.rewards.reshape(-1, 1)
    # divide by the mean importance weight so that the mean of these per-sample values
    # equals the self-normalized policy value computed below
    return weighted_rewards / np.mean(self.importance_weights)

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
    """Estimate the value of the target policy using the SNIPW estimator."""
    super().estimate_weighted_rewards()

    weighted_rewards = self.importance_weights * self.rewards.reshape(-1, 1)
    return np.sum(weighted_rewards) / np.sum(self.importance_weights)
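To illustrate the self-normalization step above, a toy comparison of the IPW and SNIPW values on the same data (values are illustrative only):

    import numpy as np

    weights = np.array([[2.0], [0.2], [1.0]])
    rewards = np.array([1.0, 0.0, 2.0]).reshape(-1, 1)

    ipw_value = np.mean(weights * rewards)                      # 4.0 / 3 ≈ 1.33
    snipw_value = np.sum(weights * rewards) / np.sum(weights)   # 4.0 / 3.2 = 1.25
    print(ipw_value, snipw_value)  # normalizing by the realized weight mass tempers extreme weights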


class DirectMethod(BaseEstimator):
@@ -301,8 +388,8 @@ def check_parameters(self) -> None:
), "The number of samples must be the same for the behavior policy and the target policy."

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Direct Method estimator."""
def estimate_weighted_rewards(self) -> np.ndarray:
"""Estimate the weighted rewards using the Direct Method estimator."""
self.check_parameters()

# use the Q model to predict the expected rewards
@@ -318,4 +405,10 @@ def estimate_policy_value(self) -> float:
.reshape(-1, self.steps_per_episode)
)
initial_state_value = state_value[:, 0]
return np.mean(initial_state_value)

return initial_state_value

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Direct Method estimator."""
return np.mean(self.estimate_weighted_rewards())
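A compact sketch of how the Direct Method lines above roll per-step value predictions into per-episode initial-state values. The q_predictions array is a made-up stand-in for the per-step expected rewards that the fitted Q model and target policy probabilities produce in the code above.

    import numpy as np

    steps_per_episode = 2
    # made-up per-step expected rewards under the target policy: 3 episodes x 2 steps
    q_predictions = np.array([0.5, 0.1, 0.9, 0.3, 0.2, 0.4])

    state_value = q_predictions.reshape(-1, steps_per_episode)  # shape (3, 2)
    initial_state_value = state_value[:, 0]                     # value of each episode's first state
    print(np.mean(initial_state_value))                         # DM policy value: (0.5 + 0.9 + 0.2) / 3 ≈ 0.53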
33 changes: 32 additions & 1 deletion tests/test_estimators.py
@@ -4,6 +4,7 @@
from action_probs_utils import generate_action_probs

from hopes.ope.estimators import (
BaseEstimator,
DirectMethod,
InverseProbabilityWeighting,
SelfNormalizedInverseProbabilityWeighting,
@@ -44,10 +45,16 @@ def test_ipw(self):
rewards=rewards,
)

wrew = ipw.estimate_weighted_rewards()
self.assertIsInstance(wrew, np.ndarray)
self.assertEqual(wrew.shape, (10, 3))
policy_value = ipw.estimate_policy_value()
self.assertIsInstance(policy_value, float)
self.assertGreaterEqual(policy_value, 0.0)

# test CI
self._test_ci(ipw)

# test with zero rewards
rewards = np.zeros(10)

@@ -74,14 +81,21 @@ def test_snipw(self):
rewards=rewards,
)

wrew = snipw.estimate_weighted_rewards()
self.assertIsInstance(wrew, np.ndarray)
self.assertEqual(wrew.shape, (10, 3))
policy_value = snipw.estimate_policy_value()
self.assertIsInstance(policy_value, float)
self.assertGreaterEqual(policy_value, 0.0)

# test CI
self._test_ci(snipw)

def test_dm(self):
num_actions = 3
num_obs = 10
num_samples = 100
num_steps_per_episode = 2
obs = np.random.rand(num_samples, num_obs)
act = np.random.randint(num_actions, size=num_samples)
rew = np.random.rand(num_samples)
@@ -96,7 +110,7 @@ def test_dm(self):
behavior_policy_obs=obs,
behavior_policy_act=act,
behavior_policy_rewards=rew,
steps_per_episode=2,
steps_per_episode=num_steps_per_episode,
)
fit_stats = dm.fit()
self.assertIsInstance(fit_stats, dict)
@@ -107,10 +121,16 @@ def test_dm(self):
behavior_policy_action_probabilities=None,
rewards=None,
)

wrew = dm.estimate_weighted_rewards()
self.assertIsInstance(wrew, np.ndarray)
self.assertEqual(wrew.shape, (num_samples // num_steps_per_episode,))
policy_value = dm.estimate_policy_value()
self.assertIsInstance(policy_value, float)
self.assertGreaterEqual(policy_value, 0.0)

self._test_ci(dm)

def test_neg_rewards(self):
ipw = InverseProbabilityWeighting()

@@ -125,3 +145,14 @@ def test_neg_rewards(self):
rewards=rewards,
)
self.assertTrue("The rewards must be non-negative" in str(e.exception))

def _test_ci(self, estimator: BaseEstimator):
# test CI
metrics = estimator.estimate_policy_value_with_confidence_interval(
num_samples=1000, significance_level=0.05
)
self.assertIsInstance(metrics, dict)
for m in ["mean", "lower_bound", "upper_bound", "std"]:
self.assertIn(m, metrics)
self.assertIsInstance(metrics[m], float)
self.assertTrue(metrics["lower_bound"] <= metrics["mean"] <= metrics["upper_bound"])
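To exercise the new tests locally, and assuming pytest is the project's test runner (the assertions are unittest-style, so python -m unittest would also work), something like:

    python -m pytest tests/test_estimators.py -v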
