Skip to content

Commit

Permalink
ref(seer grouping): Use new CircuitBreaker class, take 2 (#74898)
Browse files Browse the repository at this point in the history
This is a redo of #74563 (which got reverted), with the difference that the circuit breaker is now reinstantiated before each use, rather than being instantiated once at the module level. (The problem with that setup was that the module turns out to initialize before our database connections are established, meaning we only get the hardcoded default config rather than the one provided by the options automator.)
  • Loading branch information
lobsterkatie authored and Christinarlong committed Jul 26, 2024
1 parent c9255af commit 704220a
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 74 deletions.
1 change: 1 addition & 0 deletions src/sentry/conf/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -3435,6 +3435,7 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
SEER_HASH_GROUPING_RECORDS_DELETE_URL = (
f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete-by-hash"
)
SEER_SIMILARITY_CIRCUIT_BREAKER_KEY = "seer.similarity"

SIMILARITY_BACKFILL_COHORT_MAP: dict[str, list[int]] = {}

Expand Down
40 changes: 9 additions & 31 deletions src/sentry/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,7 @@
from sentry.utils.circuit_breaker import (
ERROR_COUNT_CACHE_KEY,
CircuitBreakerPassthrough,
CircuitBreakerTripped,
circuit_breaker_activated,
with_circuit_breaker,
)
from sentry.utils.dates import to_datetime
from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled
Expand Down Expand Up @@ -1538,13 +1536,18 @@ def _save_aggregate(
seer_matched_group = None

if should_call_seer_for_grouping(event, primary_hashes):
metrics.incr(
"grouping.similarity.did_call_seer",
# TODO: Consider lowering this (in all the spots this metric is
# collected) once we roll Seer grouping out more widely
sample_rate=1.0,
tags={"call_made": True, "blocker": "none"},
)
try:
# If the `projects:similarity-embeddings-grouping` feature is disabled,
# we'll still get back result metadata, but `seer_matched_group` will be None
seer_response_data, seer_matched_group = with_circuit_breaker(
"event_manager.get_seer_similar_issues",
lambda: get_seer_similar_issues(event, primary_hashes),
options.get("seer.similarity.circuit-breaker-config"),
seer_response_data, seer_matched_group = get_seer_similar_issues(
event, primary_hashes
)
event.data["seer_similarity"] = seer_response_data

Expand All @@ -1555,33 +1558,8 @@ def _save_aggregate(
"seer_similarity"
] = seer_response_data

metrics.incr(
"grouping.similarity.did_call_seer",
# TODO: Consider lowering this (in all the spots this metric is
# collected) once we roll Seer grouping out more widely
sample_rate=1.0,
tags={"call_made": True, "blocker": "none"},
)

except CircuitBreakerTripped:
# TODO: Do we want to include all of the conditions which cause us to log a
# `grouping.similarity.seer_call_blocked` metric (here and in
# `should_call_seer_for_grouping`) under a single outcome tag on the span
# and timer metric below and in `record_calculation_metric_with_result`
# (also below)? Right now they just fall into the `new_group` bucket.
metrics.incr(
"grouping.similarity.did_call_seer",
sample_rate=1.0,
tags={"call_made": False, "blocker": "circuit-breaker"},
)

# Insurance - in theory we shouldn't ever land here
except Exception as e:
metrics.incr(
"grouping.similarity.did_call_seer",
sample_rate=1.0,
tags={"call_made": True, "blocker": "none"},
)
sentry_sdk.capture_exception(
e, tags={"event": event.event_id, "project": project.id}
)
Expand Down
40 changes: 34 additions & 6 deletions src/sentry/grouping/ingest/seer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from dataclasses import asdict

from django.conf import settings

from sentry import features, options
from sentry import ratelimits as ratelimiter
from sentry.eventstore.models import Event
Expand All @@ -17,6 +19,7 @@
killswitch_enabled,
)
from sentry.utils import metrics
from sentry.utils.circuit_breaker2 import CircuitBreaker
from sentry.utils.safe import get_path

logger = logging.getLogger("sentry.events.grouping")
Expand Down Expand Up @@ -45,12 +48,11 @@ def should_call_seer_for_grouping(event: Event, primary_hashes: CalculatedHashes
# (Checking the rate limit for calling Seer also increments the counter of how many times we've
# tried to call it, and if we fail any of the other checks, it shouldn't count as an attempt.
# Thus we only want to run the rate limit check if every other check has already succeeded.)
#
# Note: The circuit breaker check which might naturally be here alongside its killswitch
# and rate limiting friends instead happens in the `with_circuit_breaker` helper used where
# `get_seer_similar_issues` is actually called. (It has to be there in order for it to track
# errors arising from that call.)
if killswitch_enabled(project.id, event) or _ratelimiting_enabled(event, project):
if (
killswitch_enabled(project.id, event)
or _circuit_breaker_broken(event, project)
or _ratelimiting_enabled(event, project)
):
return False

return True
Expand Down Expand Up @@ -155,6 +157,32 @@ def _ratelimiting_enabled(event: Event, project: Project) -> bool:
return False


def _circuit_breaker_broken(event: Event, project: Project) -> bool:
breaker_config = options.get("seer.similarity.circuit-breaker-config")
circuit_breaker = CircuitBreaker(settings.SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, breaker_config)
circuit_broken = not circuit_breaker.should_allow_request()

if circuit_broken:
logger.warning(
"should_call_seer_for_grouping.broken_circuit_breaker",
extra={
"event_id": event.event_id,
"project_id": project.id,
**breaker_config,
},
)
metrics.incr(
"grouping.similarity.broken_circuit_breaker",
)
metrics.incr(
"grouping.similarity.did_call_seer",
sample_rate=1.0,
tags={"call_made": False, "blocker": "circuit-breaker"},
)

return circuit_broken


def get_seer_similar_issues(
event: Event,
primary_hashes: CalculatedHashes,
Expand Down
15 changes: 12 additions & 3 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,12 +892,21 @@
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

# TODO: The default error limit here was estimated based on EA traffic. (In an average 10 min
# period, there are roughly 35K events without matching hashes. About 2% of orgs are EA, so for
# simplicity, assume 2% of those events are from EA orgs. If we're willing to tolerate up to a 95%
# failure rate, then we need 35K * 0.02 * 0.95 events to fail to trip the breaker.)
#
# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current
# calculation), and remove this TODO.
register(
"seer.similarity.circuit-breaker-config",
type=Dict,
# TODO: For now we're using the defaults for everything but `allow_passthrough`. We may want to
# revisit that choice in the future.
default={"allow_passthrough": True},
default={
"error_limit": 666,
"error_limit_window": 600, # 10 min
"broken_state_duration": 300, # 5 min
},
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

Expand Down
17 changes: 16 additions & 1 deletion src/sentry/seer/similarity/similar_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from django.conf import settings
from urllib3.exceptions import MaxRetryError, TimeoutError

from sentry.conf.server import SEER_MAX_GROUPING_DISTANCE, SEER_SIMILAR_ISSUES_URL
from sentry import options
from sentry.conf.server import (
SEER_MAX_GROUPING_DISTANCE,
SEER_SIMILAR_ISSUES_URL,
SEER_SIMILARITY_CIRCUIT_BREAKER_KEY,
)
from sentry.models.grouphash import GroupHash
from sentry.net.http import connection_from_url
from sentry.seer.signed_seer_api import make_signed_seer_api_request
Expand All @@ -15,6 +20,7 @@
)
from sentry.tasks.delete_seer_grouping_records import delete_seer_grouping_records_by_hash
from sentry.utils import json, metrics
from sentry.utils.circuit_breaker2 import CircuitBreaker
from sentry.utils.json import JSONDecodeError, apply_key_filter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -95,6 +101,11 @@ def get_similarity_data_from_seer(
)
return []

circuit_breaker = CircuitBreaker(
SEER_SIMILARITY_CIRCUIT_BREAKER_KEY,
options.get("seer.similarity.circuit-breaker-config"),
)

try:
response = make_signed_seer_api_request(
seer_grouping_connection_pool,
Expand All @@ -111,6 +122,7 @@ def get_similarity_data_from_seer(
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={**metric_tags, "outcome": "error", "error": type(e).__name__},
)
circuit_breaker.record_error()
return []

metric_tags["response_status"] = response.status
Expand All @@ -137,6 +149,9 @@ def get_similarity_data_from_seer(
},
)

if response.status >= 500:
circuit_breaker.record_error()

return []

try:
Expand Down
18 changes: 0 additions & 18 deletions tests/sentry/event_manager/grouping/test_seer_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from sentry.testutils.helpers.eventprocessing import save_new_event
from sentry.testutils.helpers.features import with_feature
from sentry.testutils.pytest.mocking import capture_results
from sentry.utils.circuit_breaker import with_circuit_breaker
from sentry.utils.types import NonNone


Expand Down Expand Up @@ -85,23 +84,6 @@ def test_obeys_seer_similarity_flags(self):
assert get_seer_similar_issues_return_values[0][1] == existing_event.group
assert new_event.group_id == existing_event.group_id

@patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True)
@patch("sentry.event_manager.with_circuit_breaker", wraps=with_circuit_breaker)
@patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
def test_obeys_circult_breaker(
self, mock_get_seer_similar_issues: MagicMock, mock_with_circuit_breaker: MagicMock, _
):
with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=True):
save_new_event({"message": "Dogs are great!"}, self.project)
assert mock_with_circuit_breaker.call_count == 1
assert mock_get_seer_similar_issues.call_count == 1

with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=False):
save_new_event({"message": "Adopt don't shop"}, self.project)

assert mock_with_circuit_breaker.call_count == 2 # increased
assert mock_get_seer_similar_issues.call_count == 1 # didn't increase

@patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True)
@patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
def test_calls_seer_if_no_group_found(self, mock_get_seer_similar_issues: MagicMock, _):
Expand Down
12 changes: 12 additions & 0 deletions tests/sentry/grouping/ingest/test_seer.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ def test_obeys_project_ratelimit(self):
is expected_result
)

@with_feature("projects:similarity-embeddings-grouping")
def test_obeys_circuit_breaker(self):
for request_allowed, expected_result in [(True, True), (False, False)]:
with patch(
"sentry.grouping.ingest.seer.CircuitBreaker.should_allow_request",
return_value=request_allowed,
):
assert (
should_call_seer_for_grouping(self.event, self.primary_hashes)
is expected_result
)

@with_feature("projects:similarity-embeddings-grouping")
def test_obeys_customized_fingerprint_check(self):
default_fingerprint_event = Event(
Expand Down
65 changes: 50 additions & 15 deletions tests/sentry/seer/similarity/test_similar_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,15 @@ def test_no_groups_found(self, mock_seer_request: MagicMock, mock_metrics_incr:
tags={"response_status": 200, "outcome": "no_similar_groups"},
)

@mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
@mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
@mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr: MagicMock):
def test_bad_response_data(
self,
mock_seer_request: MagicMock,
mock_metrics_incr: MagicMock,
mock_record_circuit_breaker_error: MagicMock,
):
cases: list[tuple[Any, str]] = [
(None, "AttributeError"),
([], "AttributeError"),
Expand Down Expand Up @@ -139,14 +145,20 @@ def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={"response_status": 200, "outcome": "error", "error": expected_error},
)
assert mock_record_circuit_breaker_error.call_count == 0

mock_metrics_incr.reset_mock()

@mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
@mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
@mock.patch("sentry.seer.similarity.similar_issues.logger")
@mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
def test_redirect(
self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
self,
mock_seer_request: MagicMock,
mock_logger: MagicMock,
mock_metrics_incr: MagicMock,
mock_record_circuit_breaker_error: MagicMock,
):
mock_seer_request.return_value = HTTPResponse(
status=308, headers={"location": "/new/and/improved/endpoint/"}
Expand All @@ -161,12 +173,18 @@ def test_redirect(
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={"response_status": 308, "outcome": "error", "error": "Redirect"},
)
assert mock_record_circuit_breaker_error.call_count == 0

@mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
@mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
@mock.patch("sentry.seer.similarity.similar_issues.logger")
@mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
def test_request_error(
self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
self,
mock_seer_request: MagicMock,
mock_logger: MagicMock,
mock_metrics_incr: MagicMock,
mock_record_circuit_breaker_error: MagicMock,
):
for request_error, expected_error_tag in [
(TimeoutError, "TimeoutError"),
Expand All @@ -192,25 +210,42 @@ def test_request_error(
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={"outcome": "error", "error": expected_error_tag},
)
assert mock_record_circuit_breaker_error.call_count == 1

mock_logger.warning.reset_mock()
mock_metrics_incr.reset_mock()
mock_record_circuit_breaker_error.reset_mock()

@mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
@mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
@mock.patch("sentry.seer.similarity.similar_issues.logger")
@mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
def test_error_status(
self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
self,
mock_seer_request: MagicMock,
mock_logger: MagicMock,
mock_metrics_incr: MagicMock,
mock_record_circuit_breaker_error: MagicMock,
):
mock_seer_request.return_value = HTTPResponse("No soup for you", status=403)
for response, status, counts_for_circuit_breaker in [
("No soup for you", 403, False),
("No soup, period", 500, True),
]:
mock_seer_request.return_value = HTTPResponse(response, status=status)

assert get_similarity_data_from_seer(self.request_params) == []
mock_logger.error.assert_called_with(
f"Received 403 when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.",
extra={"response_data": "No soup for you"},
)
mock_metrics_incr.assert_any_call(
"seer.similar_issues_request",
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={"response_status": 403, "outcome": "error", "error": "RequestError"},
)
assert get_similarity_data_from_seer(self.request_params) == []
mock_logger.error.assert_called_with(
f"Received {status} when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.",
extra={"response_data": response},
)
mock_metrics_incr.assert_any_call(
"seer.similar_issues_request",
sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
tags={"response_status": status, "outcome": "error", "error": "RequestError"},
)
assert mock_record_circuit_breaker_error.call_count == (
1 if counts_for_circuit_breaker else 0
)

@mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
def test_returns_sorted_results(self, mock_seer_request: MagicMock):
Expand Down

0 comments on commit 704220a

Please sign in to comment.