From 704220acb630a8dabfa1025f4959b1efb93d5251 Mon Sep 17 00:00:00 2001 From: Katie Byers Date: Thu, 25 Jul 2024 09:39:33 -0700 Subject: [PATCH] ref(seer grouping): Use new CircuitBreaker class, take 2 (#74898) This is a redo of https://github.com/getsentry/sentry/pull/74563 (which got reverted), with the difference that the circuit breaker is now reinstantiated before each use, rather than being instantiated once at the module level. (The problem with that setup was that the module turns out to initialize before our database connections are established, meaning we only get the hardcoded default config rather than the one provided by the options automator.) --- src/sentry/conf/server.py | 1 + src/sentry/event_manager.py | 40 +++--------- src/sentry/grouping/ingest/seer.py | 40 ++++++++++-- src/sentry/options/defaults.py | 15 ++++- src/sentry/seer/similarity/similar_issues.py | 17 ++++- .../grouping/test_seer_grouping.py | 18 ----- tests/sentry/grouping/ingest/test_seer.py | 12 ++++ .../seer/similarity/test_similar_issues.py | 65 ++++++++++++++----- 8 files changed, 134 insertions(+), 74 deletions(-) diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py index fe139502dee4af..4d34481033cec0 100644 --- a/src/sentry/conf/server.py +++ b/src/sentry/conf/server.py @@ -3435,6 +3435,7 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]: SEER_HASH_GROUPING_RECORDS_DELETE_URL = ( f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete-by-hash" ) +SEER_SIMILARITY_CIRCUIT_BREAKER_KEY = "seer.similarity" SIMILARITY_BACKFILL_COHORT_MAP: dict[str, list[int]] = {} diff --git a/src/sentry/event_manager.py b/src/sentry/event_manager.py index 95728d49b980ce..cd4db48a36ec81 100644 --- a/src/sentry/event_manager.py +++ b/src/sentry/event_manager.py @@ -129,9 +129,7 @@ from sentry.utils.circuit_breaker import ( ERROR_COUNT_CACHE_KEY, CircuitBreakerPassthrough, - CircuitBreakerTripped, circuit_breaker_activated, - with_circuit_breaker, ) from sentry.utils.dates import to_datetime from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled @@ -1538,13 +1536,18 @@ def _save_aggregate( seer_matched_group = None if should_call_seer_for_grouping(event, primary_hashes): + metrics.incr( + "grouping.similarity.did_call_seer", + # TODO: Consider lowering this (in all the spots this metric is + # collected) once we roll Seer grouping out more widely + sample_rate=1.0, + tags={"call_made": True, "blocker": "none"}, + ) try: # If the `projects:similarity-embeddings-grouping` feature is disabled, # we'll still get back result metadata, but `seer_matched_group` will be None - seer_response_data, seer_matched_group = with_circuit_breaker( - "event_manager.get_seer_similar_issues", - lambda: get_seer_similar_issues(event, primary_hashes), - options.get("seer.similarity.circuit-breaker-config"), + seer_response_data, seer_matched_group = get_seer_similar_issues( + event, primary_hashes ) event.data["seer_similarity"] = seer_response_data @@ -1555,33 +1558,8 @@ def _save_aggregate( "seer_similarity" ] = seer_response_data - metrics.incr( - "grouping.similarity.did_call_seer", - # TODO: Consider lowering this (in all the spots this metric is - # collected) once we roll Seer grouping out more widely - sample_rate=1.0, - tags={"call_made": True, "blocker": "none"}, - ) - - except CircuitBreakerTripped: - # TODO: Do we want to include all of the conditions which cause us to log a - # `grouping.similarity.seer_call_blocked` metric (here and in - # `should_call_seer_for_grouping`) under a single outcome tag on the span - # and timer metric below and in `record_calculation_metric_with_result` - # (also below)? Right now they just fall into the `new_group` bucket. - metrics.incr( - "grouping.similarity.did_call_seer", - sample_rate=1.0, - tags={"call_made": False, "blocker": "circuit-breaker"}, - ) - # Insurance - in theory we shouldn't ever land here except Exception as e: - metrics.incr( - "grouping.similarity.did_call_seer", - sample_rate=1.0, - tags={"call_made": True, "blocker": "none"}, - ) sentry_sdk.capture_exception( e, tags={"event": event.event_id, "project": project.id} ) diff --git a/src/sentry/grouping/ingest/seer.py b/src/sentry/grouping/ingest/seer.py index f8de83a22c0b73..f01145519dae0c 100644 --- a/src/sentry/grouping/ingest/seer.py +++ b/src/sentry/grouping/ingest/seer.py @@ -1,6 +1,8 @@ import logging from dataclasses import asdict +from django.conf import settings + from sentry import features, options from sentry import ratelimits as ratelimiter from sentry.eventstore.models import Event @@ -17,6 +19,7 @@ killswitch_enabled, ) from sentry.utils import metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker from sentry.utils.safe import get_path logger = logging.getLogger("sentry.events.grouping") @@ -45,12 +48,11 @@ def should_call_seer_for_grouping(event: Event, primary_hashes: CalculatedHashes # (Checking the rate limit for calling Seer also increments the counter of how many times we've # tried to call it, and if we fail any of the other checks, it shouldn't count as an attempt. # Thus we only want to run the rate limit check if every other check has already succeeded.) - # - # Note: The circuit breaker check which might naturally be here alongside its killswitch - # and rate limiting friends instead happens in the `with_circuit_breaker` helper used where - # `get_seer_similar_issues` is actually called. (It has to be there in order for it to track - # errors arising from that call.) - if killswitch_enabled(project.id, event) or _ratelimiting_enabled(event, project): + if ( + killswitch_enabled(project.id, event) + or _circuit_breaker_broken(event, project) + or _ratelimiting_enabled(event, project) + ): return False return True @@ -155,6 +157,32 @@ def _ratelimiting_enabled(event: Event, project: Project) -> bool: return False +def _circuit_breaker_broken(event: Event, project: Project) -> bool: + breaker_config = options.get("seer.similarity.circuit-breaker-config") + circuit_breaker = CircuitBreaker(settings.SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, breaker_config) + circuit_broken = not circuit_breaker.should_allow_request() + + if circuit_broken: + logger.warning( + "should_call_seer_for_grouping.broken_circuit_breaker", + extra={ + "event_id": event.event_id, + "project_id": project.id, + **breaker_config, + }, + ) + metrics.incr( + "grouping.similarity.broken_circuit_breaker", + ) + metrics.incr( + "grouping.similarity.did_call_seer", + sample_rate=1.0, + tags={"call_made": False, "blocker": "circuit-breaker"}, + ) + + return circuit_broken + + def get_seer_similar_issues( event: Event, primary_hashes: CalculatedHashes, diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 4426ae5b7d5014..65f1361143159f 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -892,12 +892,21 @@ flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE, ) +# TODO: The default error limit here was estimated based on EA traffic. (In an average 10 min +# period, there are roughly 35K events without matching hashes. About 2% of orgs are EA, so for +# simplicity, assume 2% of those events are from EA orgs. If we're willing to tolerate up to a 95% +# failure rate, then we need 35K * 0.02 * 0.95 events to fail to trip the breaker.) +# +# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current +# calculation), and remove this TODO. register( "seer.similarity.circuit-breaker-config", type=Dict, - # TODO: For now we're using the defaults for everything but `allow_passthrough`. We may want to - # revisit that choice in the future. - default={"allow_passthrough": True}, + default={ + "error_limit": 666, + "error_limit_window": 600, # 10 min + "broken_state_duration": 300, # 5 min + }, flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/seer/similarity/similar_issues.py b/src/sentry/seer/similarity/similar_issues.py index f7b6fbd02df1d0..aa44b16bd3d6f4 100644 --- a/src/sentry/seer/similarity/similar_issues.py +++ b/src/sentry/seer/similarity/similar_issues.py @@ -3,7 +3,12 @@ from django.conf import settings from urllib3.exceptions import MaxRetryError, TimeoutError -from sentry.conf.server import SEER_MAX_GROUPING_DISTANCE, SEER_SIMILAR_ISSUES_URL +from sentry import options +from sentry.conf.server import ( + SEER_MAX_GROUPING_DISTANCE, + SEER_SIMILAR_ISSUES_URL, + SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, +) from sentry.models.grouphash import GroupHash from sentry.net.http import connection_from_url from sentry.seer.signed_seer_api import make_signed_seer_api_request @@ -15,6 +20,7 @@ ) from sentry.tasks.delete_seer_grouping_records import delete_seer_grouping_records_by_hash from sentry.utils import json, metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker from sentry.utils.json import JSONDecodeError, apply_key_filter logger = logging.getLogger(__name__) @@ -95,6 +101,11 @@ def get_similarity_data_from_seer( ) return [] + circuit_breaker = CircuitBreaker( + SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, + options.get("seer.similarity.circuit-breaker-config"), + ) + try: response = make_signed_seer_api_request( seer_grouping_connection_pool, @@ -111,6 +122,7 @@ def get_similarity_data_from_seer( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={**metric_tags, "outcome": "error", "error": type(e).__name__}, ) + circuit_breaker.record_error() return [] metric_tags["response_status"] = response.status @@ -137,6 +149,9 @@ def get_similarity_data_from_seer( }, ) + if response.status >= 500: + circuit_breaker.record_error() + return [] try: diff --git a/tests/sentry/event_manager/grouping/test_seer_grouping.py b/tests/sentry/event_manager/grouping/test_seer_grouping.py index 8671da11790e1b..3180bbcd768ba5 100644 --- a/tests/sentry/event_manager/grouping/test_seer_grouping.py +++ b/tests/sentry/event_manager/grouping/test_seer_grouping.py @@ -10,7 +10,6 @@ from sentry.testutils.helpers.eventprocessing import save_new_event from sentry.testutils.helpers.features import with_feature from sentry.testutils.pytest.mocking import capture_results -from sentry.utils.circuit_breaker import with_circuit_breaker from sentry.utils.types import NonNone @@ -85,23 +84,6 @@ def test_obeys_seer_similarity_flags(self): assert get_seer_similar_issues_return_values[0][1] == existing_event.group assert new_event.group_id == existing_event.group_id - @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True) - @patch("sentry.event_manager.with_circuit_breaker", wraps=with_circuit_breaker) - @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None)) - def test_obeys_circult_breaker( - self, mock_get_seer_similar_issues: MagicMock, mock_with_circuit_breaker: MagicMock, _ - ): - with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=True): - save_new_event({"message": "Dogs are great!"}, self.project) - assert mock_with_circuit_breaker.call_count == 1 - assert mock_get_seer_similar_issues.call_count == 1 - - with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=False): - save_new_event({"message": "Adopt don't shop"}, self.project) - - assert mock_with_circuit_breaker.call_count == 2 # increased - assert mock_get_seer_similar_issues.call_count == 1 # didn't increase - @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True) @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None)) def test_calls_seer_if_no_group_found(self, mock_get_seer_similar_issues: MagicMock, _): diff --git a/tests/sentry/grouping/ingest/test_seer.py b/tests/sentry/grouping/ingest/test_seer.py index 9554fa2dc2832f..470fea3b596294 100644 --- a/tests/sentry/grouping/ingest/test_seer.py +++ b/tests/sentry/grouping/ingest/test_seer.py @@ -121,6 +121,18 @@ def test_obeys_project_ratelimit(self): is expected_result ) + @with_feature("projects:similarity-embeddings-grouping") + def test_obeys_circuit_breaker(self): + for request_allowed, expected_result in [(True, True), (False, False)]: + with patch( + "sentry.grouping.ingest.seer.CircuitBreaker.should_allow_request", + return_value=request_allowed, + ): + assert ( + should_call_seer_for_grouping(self.event, self.primary_hashes) + is expected_result + ) + @with_feature("projects:similarity-embeddings-grouping") def test_obeys_customized_fingerprint_check(self): default_fingerprint_event = Event( diff --git a/tests/sentry/seer/similarity/test_similar_issues.py b/tests/sentry/seer/similarity/test_similar_issues.py index 5efd172f635fde..694f13e0aaa784 100644 --- a/tests/sentry/seer/similarity/test_similar_issues.py +++ b/tests/sentry/seer/similarity/test_similar_issues.py @@ -96,9 +96,15 @@ def test_no_groups_found(self, mock_seer_request: MagicMock, mock_metrics_incr: tags={"response_status": 200, "outcome": "no_similar_groups"}, ) + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") - def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr: MagicMock): + def test_bad_response_data( + self, + mock_seer_request: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, + ): cases: list[tuple[Any, str]] = [ (None, "AttributeError"), ([], "AttributeError"), @@ -139,14 +145,20 @@ def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"response_status": 200, "outcome": "error", "error": expected_error}, ) + assert mock_record_circuit_breaker_error.call_count == 0 mock_metrics_incr.reset_mock() + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_redirect( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): mock_seer_request.return_value = HTTPResponse( status=308, headers={"location": "/new/and/improved/endpoint/"} @@ -161,12 +173,18 @@ def test_redirect( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"response_status": 308, "outcome": "error", "error": "Redirect"}, ) + assert mock_record_circuit_breaker_error.call_count == 0 + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_request_error( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): for request_error, expected_error_tag in [ (TimeoutError, "TimeoutError"), @@ -192,25 +210,42 @@ def test_request_error( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"outcome": "error", "error": expected_error_tag}, ) + assert mock_record_circuit_breaker_error.call_count == 1 + + mock_logger.warning.reset_mock() + mock_metrics_incr.reset_mock() + mock_record_circuit_breaker_error.reset_mock() + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_error_status( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): - mock_seer_request.return_value = HTTPResponse("No soup for you", status=403) + for response, status, counts_for_circuit_breaker in [ + ("No soup for you", 403, False), + ("No soup, period", 500, True), + ]: + mock_seer_request.return_value = HTTPResponse(response, status=status) - assert get_similarity_data_from_seer(self.request_params) == [] - mock_logger.error.assert_called_with( - f"Received 403 when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.", - extra={"response_data": "No soup for you"}, - ) - mock_metrics_incr.assert_any_call( - "seer.similar_issues_request", - sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, - tags={"response_status": 403, "outcome": "error", "error": "RequestError"}, - ) + assert get_similarity_data_from_seer(self.request_params) == [] + mock_logger.error.assert_called_with( + f"Received {status} when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.", + extra={"response_data": response}, + ) + mock_metrics_incr.assert_any_call( + "seer.similar_issues_request", + sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, + tags={"response_status": status, "outcome": "error", "error": "RequestError"}, + ) + assert mock_record_circuit_breaker_error.call_count == ( + 1 if counts_for_circuit_breaker else 0 + ) @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_returns_sorted_results(self, mock_seer_request: MagicMock):