diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py index fe139502dee4af..4d34481033cec0 100644 --- a/src/sentry/conf/server.py +++ b/src/sentry/conf/server.py @@ -3435,6 +3435,7 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]: SEER_HASH_GROUPING_RECORDS_DELETE_URL = ( f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete-by-hash" ) +SEER_SIMILARITY_CIRCUIT_BREAKER_KEY = "seer.similarity" SIMILARITY_BACKFILL_COHORT_MAP: dict[str, list[int]] = {} diff --git a/src/sentry/event_manager.py b/src/sentry/event_manager.py index 95728d49b980ce..cd4db48a36ec81 100644 --- a/src/sentry/event_manager.py +++ b/src/sentry/event_manager.py @@ -129,9 +129,7 @@ from sentry.utils.circuit_breaker import ( ERROR_COUNT_CACHE_KEY, CircuitBreakerPassthrough, - CircuitBreakerTripped, circuit_breaker_activated, - with_circuit_breaker, ) from sentry.utils.dates import to_datetime from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled @@ -1538,13 +1536,18 @@ def _save_aggregate( seer_matched_group = None if should_call_seer_for_grouping(event, primary_hashes): + metrics.incr( + "grouping.similarity.did_call_seer", + # TODO: Consider lowering this (in all the spots this metric is + # collected) once we roll Seer grouping out more widely + sample_rate=1.0, + tags={"call_made": True, "blocker": "none"}, + ) try: # If the `projects:similarity-embeddings-grouping` feature is disabled, # we'll still get back result metadata, but `seer_matched_group` will be None - seer_response_data, seer_matched_group = with_circuit_breaker( - "event_manager.get_seer_similar_issues", - lambda: get_seer_similar_issues(event, primary_hashes), - options.get("seer.similarity.circuit-breaker-config"), + seer_response_data, seer_matched_group = get_seer_similar_issues( + event, primary_hashes ) event.data["seer_similarity"] = seer_response_data @@ -1555,33 +1558,8 @@ def _save_aggregate( "seer_similarity" ] = seer_response_data - metrics.incr( - "grouping.similarity.did_call_seer", - # TODO: Consider lowering this (in all the spots this metric is - # collected) once we roll Seer grouping out more widely - sample_rate=1.0, - tags={"call_made": True, "blocker": "none"}, - ) - - except CircuitBreakerTripped: - # TODO: Do we want to include all of the conditions which cause us to log a - # `grouping.similarity.seer_call_blocked` metric (here and in - # `should_call_seer_for_grouping`) under a single outcome tag on the span - # and timer metric below and in `record_calculation_metric_with_result` - # (also below)? Right now they just fall into the `new_group` bucket. - metrics.incr( - "grouping.similarity.did_call_seer", - sample_rate=1.0, - tags={"call_made": False, "blocker": "circuit-breaker"}, - ) - # Insurance - in theory we shouldn't ever land here except Exception as e: - metrics.incr( - "grouping.similarity.did_call_seer", - sample_rate=1.0, - tags={"call_made": True, "blocker": "none"}, - ) sentry_sdk.capture_exception( e, tags={"event": event.event_id, "project": project.id} ) diff --git a/src/sentry/grouping/ingest/seer.py b/src/sentry/grouping/ingest/seer.py index f8de83a22c0b73..f01145519dae0c 100644 --- a/src/sentry/grouping/ingest/seer.py +++ b/src/sentry/grouping/ingest/seer.py @@ -1,6 +1,8 @@ import logging from dataclasses import asdict +from django.conf import settings + from sentry import features, options from sentry import ratelimits as ratelimiter from sentry.eventstore.models import Event @@ -17,6 +19,7 @@ killswitch_enabled, ) from sentry.utils import metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker from sentry.utils.safe import get_path logger = logging.getLogger("sentry.events.grouping") @@ -45,12 +48,11 @@ def should_call_seer_for_grouping(event: Event, primary_hashes: CalculatedHashes # (Checking the rate limit for calling Seer also increments the counter of how many times we've # tried to call it, and if we fail any of the other checks, it shouldn't count as an attempt. # Thus we only want to run the rate limit check if every other check has already succeeded.) - # - # Note: The circuit breaker check which might naturally be here alongside its killswitch - # and rate limiting friends instead happens in the `with_circuit_breaker` helper used where - # `get_seer_similar_issues` is actually called. (It has to be there in order for it to track - # errors arising from that call.) - if killswitch_enabled(project.id, event) or _ratelimiting_enabled(event, project): + if ( + killswitch_enabled(project.id, event) + or _circuit_breaker_broken(event, project) + or _ratelimiting_enabled(event, project) + ): return False return True @@ -155,6 +157,32 @@ def _ratelimiting_enabled(event: Event, project: Project) -> bool: return False +def _circuit_breaker_broken(event: Event, project: Project) -> bool: + breaker_config = options.get("seer.similarity.circuit-breaker-config") + circuit_breaker = CircuitBreaker(settings.SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, breaker_config) + circuit_broken = not circuit_breaker.should_allow_request() + + if circuit_broken: + logger.warning( + "should_call_seer_for_grouping.broken_circuit_breaker", + extra={ + "event_id": event.event_id, + "project_id": project.id, + **breaker_config, + }, + ) + metrics.incr( + "grouping.similarity.broken_circuit_breaker", + ) + metrics.incr( + "grouping.similarity.did_call_seer", + sample_rate=1.0, + tags={"call_made": False, "blocker": "circuit-breaker"}, + ) + + return circuit_broken + + def get_seer_similar_issues( event: Event, primary_hashes: CalculatedHashes, diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 4426ae5b7d5014..65f1361143159f 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -892,12 +892,21 @@ flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE, ) +# TODO: The default error limit here was estimated based on EA traffic. (In an average 10 min +# period, there are roughly 35K events without matching hashes. About 2% of orgs are EA, so for +# simplicity, assume 2% of those events are from EA orgs. If we're willing to tolerate up to a 95% +# failure rate, then we need 35K * 0.02 * 0.95 events to fail to trip the breaker.) +# +# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current +# calculation), and remove this TODO. register( "seer.similarity.circuit-breaker-config", type=Dict, - # TODO: For now we're using the defaults for everything but `allow_passthrough`. We may want to - # revisit that choice in the future. - default={"allow_passthrough": True}, + default={ + "error_limit": 666, + "error_limit_window": 600, # 10 min + "broken_state_duration": 300, # 5 min + }, flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE, ) diff --git a/src/sentry/seer/similarity/similar_issues.py b/src/sentry/seer/similarity/similar_issues.py index f7b6fbd02df1d0..aa44b16bd3d6f4 100644 --- a/src/sentry/seer/similarity/similar_issues.py +++ b/src/sentry/seer/similarity/similar_issues.py @@ -3,7 +3,12 @@ from django.conf import settings from urllib3.exceptions import MaxRetryError, TimeoutError -from sentry.conf.server import SEER_MAX_GROUPING_DISTANCE, SEER_SIMILAR_ISSUES_URL +from sentry import options +from sentry.conf.server import ( + SEER_MAX_GROUPING_DISTANCE, + SEER_SIMILAR_ISSUES_URL, + SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, +) from sentry.models.grouphash import GroupHash from sentry.net.http import connection_from_url from sentry.seer.signed_seer_api import make_signed_seer_api_request @@ -15,6 +20,7 @@ ) from sentry.tasks.delete_seer_grouping_records import delete_seer_grouping_records_by_hash from sentry.utils import json, metrics +from sentry.utils.circuit_breaker2 import CircuitBreaker from sentry.utils.json import JSONDecodeError, apply_key_filter logger = logging.getLogger(__name__) @@ -95,6 +101,11 @@ def get_similarity_data_from_seer( ) return [] + circuit_breaker = CircuitBreaker( + SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, + options.get("seer.similarity.circuit-breaker-config"), + ) + try: response = make_signed_seer_api_request( seer_grouping_connection_pool, @@ -111,6 +122,7 @@ def get_similarity_data_from_seer( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={**metric_tags, "outcome": "error", "error": type(e).__name__}, ) + circuit_breaker.record_error() return [] metric_tags["response_status"] = response.status @@ -137,6 +149,9 @@ def get_similarity_data_from_seer( }, ) + if response.status >= 500: + circuit_breaker.record_error() + return [] try: diff --git a/tests/sentry/event_manager/grouping/test_seer_grouping.py b/tests/sentry/event_manager/grouping/test_seer_grouping.py index 8671da11790e1b..3180bbcd768ba5 100644 --- a/tests/sentry/event_manager/grouping/test_seer_grouping.py +++ b/tests/sentry/event_manager/grouping/test_seer_grouping.py @@ -10,7 +10,6 @@ from sentry.testutils.helpers.eventprocessing import save_new_event from sentry.testutils.helpers.features import with_feature from sentry.testutils.pytest.mocking import capture_results -from sentry.utils.circuit_breaker import with_circuit_breaker from sentry.utils.types import NonNone @@ -85,23 +84,6 @@ def test_obeys_seer_similarity_flags(self): assert get_seer_similar_issues_return_values[0][1] == existing_event.group assert new_event.group_id == existing_event.group_id - @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True) - @patch("sentry.event_manager.with_circuit_breaker", wraps=with_circuit_breaker) - @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None)) - def test_obeys_circult_breaker( - self, mock_get_seer_similar_issues: MagicMock, mock_with_circuit_breaker: MagicMock, _ - ): - with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=True): - save_new_event({"message": "Dogs are great!"}, self.project) - assert mock_with_circuit_breaker.call_count == 1 - assert mock_get_seer_similar_issues.call_count == 1 - - with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=False): - save_new_event({"message": "Adopt don't shop"}, self.project) - - assert mock_with_circuit_breaker.call_count == 2 # increased - assert mock_get_seer_similar_issues.call_count == 1 # didn't increase - @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True) @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None)) def test_calls_seer_if_no_group_found(self, mock_get_seer_similar_issues: MagicMock, _): diff --git a/tests/sentry/grouping/ingest/test_seer.py b/tests/sentry/grouping/ingest/test_seer.py index 9554fa2dc2832f..470fea3b596294 100644 --- a/tests/sentry/grouping/ingest/test_seer.py +++ b/tests/sentry/grouping/ingest/test_seer.py @@ -121,6 +121,18 @@ def test_obeys_project_ratelimit(self): is expected_result ) + @with_feature("projects:similarity-embeddings-grouping") + def test_obeys_circuit_breaker(self): + for request_allowed, expected_result in [(True, True), (False, False)]: + with patch( + "sentry.grouping.ingest.seer.CircuitBreaker.should_allow_request", + return_value=request_allowed, + ): + assert ( + should_call_seer_for_grouping(self.event, self.primary_hashes) + is expected_result + ) + @with_feature("projects:similarity-embeddings-grouping") def test_obeys_customized_fingerprint_check(self): default_fingerprint_event = Event( diff --git a/tests/sentry/seer/similarity/test_similar_issues.py b/tests/sentry/seer/similarity/test_similar_issues.py index 5efd172f635fde..694f13e0aaa784 100644 --- a/tests/sentry/seer/similarity/test_similar_issues.py +++ b/tests/sentry/seer/similarity/test_similar_issues.py @@ -96,9 +96,15 @@ def test_no_groups_found(self, mock_seer_request: MagicMock, mock_metrics_incr: tags={"response_status": 200, "outcome": "no_similar_groups"}, ) + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") - def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr: MagicMock): + def test_bad_response_data( + self, + mock_seer_request: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, + ): cases: list[tuple[Any, str]] = [ (None, "AttributeError"), ([], "AttributeError"), @@ -139,14 +145,20 @@ def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"response_status": 200, "outcome": "error", "error": expected_error}, ) + assert mock_record_circuit_breaker_error.call_count == 0 mock_metrics_incr.reset_mock() + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_redirect( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): mock_seer_request.return_value = HTTPResponse( status=308, headers={"location": "/new/and/improved/endpoint/"} @@ -161,12 +173,18 @@ def test_redirect( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"response_status": 308, "outcome": "error", "error": "Redirect"}, ) + assert mock_record_circuit_breaker_error.call_count == 0 + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_request_error( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): for request_error, expected_error_tag in [ (TimeoutError, "TimeoutError"), @@ -192,25 +210,42 @@ def test_request_error( sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, tags={"outcome": "error", "error": expected_error_tag}, ) + assert mock_record_circuit_breaker_error.call_count == 1 + + mock_logger.warning.reset_mock() + mock_metrics_incr.reset_mock() + mock_record_circuit_breaker_error.reset_mock() + @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error") @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.logger") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_error_status( - self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock + self, + mock_seer_request: MagicMock, + mock_logger: MagicMock, + mock_metrics_incr: MagicMock, + mock_record_circuit_breaker_error: MagicMock, ): - mock_seer_request.return_value = HTTPResponse("No soup for you", status=403) + for response, status, counts_for_circuit_breaker in [ + ("No soup for you", 403, False), + ("No soup, period", 500, True), + ]: + mock_seer_request.return_value = HTTPResponse(response, status=status) - assert get_similarity_data_from_seer(self.request_params) == [] - mock_logger.error.assert_called_with( - f"Received 403 when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.", - extra={"response_data": "No soup for you"}, - ) - mock_metrics_incr.assert_any_call( - "seer.similar_issues_request", - sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, - tags={"response_status": 403, "outcome": "error", "error": "RequestError"}, - ) + assert get_similarity_data_from_seer(self.request_params) == [] + mock_logger.error.assert_called_with( + f"Received {status} when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.", + extra={"response_data": response}, + ) + mock_metrics_incr.assert_any_call( + "seer.similar_issues_request", + sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE, + tags={"response_status": status, "outcome": "error", "error": "RequestError"}, + ) + assert mock_record_circuit_breaker_error.call_count == ( + 1 if counts_for_circuit_breaker else 0 + ) @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") def test_returns_sorted_results(self, mock_seer_request: MagicMock):