From 100daf474e07cc900b4ce7621d1cc34615db9b2c Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 6 Jul 2018 10:05:52 +0200 Subject: [PATCH] Add exponential backoff when status returns 503 (#1851) --- php_fpm/datadog_checks/php_fpm/php_fpm.py | 33 ++++++++---- php_fpm/tests/conftest.py | 24 +++++++++ php_fpm/tests/test_php_fpm.py | 64 +++++++++++++++++------ 3 files changed, 94 insertions(+), 27 deletions(-) diff --git a/php_fpm/datadog_checks/php_fpm/php_fpm.py b/php_fpm/datadog_checks/php_fpm/php_fpm.py index 175e75246da89..c0aa52b8061ae 100644 --- a/php_fpm/datadog_checks/php_fpm/php_fpm.py +++ b/php_fpm/datadog_checks/php_fpm/php_fpm.py @@ -1,6 +1,9 @@ # (C) Datadog, Inc. 2018 # All rights reserved # Licensed under Simplified BSD License (see LICENSE) +import random +import time + import requests from datadog_checks.checks import AgentCheck @@ -73,17 +76,27 @@ def _process_status(self, status_url, auth, tags, http_host, timeout, disable_ss try: # TODO: adding the 'full' parameter gets you per-process detailed # informations, which could be nice to parse and output as metrics - resp = requests.get(status_url, - auth=auth, - timeout=timeout, - headers=headers(self.agentConfig, http_host=http_host), - verify=not disable_ssl_validation, - params={'json': True}) - resp.raise_for_status() - - data = resp.json() + max_attempts = 3 + for i in range(max_attempts): + resp = requests.get(status_url, auth=auth, timeout=timeout, + headers=headers(self.agentConfig, http_host=http_host), + verify=not disable_ssl_validation, params={'json': True}) + + # Exponential backoff, wait at most (max_attempts - 1) times in case we get a 503. + # Delay in seconds is (2^i + random amount of seconds between 0 and 1) + # 503s originated here: https://github.com/php/php-src/blob/d84ef96/sapi/fpm/fpm/fpm_status.c#L96 + if resp.status_code == 503 and i < max_attempts - 1: + # retry + time.sleep(2**i + random.random()) + continue + + resp.raise_for_status() + data = resp.json() + + # successfully got a response, exit the backoff system + break except Exception as e: - self.log.error("Failed to get metrics from {0}.\nError {1}".format(status_url, e)) + self.log.error("Failed to get metrics from {}: {}".format(status_url, e)) raise pool_name = data.get('pool', 'default') diff --git a/php_fpm/tests/conftest.py b/php_fpm/tests/conftest.py index 73d759f2c5f58..36cec315469ab 100644 --- a/php_fpm/tests/conftest.py +++ b/php_fpm/tests/conftest.py @@ -6,6 +6,7 @@ import subprocess import time import sys +import json import requests from datadog_checks.php_fpm import PHPFPMCheck @@ -66,3 +67,26 @@ def php_fpm_instance(): yield subprocess.check_call(args + ["down"], env=env) + + +@pytest.fixture +def payload(): + """ + example payload from /status?json + """ + return json.loads("""{ + "pool":"www", + "process manager":"dynamic", + "start time":1530722898, + "start since":12, + "accepted conn":2, + "listen queue":0, + "max listen queue":0, + "listen queue len":128, + "idle processes":1, + "active processes":1, + "total processes":2, + "max active processes":1, + "max children reached":0, + "slow requests":0 + }""") diff --git a/php_fpm/tests/test_php_fpm.py b/php_fpm/tests/test_php_fpm.py index f565d5824c730..1ffc4eca5f4c4 100644 --- a/php_fpm/tests/test_php_fpm.py +++ b/php_fpm/tests/test_php_fpm.py @@ -2,26 +2,13 @@ # All rights reserved # Licensed under Simplified BSD License (see LICENSE) import pytest +import mock from datadog_checks.php_fpm.php_fpm import BadConfigError -# sample from /status?json -# { -# "pool":"www", -# "process manager":"dynamic", -# "start time":1530722898, -# "start since":12, -# "accepted conn":2, -# "listen queue":0, -# "max listen queue":0, -# "listen queue len":128, -# "idle processes":1, -# "active processes":1, -# "total processes":2, -# "max active processes":1, -# "max children reached":0, -# "slow requests":0 -# } + +class FooException(Exception): + pass def test_bad_config(check): @@ -78,3 +65,46 @@ def test_status(check, instance, aggregator, ping_url_tag, php_fpm_instance): expected_tags = [ping_url_tag, 'cluster:forums'] aggregator.assert_service_check('php_fpm.can_ping', status=check.OK, tags=expected_tags) + + +def test_should_not_retry(check, instance): + """ + backoff only works when response code is 503, otherwise the error + should bubble up + """ + with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r: + r.get.side_effect = FooException("Generic http error here") + with pytest.raises(FooException): + check._process_status(instance['status_url'], None, [], None, 10, True) + + +def test_should_bail_out(check, instance): + """ + backoff should give up after 3 attempts + """ + with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r: + attrs = {'raise_for_status.side_effect': FooException()} + r.get.side_effect = [ + mock.MagicMock(status_code=503, **attrs), + mock.MagicMock(status_code=503, **attrs), + mock.MagicMock(status_code=503, **attrs), + mock.MagicMock(status_code=200), + ] + with pytest.raises(FooException): + check._process_status(instance['status_url'], None, [], None, 10, True) + + +def test_backoff_success(check, instance, aggregator, payload): + """ + Success after 2 failed attempts + """ + instance['ping_url'] = None + with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r: + attrs = {'json.return_value': payload} + r.get.side_effect = [ + mock.MagicMock(status_code=503), + mock.MagicMock(status_code=503), + mock.MagicMock(status_code=200, **attrs), + ] + pool_name = check._process_status(instance['status_url'], None, [], None, 10, True) + assert pool_name == 'www'