Skip to content

Commit

Permalink
Add exponential backoff when status returns 503 (#1851)
Browse files Browse the repository at this point in the history
  • Loading branch information
masci committed Jul 6, 2018
1 parent cb0824f commit 100daf4
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 27 deletions.
33 changes: 23 additions & 10 deletions php_fpm/datadog_checks/php_fpm/php_fpm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
import random
import time

import requests

from datadog_checks.checks import AgentCheck
Expand Down Expand Up @@ -73,17 +76,27 @@ def _process_status(self, status_url, auth, tags, http_host, timeout, disable_ss
try:
# TODO: adding the 'full' parameter gets you per-process detailed
# informations, which could be nice to parse and output as metrics
resp = requests.get(status_url,
auth=auth,
timeout=timeout,
headers=headers(self.agentConfig, http_host=http_host),
verify=not disable_ssl_validation,
params={'json': True})
resp.raise_for_status()

data = resp.json()
max_attempts = 3
for i in range(max_attempts):
resp = requests.get(status_url, auth=auth, timeout=timeout,
headers=headers(self.agentConfig, http_host=http_host),
verify=not disable_ssl_validation, params={'json': True})

# Exponential backoff, wait at most (max_attempts - 1) times in case we get a 503.
# Delay in seconds is (2^i + random amount of seconds between 0 and 1)
# 503s originated here: https://github.com/php/php-src/blob/d84ef96/sapi/fpm/fpm/fpm_status.c#L96
if resp.status_code == 503 and i < max_attempts - 1:
# retry
time.sleep(2**i + random.random())
continue

resp.raise_for_status()
data = resp.json()

# successfully got a response, exit the backoff system
break
except Exception as e:
self.log.error("Failed to get metrics from {0}.\nError {1}".format(status_url, e))
self.log.error("Failed to get metrics from {}: {}".format(status_url, e))
raise

pool_name = data.get('pool', 'default')
Expand Down
24 changes: 24 additions & 0 deletions php_fpm/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import time
import sys
import json

import requests
from datadog_checks.php_fpm import PHPFPMCheck
Expand Down Expand Up @@ -66,3 +67,26 @@ def php_fpm_instance():
yield

subprocess.check_call(args + ["down"], env=env)


@pytest.fixture
def payload():
"""
example payload from /status?json
"""
return json.loads("""{
"pool":"www",
"process manager":"dynamic",
"start time":1530722898,
"start since":12,
"accepted conn":2,
"listen queue":0,
"max listen queue":0,
"listen queue len":128,
"idle processes":1,
"active processes":1,
"total processes":2,
"max active processes":1,
"max children reached":0,
"slow requests":0
}""")
64 changes: 47 additions & 17 deletions php_fpm/tests/test_php_fpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,13 @@
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
import pytest
import mock

from datadog_checks.php_fpm.php_fpm import BadConfigError

# sample from /status?json
# {
# "pool":"www",
# "process manager":"dynamic",
# "start time":1530722898,
# "start since":12,
# "accepted conn":2,
# "listen queue":0,
# "max listen queue":0,
# "listen queue len":128,
# "idle processes":1,
# "active processes":1,
# "total processes":2,
# "max active processes":1,
# "max children reached":0,
# "slow requests":0
# }

class FooException(Exception):
pass


def test_bad_config(check):
Expand Down Expand Up @@ -78,3 +65,46 @@ def test_status(check, instance, aggregator, ping_url_tag, php_fpm_instance):

expected_tags = [ping_url_tag, 'cluster:forums']
aggregator.assert_service_check('php_fpm.can_ping', status=check.OK, tags=expected_tags)


def test_should_not_retry(check, instance):
"""
backoff only works when response code is 503, otherwise the error
should bubble up
"""
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
r.get.side_effect = FooException("Generic http error here")
with pytest.raises(FooException):
check._process_status(instance['status_url'], None, [], None, 10, True)


def test_should_bail_out(check, instance):
"""
backoff should give up after 3 attempts
"""
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
attrs = {'raise_for_status.side_effect': FooException()}
r.get.side_effect = [
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=200),
]
with pytest.raises(FooException):
check._process_status(instance['status_url'], None, [], None, 10, True)


def test_backoff_success(check, instance, aggregator, payload):
"""
Success after 2 failed attempts
"""
instance['ping_url'] = None
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
attrs = {'json.return_value': payload}
r.get.side_effect = [
mock.MagicMock(status_code=503),
mock.MagicMock(status_code=503),
mock.MagicMock(status_code=200, **attrs),
]
pool_name = check._process_status(instance['status_url'], None, [], None, 10, True)
assert pool_name == 'www'

0 comments on commit 100daf4

Please sign in to comment.