Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add exponential backoff when status returns 503 #1851

Merged
merged 2 commits into from
Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions php_fpm/datadog_checks/php_fpm/php_fpm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
import random
import time

import requests

from datadog_checks.checks import AgentCheck
Expand Down Expand Up @@ -73,17 +76,26 @@ def _process_status(self, status_url, auth, tags, http_host, timeout, disable_ss
try:
# TODO: adding the 'full' parameter gets you per-process detailed
# informations, which could be nice to parse and output as metrics
resp = requests.get(status_url,
auth=auth,
timeout=timeout,
headers=headers(self.agentConfig, http_host=http_host),
verify=not disable_ssl_validation,
params={'json': True})
resp.raise_for_status()

data = resp.json()
for i in range(3):
resp = requests.get(status_url, auth=auth, timeout=timeout,
headers=headers(self.agentConfig, http_host=http_host),
verify=not disable_ssl_validation, params={'json': True})

# Exponential backoff in case we get a 503 for at most 3 times.
# Delay in seconds is (attempt + random amount of seconds between 0 and 1)
# 503s originated here: https://github.com/php/php-src/blob/d84ef96/sapi/fpm/fpm/fpm_status.c#L96
if resp.status_code == 503 and i < 2:
# retry
time.sleep(i + 1 + random.random())
continue

resp.raise_for_status()
data = resp.json()

# successfully got a response, exit the backoff system
break
except Exception as e:
self.log.error("Failed to get metrics from {0}.\nError {1}".format(status_url, e))
self.log.error("Failed to get metrics from {}: {}".format(status_url, e))
raise

pool_name = data.get('pool', 'default')
Expand Down
24 changes: 24 additions & 0 deletions php_fpm/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import time
import sys
import json

import requests
from datadog_checks.php_fpm import PHPFPMCheck
Expand Down Expand Up @@ -66,3 +67,26 @@ def php_fpm_instance():
yield

subprocess.check_call(args + ["down"], env=env)


@pytest.fixture
def payload():
"""
example payload from /status?json
"""
return json.loads("""{
"pool":"www",
"process manager":"dynamic",
"start time":1530722898,
"start since":12,
"accepted conn":2,
"listen queue":0,
"max listen queue":0,
"listen queue len":128,
"idle processes":1,
"active processes":1,
"total processes":2,
"max active processes":1,
"max children reached":0,
"slow requests":0
}""")
64 changes: 47 additions & 17 deletions php_fpm/tests/test_php_fpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,13 @@
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
import pytest
import mock

from datadog_checks.php_fpm.php_fpm import BadConfigError

# sample from /status?json
# {
# "pool":"www",
# "process manager":"dynamic",
# "start time":1530722898,
# "start since":12,
# "accepted conn":2,
# "listen queue":0,
# "max listen queue":0,
# "listen queue len":128,
# "idle processes":1,
# "active processes":1,
# "total processes":2,
# "max active processes":1,
# "max children reached":0,
# "slow requests":0
# }

class FooException(Exception):
pass


def test_bad_config(check):
Expand Down Expand Up @@ -78,3 +65,46 @@ def test_status(check, instance, aggregator, ping_url_tag, php_fpm_instance):

expected_tags = [ping_url_tag, 'cluster:forums']
aggregator.assert_service_check('php_fpm.can_ping', status=check.OK, tags=expected_tags)


def test_should_not_retry(check, instance):
"""
backoff only works when response code is 503, otherwise the error
should bubble up
"""
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
r.get.side_effect = FooException("Generic http error here")
with pytest.raises(FooException):
check._process_status(instance['status_url'], None, [], None, 10, True)


def test_should_bail_out(check, instance):
"""
backoff should give up after 3 attempts
"""
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
attrs = {'raise_for_status.side_effect': FooException()}
r.get.side_effect = [
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=503, **attrs),
mock.MagicMock(status_code=200),
]
with pytest.raises(FooException):
check._process_status(instance['status_url'], None, [], None, 10, True)


def test_backoff_success(check, instance, aggregator, payload):
"""
Success after 2 failed attempts
"""
instance['ping_url'] = None
with mock.patch('datadog_checks.php_fpm.php_fpm.requests') as r:
attrs = {'json.return_value': payload}
r.get.side_effect = [
mock.MagicMock(status_code=503),
mock.MagicMock(status_code=503),
mock.MagicMock(status_code=200, **attrs),
]
pool_name = check._process_status(instance['status_url'], None, [], None, 10, True)
assert pool_name == 'www'