Skip to content

Commit

Permalink
Add retries to test & emit warnings when connections fail (#12528)
Browse files Browse the repository at this point in the history
* Emit warning service check when connection is lost, add reconnection test
* Add service check for flakey connection
* Assert service check
  • Loading branch information
hithwen committed Jul 20, 2022
1 parent 296ef7e commit f3db5a5
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 0 deletions.
15 changes: 15 additions & 0 deletions sap_hana/datadog_checks/sap_hana/sap_hana.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def __init__(self, name, init_config, instances):

# Whether or not the connection was lost
self._connection_lost = False
self._connection_flaked = False

# Whether or not to persist database connection. Default is True
self._persist_db_connections = self.instance.get(
Expand Down Expand Up @@ -112,6 +113,9 @@ def check(self, _):
continue
finally:
if self._connection_lost:
self.service_check(
self.SERVICE_CHECK_CONNECT, self.WARNING, message="Lost connection to HANA server", tags=self._tags
)
try:
self._conn.close()
except OperationalError:
Expand All @@ -125,6 +129,14 @@ def check(self, _):
except OperationalError:
self.log.error("Could not close connection.")
self._conn = None
if self._connection_flaked:
self.service_check(
self.SERVICE_CHECK_CONNECT,
self.WARNING,
message="Session has been reconnected after an error",
tags=self._tags,
)
self._connection_flaked = False

def set_default_methods(self):
self._default_methods.extend(
Expand Down Expand Up @@ -595,6 +607,9 @@ def execute_query(self, cursor, query, source):
error = str(e)
if 'Lost connection to HANA server' in error:
self._connection_lost = True
if 'Session has been reconnected' in error:
# No need to attempt a reconnect in this case but some metrics will be missing
self._connection_flaked = True

raise QueryExecutionError(error, source)

Expand Down
11 changes: 11 additions & 0 deletions sap_hana/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os

from datadog_checks.dev import get_docker_hostname, get_here
from datadog_checks.sap_hana import SapHanaCheck

HERE = get_here()
COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml')
Expand All @@ -14,3 +15,13 @@
ADMIN_CONFIG = {'server': SERVER, 'port': PORT, 'username': 'system', 'password': 'Admin1337'}

E2E_METADATA = {'start_commands': ['pip install hdbcli==2.10.15']}

CAN_CONNECT_SERVICE_CHECK = 'sap_hana.{}'.format(SapHanaCheck.SERVICE_CHECK_CONNECT)


def connection_flaked(aggregator):
# HANA connection some times flakes, in that case the check will reconnect on next run
# And a warning service check will be emitted
service_checks = aggregator.service_checks(CAN_CONNECT_SERVICE_CHECK)
all_ok = all([service_check.status == SapHanaCheck.OK for service_check in service_checks])
return not all_ok
8 changes: 8 additions & 0 deletions sap_hana/tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,23 @@
# Licensed under a 3-clause BSD style license (see LICENSE)
import pytest

from datadog_checks.sap_hana import SapHanaCheck

from . import metrics
from .common import CAN_CONNECT_SERVICE_CHECK, connection_flaked

pytestmark = pytest.mark.e2e


@pytest.mark.e2e
def test_check(dd_agent_check, instance):
attempts = 3
aggregator = dd_agent_check(instance, rate=True)
while attempts and connection_flaked(aggregator):
aggregator = dd_agent_check(instance, rate=True)
attempts -= 1

aggregator.assert_service_check(CAN_CONNECT_SERVICE_CHECK, SapHanaCheck.OK)
for metric in metrics.STANDARD:
aggregator.assert_metric_has_tag(metric, 'server:{}'.format(instance['server']))
aggregator.assert_metric_has_tag(metric, 'port:{}'.format(instance['port']))
Expand Down
8 changes: 8 additions & 0 deletions sap_hana/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@
from datadog_checks.sap_hana import SapHanaCheck

from . import metrics
from .common import CAN_CONNECT_SERVICE_CHECK, connection_flaked

pytestmark = pytest.mark.integration


@pytest.mark.usefixtures('dd_environment')
def test_check(aggregator, instance, dd_run_check):
check = SapHanaCheck('sap_hana', {}, [instance])

attempts = 3
dd_run_check(check)
while attempts and connection_flaked(aggregator):
aggregator.reset()
dd_run_check(check)
attempts -= 1

aggregator.assert_service_check(CAN_CONNECT_SERVICE_CHECK, SapHanaCheck.OK)
for metric in metrics.STANDARD:
aggregator.assert_metric_has_tag(metric, 'server:{}'.format(instance['server']))
aggregator.assert_metric_has_tag(metric, 'port:{}'.format(instance['port']))
Expand Down
54 changes: 54 additions & 0 deletions sap_hana/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,60 @@ def test_error_query(instance, dd_run_check):
check.log.error.assert_any_call('Error querying %s: %s', 'SYS.M_DATABASE', 'test')


def test_reconnect_on_connection_failure(instance, dd_run_check, aggregator):
def connection_error(*args, **kwargs):
raise Exception('Lost connection to HANA server')

check = SapHanaCheck('sap_hana', {}, [instance])
check.log = mock.MagicMock()

conn = mock.MagicMock()
cursor = mock.MagicMock()
cursor.execute = connection_error
conn.cursor = lambda: cursor
check._conn = conn

dd_run_check(check)

aggregator.assert_no_duplicate_service_checks()
aggregator.assert_service_check("sap_hana.{}".format(SapHanaCheck.SERVICE_CHECK_CONNECT), SapHanaCheck.WARNING)
conn.close.assert_called()
check.log.error.assert_any_call('Error querying %s: %s', 'SYS.M_DATABASE', 'Lost connection to HANA server')

# Assert than a connection is reattempted
check.get_connection = mock.MagicMock()
check.get_connection.return_value = conn
dd_run_check(check)
check.get_connection.assert_called()


def test_emits_critical_service_check_when_connection_flakes(instance, dd_run_check, aggregator):
def connection_flakes(*args, **kwargs):
raise Exception('Session has been reconnected after an error')

check = SapHanaCheck('sap_hana', {}, [instance])
check.log = mock.MagicMock()

conn = mock.MagicMock()
cursor = mock.MagicMock()
cursor.execute = connection_flakes
conn.cursor = lambda: cursor
check._conn = conn

dd_run_check(check)

aggregator.assert_no_duplicate_service_checks()
aggregator.assert_service_check("sap_hana.{}".format(SapHanaCheck.SERVICE_CHECK_CONNECT), SapHanaCheck.WARNING)
check.log.error.assert_any_call(
'Error querying %s: %s', 'SYS.M_DATABASE', 'Session has been reconnected after an error'
)

# Assert than a connection is not reattempted
check.get_connection = mock.MagicMock()
dd_run_check(check)
check.get_connection.assert_not_called()


def test_error_unknown(instance, dd_run_check):
def query_master_database():
error()
Expand Down

0 comments on commit f3db5a5

Please sign in to comment.