Skip to content

Commit

Permalink
Collect liveness and readiness probe metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmed-mez committed Mar 16, 2022
1 parent bba64f0 commit 432ff86
Show file tree
Hide file tree
Showing 12 changed files with 3,566 additions and 62 deletions.
5 changes: 5 additions & 0 deletions kubelet/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ files:
URL of the kubelet metrics prometheus endpoint
Pass an empty string to disable kubelet metrics collection.
example: http://10.8.0.1:10255/metrics
- name: probes_metrics_endpoint
description: |
URL of the probe metrics prometheus endpoint
Pass an empty string to disable probe metrics collection.
example: http://10.8.0.1:10255/metrics/probes
- name: cadvisor_port
description: |
Metric collection for legacy (< 1.7.6) clusters via the kubelet's cadvisor port.
Expand Down
30 changes: 30 additions & 0 deletions kubelet/datadog_checks/kubelet/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ def replace_container_rt_prefix(cid):
return cid


def get_container_label(labels, l_name):
    """
    Return the value of the label named `l_name`, or None when it is absent.

    This is a plain key lookup, not an iteration: `labels` is expected to be
    a dict-like mapping of label name -> label value.
    :param labels: mapping of label names to label values
    :param l_name: str, name of the label to look up
    :return: the label value (str), or None if the label is not present
    """
    # Single lookup; `.get` already yields None for a missing key.
    return labels.get(l_name)


class PodListUtils(object):
"""
Queries the podlist and the agent6's filtering logic to determine whether to
Expand Down Expand Up @@ -208,3 +219,22 @@ def is_namespace_excluded(self, namespace):
excluded = c_is_excluded('', '', namespace)
self.cache_namespace_exclusion[namespace] = excluded
return excluded

def get_cid_by_labels(self, labels):
    """
    Resolve a container id from the labels of a container-scoped metric.

    Should only be called on a container-scoped metric: the namespace, pod
    name and container name are read from the labels and looked up through
    `get_cid_by_name_tuple`.
    :param labels: labels of the metric sample
    :return str or None
    """
    # Prefer the k8s >= 1.16 label names ("pod", "container") and fall back
    # to the k8s < 1.16 ones ("pod_name", "container_name") when they are
    # missing or empty.
    pod = get_container_label(labels, "pod") or get_container_label(labels, "pod_name")
    container = get_container_label(labels, "container") or get_container_label(labels, "container_name")
    name_tuple = (get_container_label(labels, "namespace"), pod, container)
    return self.get_cid_by_name_tuple(name_tuple)
5 changes: 5 additions & 0 deletions kubelet/datadog_checks/kubelet/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ instances:
#
# kubelet_metrics_endpoint: http://10.8.0.1:10255/metrics

## URL of the probe metrics prometheus endpoint
## Pass an empty string to disable probe metrics collection.
#
# probes_metrics_endpoint: http://10.8.0.1:10255/metrics/probes

## Metric collection for legacy (< 1.7.6) clusters via the kubelet's cadvisor port.
## This port is closed by default on k8s 1.7+ and OpenShift, enable it
## via the `--cadvisor-port=4194` kubelet option.
Expand Down
18 changes: 17 additions & 1 deletion kubelet/datadog_checks/kubelet/kubelet.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .cadvisor import CadvisorScraper
from .common import CADVISOR_DEFAULT_PORT, PodListUtils, replace_container_rt_prefix
from .prometheus import CadvisorPrometheusScraperMixin
from .probes import ProbesPrometheusScraperMixin
from .summary import SummaryScraperMixin

KUBELET_HEALTH_PATH = '/healthz'
Expand All @@ -29,6 +30,7 @@
CADVISOR_METRICS_PATH = '/metrics/cadvisor'
KUBELET_METRICS_PATH = '/metrics'
STATS_PATH = '/stats/summary/'
PROBES_METRICS_PATH = '/metrics/probes'

# Suffixes per
# https://github.com/kubernetes/kubernetes/blob/8fd414537b5143ab039cb910590237cabf4af783/pkg/api/resource/suffix.go#L108
Expand Down Expand Up @@ -128,6 +130,7 @@ class KubeletCheck(
OpenMetricsBaseCheck,
CadvisorScraper,
SummaryScraperMixin,
ProbesPrometheusScraperMixin,
KubeletBase,
):
"""
Expand Down Expand Up @@ -173,7 +176,8 @@ def __init__(self, name, init_config, instances):
self.pod_level_metrics = ["{0}.{1}".format(self.NAMESPACE, x) for x in pod_level_metrics]

kubelet_instance = self._create_kubelet_prometheus_instance(inst)
generic_instances = [cadvisor_instance, kubelet_instance]
probes_instance = self._create_probes_prometheus_instance(inst)
generic_instances = [cadvisor_instance, kubelet_instance, probes_instance]
super(KubeletCheck, self).__init__(name, init_config, generic_instances)

self.cadvisor_legacy_port = inst.get('cadvisor_port', CADVISOR_DEFAULT_PORT)
Expand All @@ -189,6 +193,8 @@ def __init__(self, name, init_config, instances):

self.kubelet_scraper_config = self.get_scraper_config(kubelet_instance)

self.probes_scraper_config = self.get_scraper_config(probes_instance)

counter_transformers = {k: self.send_always_counter for k in self.COUNTER_METRICS}

histogram_transformers = {
Expand All @@ -199,6 +205,7 @@ def __init__(self, name, init_config, instances):

self.transformers = {}
for d in [
self.PROBES_METRIC_TRANSFORMERS,
self.CADVISOR_METRIC_TRANSFORMERS,
counter_transformers,
histogram_transformers,
Expand Down Expand Up @@ -320,9 +327,14 @@ def check(self, instance):
'kubelet_metrics_endpoint', urljoin(endpoint, KUBELET_METRICS_PATH)
)

self.probes_scraper_config['prometheus_url'] = instance.get(
'probes_metrics_endpoint', urljoin(endpoint, PROBES_METRICS_PATH)
)

# Kubelet credentials handling
self.kubelet_credentials.configure_scraper(self.cadvisor_scraper_config)
self.kubelet_credentials.configure_scraper(self.kubelet_scraper_config)
self.kubelet_credentials.configure_scraper(self.probes_scraper_config)

# Legacy cadvisor support
try:
Expand Down Expand Up @@ -356,6 +368,10 @@ def check(self, instance):
self.log.debug('processing kubelet metrics')
self.process(self.kubelet_scraper_config, metric_transformers=self.transformers)

if self.probes_scraper_config['prometheus_url']:
self.log.debug('processing probe metrics')
self.process(self.probes_scraper_config, metric_transformers=self.transformers)

self.first_run = False

# Free up memory
Expand Down
94 changes: 94 additions & 0 deletions kubelet/datadog_checks/kubelet/probes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# (C) Datadog, Inc. 2018-present
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
from __future__ import division

from copy import deepcopy

from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck
from datadog_checks.base.utils.tagging import tagger

from .common import get_container_label, replace_container_rt_prefix, tags_for_docker


class ProbesPrometheusScraperMixin(object):
    """
    This class scrapes metrics for the kubelet "/metrics/probes" prometheus endpoint and submits
    them on behalf of a check.

    It is a mixin: `self.NAMESPACE`, `self.log`, `self.count`, `self.pod_list_utils`,
    `self.instance_tags` and `self.SAMPLE_VALUE` are not defined here and are expected to be
    provided by the check class it is combined with.
    """

    # Metric name fragment for each supported value of the `probe_type` label.
    _PROBE_TYPE_SUFFIXES = {
        'Liveness': '.liveness_probe',
        'Readiness': '.readiness_probe',
    }

    # Metric name fragment for each supported value of the `result` label.
    _PROBE_RESULT_SUFFIXES = {
        'successful': '.success.total',
        'failed': '.failure.total',
        'unknown': '.unknown.total',
    }

    def __init__(self, *args, **kwargs):
        super(ProbesPrometheusScraperMixin, self).__init__(*args, **kwargs)

        # Maps the raw prometheus metric name to the method that processes it.
        self.PROBES_METRIC_TRANSFORMERS = {
            'prober_probe_total': self.prober_probe_total,
        }

    def _create_probes_prometheus_instance(self, instance):
        """
        Create a copy of the instance and set default values.
        This is so the base class can create a scraper_config with the proper values.

        :param instance: dict, the user-provided instance configuration (left untouched)
        :return: dict, a deep copy of `instance` with `namespace` and `prometheus_url` set
        """
        probes_instance = deepcopy(instance)
        probes_instance.update(
            {
                'namespace': self.NAMESPACE,
                # We need to specify a prometheus_url so the base class can use it as the key for our config_map,
                # we specify a dummy url that will be replaced in the `check()` function. We suffix it with "probes"
                # so the key is different than the rest of the kubelet scrapers.
                'prometheus_url': instance.get('probes_metrics_endpoint', 'dummy_url/probes'),
            }
        )
        return probes_instance

    def prober_probe_total(self, metric, scraper_config):
        """
        Submit one count per `prober_probe_total` sample, named after its probe type and result.

        The submitted metric name is `<namespace>.<liveness_probe|readiness_probe>.<success|failure|unknown>.total`.
        A sample is skipped (nothing is submitted) when its probe type or result is not supported,
        when its container cannot be resolved from the pod list, when the container is excluded,
        or when no tags are found for the container.

        :param metric: the scraped metric; its `samples` are iterated
        :param scraper_config: dict, only its `namespace` key is read here
        """
        for sample in metric.samples:
            labels = sample[OpenMetricsBaseCheck.SAMPLE_LABELS]

            probe_type = labels.get('probe_type')
            type_suffix = self._PROBE_TYPE_SUFFIXES.get(probe_type)
            if type_suffix is None:
                self.log.debug("Unsupported probe type %s", probe_type)
                continue

            result = labels.get('result')
            result_suffix = self._PROBE_RESULT_SUFFIXES.get(result)
            if result_suffix is None:
                self.log.debug("Unsupported probe result %s", result)
                continue

            metric_name = scraper_config['namespace'] + type_suffix + result_suffix

            container_id = self.pod_list_utils.get_cid_by_labels(labels)
            if container_id is None:
                self.log.debug(
                    "Container id not found from /pods for container: %s/%s/%s - no metrics will be sent",
                    get_container_label(labels, 'namespace'),
                    get_container_label(labels, 'pod'),
                    get_container_label(labels, 'container'),
                )
                continue

            if self.pod_list_utils.is_excluded(container_id):
                continue

            container_tags = tags_for_docker(replace_container_rt_prefix(container_id), tagger.HIGH, True)
            if not container_tags:
                self.log.debug(
                    "Tags not found for container: %s/%s/%s:%s - no metrics will be sent",
                    get_container_label(labels, 'namespace'),
                    get_container_label(labels, 'pod'),
                    get_container_label(labels, 'container'),
                    container_id,
                )
                # Honour the log message: without this the code would fall through and either
                # submit a metric without container tags or fail concatenating a missing tag list.
                continue

            self.count(metric_name, sample[self.SAMPLE_VALUE], container_tags + self.instance_tags)
48 changes: 9 additions & 39 deletions kubelet/datadog_checks/kubelet/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck
from datadog_checks.base.utils.tagging import tagger

from .common import get_pod_by_uid, is_static_pending_pod, replace_container_rt_prefix
from .common import get_pod_by_uid, is_static_pending_pod, replace_container_rt_prefix, get_container_label

METRIC_TYPES = ['counter', 'gauge', 'summary']

Expand Down Expand Up @@ -146,36 +146,6 @@ def _is_pod_metric(labels):
return True
return False

@staticmethod
def _get_container_label(labels, l_name):
"""
Iter on all labels to find the label.name equal to the l_name
:param labels: list of labels
:param l_name: str
:return: str or None
"""
if l_name in labels:
return labels[l_name]

def _get_container_id(self, labels):
"""
Should only be called on a container-scoped metric
It gets the container id from the podlist using the metrics labels
:param labels
:return str or None
"""
namespace = CadvisorPrometheusScraperMixin._get_container_label(labels, "namespace")
# k8s >= 1.16
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod")
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container")
# k8s < 1.16
if not pod_name:
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod_name")
if not container_name:
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container_name")
return self.pod_list_utils.get_cid_by_name_tuple((namespace, pod_name, container_name))

def _get_entity_id_if_container_metric(self, labels):
"""
Checks the labels indicate a container metric,
Expand All @@ -190,19 +160,19 @@ def _get_entity_id_if_container_metric(self, labels):
# If the pod is static, ContainerStatus is unavailable.
# Return the pod UID so that we can collect metrics from it later on.
return self._get_pod_uid(labels)
return self._get_container_id(labels)
return self.pod_list_utils.get_cid_by_labels(labels)

def _get_pod_uid(self, labels):
"""
Return the id of a pod
:param labels:
:return: str or None
"""
namespace = CadvisorPrometheusScraperMixin._get_container_label(labels, "namespace")
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod")
namespace = get_container_label(labels, "namespace")
pod_name = get_container_label(labels, "pod")
# k8s < 1.16
if not pod_name:
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod_name")
pod_name = get_container_label(labels, "pod_name")
return self.pod_list_utils.get_uid_by_name_tuple((namespace, pod_name))

def _get_pod_uid_if_pod_metric(self, labels):
Expand Down Expand Up @@ -246,10 +216,10 @@ def _get_kube_container_name(labels):
:param labels: metric labels: iterable
:return: list
"""
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container")
container_name = get_container_label(labels, "container")
# k8s < 1.16
if not container_name:
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container_name")
container_name = get_container_label(labels, "container_name")
if container_name:
return ["kube_container_name:%s" % container_name]
return []
Expand Down Expand Up @@ -376,7 +346,7 @@ def _process_usage_metric(self, m_name, metric, cache, scraper_config, labels=No

samples = self._sum_values_by_context(metric, self._get_entity_id_if_container_metric)
for c_id, sample in iteritems(samples):
c_name = self._get_container_label(sample[self.SAMPLE_LABELS], 'name')
c_name = get_container_label(sample[self.SAMPLE_LABELS], 'name')
if not c_name:
continue
pod_uid = self._get_pod_uid(sample[self.SAMPLE_LABELS])
Expand Down Expand Up @@ -436,7 +406,7 @@ def _process_limit_metric(self, m_name, metric, cache, scraper_config, pct_m_nam
self.gauge(m_name, limit, tags)

if pct_m_name and limit > 0:
c_name = self._get_container_label(sample[self.SAMPLE_LABELS], 'name')
c_name = get_container_label(sample[self.SAMPLE_LABELS], 'name')
if not c_name:
continue
usage, tags = cache.get(c_name, (None, None))
Expand Down
4 changes: 4 additions & 0 deletions kubelet/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,7 @@ kubernetes.kubelet.docker.operations.duration.sum,gauge,,operation,,The sum of d
kubernetes.kubelet.docker.operations.duration.count,gauge,,,,The count of docker operations,0,kubernetes,k8s.docker.duration,
kubernetes.go_threads,gauge,,,,Number of OS threads created,0,kubernetes,k8s.go.threads,
kubernetes.go_goroutines,gauge,,,,Number of goroutines that currently exist,0,kubernetes,k8s.go.goroutines,
kubernetes.liveness_probe.success.total,count,,,,Cumulative number of successful liveness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.liveness_probe.success.total,
kubernetes.liveness_probe.failure.total,count,,,,Cumulative number of failed liveness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.liveness_probe.failure.total,
kubernetes.readiness_probe.success.total,count,,,,Cumulative number of successful readiness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.readiness_probe.success.total,
kubernetes.readiness_probe.failure.total,count,,,,Cumulative number of failed readiness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.readiness_probe.failure.total,
Loading

0 comments on commit 432ff86

Please sign in to comment.