Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect liveness and readiness probe metrics #11682

Merged
merged 3 commits into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions kubelet/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ files:
URL of the kubelet metrics prometheus endpoint
Pass an empty string to disable kubelet metrics collection.
example: http://10.8.0.1:10255/metrics
- name: probes_metrics_endpoint
description: |
URL of the probe metrics prometheus endpoint
Pass an empty string to disable probe metrics collection.
example: http://10.8.0.1:10255/metrics/probes
- name: cadvisor_port
description: |
Metric collection for legacy (< 1.7.6) clusters via the kubelet's cadvisor port.
Expand Down
30 changes: 30 additions & 0 deletions kubelet/datadog_checks/kubelet/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ def replace_container_rt_prefix(cid):
return cid


def get_container_label(labels, l_name):
    """
    Look up a single label value in a metric sample's label mapping.

    :param labels: mapping of label name to label value
    :param l_name: str, name of the label to look up
    :return: the value stored under ``l_name``, or None when the label is absent
    """
    # One ``get`` replaces the membership test followed by a second lookup.
    return labels.get(l_name)
ahmed-mez marked this conversation as resolved.
Show resolved Hide resolved


class PodListUtils(object):
"""
Queries the podlist and the agent6's filtering logic to determine whether to
Expand Down Expand Up @@ -208,3 +219,22 @@ def is_namespace_excluded(self, namespace):
excluded = c_is_excluded('', '', namespace)
self.cache_namespace_exclusion[namespace] = excluded
return excluded

def get_cid_by_labels(self, labels):
    """
    Resolve the container id matching a container-scoped metric.

    The namespace, pod name and container name are read from the metric
    labels and handed to ``get_cid_by_name_tuple`` as a single tuple.
    :param labels: labels of the metric sample
    :return: the result of ``get_cid_by_name_tuple`` for that tuple
    """
    # "pod" / "container" are the k8s >= 1.16 label names; the "*_name"
    # variants are the k8s < 1.16 ones and only serve as a fallback.
    pod = get_container_label(labels, "pod") or get_container_label(labels, "pod_name")
    container = get_container_label(labels, "container") or get_container_label(labels, "container_name")
    key = (get_container_label(labels, "namespace"), pod, container)
    return self.get_cid_by_name_tuple(key)
5 changes: 5 additions & 0 deletions kubelet/datadog_checks/kubelet/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ instances:
#
# kubelet_metrics_endpoint: http://10.8.0.1:10255/metrics

## URL of the probe metrics prometheus endpoint
## Pass an empty string to disable probe metrics collection.
#
# probes_metrics_endpoint: http://10.8.0.1:10255/metrics/probes

## Metric collection for legacy (< 1.7.6) clusters via the kubelet's cadvisor port.
## This port is closed by default on k8s 1.7+ and OpenShift, enable it
## via the `--cadvisor-port=4194` kubelet option.
Expand Down
18 changes: 17 additions & 1 deletion kubelet/datadog_checks/kubelet/kubelet.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from .cadvisor import CadvisorScraper
from .common import CADVISOR_DEFAULT_PORT, PodListUtils, replace_container_rt_prefix
from .probes import ProbesPrometheusScraperMixin
from .prometheus import CadvisorPrometheusScraperMixin
from .summary import SummaryScraperMixin

Expand All @@ -29,6 +30,7 @@
CADVISOR_METRICS_PATH = '/metrics/cadvisor'
KUBELET_METRICS_PATH = '/metrics'
STATS_PATH = '/stats/summary/'
PROBES_METRICS_PATH = '/metrics/probes'

# Suffixes per
# https://github.com/kubernetes/kubernetes/blob/8fd414537b5143ab039cb910590237cabf4af783/pkg/api/resource/suffix.go#L108
Expand Down Expand Up @@ -128,6 +130,7 @@ class KubeletCheck(
OpenMetricsBaseCheck,
CadvisorScraper,
SummaryScraperMixin,
ProbesPrometheusScraperMixin,
KubeletBase,
):
"""
Expand Down Expand Up @@ -173,7 +176,8 @@ def __init__(self, name, init_config, instances):
self.pod_level_metrics = ["{0}.{1}".format(self.NAMESPACE, x) for x in pod_level_metrics]

kubelet_instance = self._create_kubelet_prometheus_instance(inst)
generic_instances = [cadvisor_instance, kubelet_instance]
probes_instance = self._create_probes_prometheus_instance(inst)
generic_instances = [cadvisor_instance, kubelet_instance, probes_instance]
super(KubeletCheck, self).__init__(name, init_config, generic_instances)

self.cadvisor_legacy_port = inst.get('cadvisor_port', CADVISOR_DEFAULT_PORT)
Expand All @@ -189,6 +193,8 @@ def __init__(self, name, init_config, instances):

self.kubelet_scraper_config = self.get_scraper_config(kubelet_instance)

self.probes_scraper_config = self.get_scraper_config(probes_instance)

counter_transformers = {k: self.send_always_counter for k in self.COUNTER_METRICS}

histogram_transformers = {
Expand All @@ -199,6 +205,7 @@ def __init__(self, name, init_config, instances):

self.transformers = {}
for d in [
self.PROBES_METRIC_TRANSFORMERS,
self.CADVISOR_METRIC_TRANSFORMERS,
counter_transformers,
histogram_transformers,
Expand Down Expand Up @@ -320,9 +327,14 @@ def check(self, instance):
'kubelet_metrics_endpoint', urljoin(endpoint, KUBELET_METRICS_PATH)
)

self.probes_scraper_config['prometheus_url'] = instance.get(
'probes_metrics_endpoint', urljoin(endpoint, PROBES_METRICS_PATH)
)

# Kubelet credentials handling
self.kubelet_credentials.configure_scraper(self.cadvisor_scraper_config)
self.kubelet_credentials.configure_scraper(self.kubelet_scraper_config)
self.kubelet_credentials.configure_scraper(self.probes_scraper_config)

# Legacy cadvisor support
try:
Expand Down Expand Up @@ -356,6 +368,10 @@ def check(self, instance):
self.log.debug('processing kubelet metrics')
self.process(self.kubelet_scraper_config, metric_transformers=self.transformers)

if self.probes_scraper_config['prometheus_url']:
self.log.debug('processing probe metrics')
self.process(self.probes_scraper_config, metric_transformers=self.transformers)

self.first_run = False

# Free up memory
Expand Down
94 changes: 94 additions & 0 deletions kubelet/datadog_checks/kubelet/probes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# (C) Datadog, Inc. 2018-present
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
from __future__ import division

from copy import deepcopy

from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck
from datadog_checks.base.utils.tagging import tagger

from .common import get_container_label, replace_container_rt_prefix, tags_for_docker


class ProbesPrometheusScraperMixin(object):
    """
    This class scrapes metrics for the kubelet "/metrics/probes" prometheus endpoint and submits
    them on behalf of a check.
    """

    def __init__(self, *args, **kwargs):
        super(ProbesPrometheusScraperMixin, self).__init__(*args, **kwargs)

        # Maps the raw prometheus metric name to the method that processes it.
        self.PROBES_METRIC_TRANSFORMERS = {
            'prober_probe_total': self.prober_probe_total,
        }

    def _create_probes_prometheus_instance(self, instance):
        """
        Create a copy of the instance and set default values.
        This is so the base class can create a scraper_config with the proper values.

        :param instance: dict, the check instance configuration (left unmodified)
        :return: dict, a deep copy of ``instance`` with ``namespace`` and ``prometheus_url`` set
        """
        probes_instance = deepcopy(instance)
        probes_instance.update(
            {
                'namespace': self.NAMESPACE,
                # We need to specify a prometheus_url so the base class can use it as the key for our config_map,
                # we specify a dummy url that will be replaced in the `check()` function. We append it with "probes"
                # so the key is different than the rest of the kubelet scrapers.
                'prometheus_url': instance.get('probes_metrics_endpoint', 'dummy_url/probes'),
            }
        )
        return probes_instance

    def prober_probe_total(self, metric, scraper_config):
        """
        Submit one count per sample of the ``prober_probe_total`` metric.

        The metric name is built from the scraper namespace, the ``probe_type`` label
        (Liveness / Readiness) and the ``result`` label (successful / failed / unknown).
        A sample is skipped, and nothing is submitted for it, when:
        - its probe type or result is not one of the values above,
        - no container id can be resolved from its labels,
        - the container is excluded,
        - no tags are found for the container.

        :param metric: scraped metric; its ``samples`` are iterated
        :param scraper_config: scraper configuration; ``namespace`` prefixes the metric name
        """
        for sample in metric.samples:
            labels = sample[OpenMetricsBaseCheck.SAMPLE_LABELS]

            probe_type = labels.get('probe_type')
            if probe_type == 'Liveness':
                metric_name_suffix = '.liveness_probe'
            elif probe_type == 'Readiness':
                metric_name_suffix = '.readiness_probe'
            else:
                self.log.debug("Unsupported probe type %s", probe_type)
                continue

            result = labels.get('result')
            if result == 'successful':
                metric_name_suffix = metric_name_suffix + '.success.total'
            elif result == 'failed':
                metric_name_suffix = metric_name_suffix + '.failure.total'
            elif result == 'unknown':
                metric_name_suffix = metric_name_suffix + '.unknown.total'
            else:
                self.log.debug("Unsupported probe result %s", result)
                continue

            metric_name = scraper_config['namespace'] + metric_name_suffix

            container_id = self.pod_list_utils.get_cid_by_labels(labels)
            if container_id is None:
                self.log.debug(
                    "Container id not found from /pods for container: %s/%s/%s - no metrics will be sent",
                    get_container_label(labels, 'namespace'),
                    get_container_label(labels, 'pod'),
                    get_container_label(labels, 'container'),
                )
                continue

            if self.pod_list_utils.is_excluded(container_id):
                continue

            container_tags = tags_for_docker(replace_container_rt_prefix(container_id), tagger.HIGH, True)
            if not container_tags:
                self.log.debug(
                    "Tags not found for container: %s/%s/%s:%s - no metrics will be sent",
                    get_container_label(labels, 'namespace'),
                    get_container_label(labels, 'pod'),
                    get_container_label(labels, 'container'),
                    container_id,
                )
                # Honour the log message: without this, execution falls through to the
                # submission below, which concatenates the (missing or empty) tags.
                continue

            self.count(metric_name, sample[self.SAMPLE_VALUE], container_tags + self.instance_tags)
48 changes: 9 additions & 39 deletions kubelet/datadog_checks/kubelet/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck
from datadog_checks.base.utils.tagging import tagger

from .common import get_pod_by_uid, is_static_pending_pod, replace_container_rt_prefix
from .common import get_container_label, get_pod_by_uid, is_static_pending_pod, replace_container_rt_prefix

METRIC_TYPES = ['counter', 'gauge', 'summary']

Expand Down Expand Up @@ -146,36 +146,6 @@ def _is_pod_metric(labels):
return True
return False

@staticmethod
def _get_container_label(labels, l_name):
"""
Iter on all labels to find the label.name equal to the l_name
:param labels: list of labels
:param l_name: str
:return: str or None
"""
if l_name in labels:
return labels[l_name]

def _get_container_id(self, labels):
"""
Should only be called on a container-scoped metric
It gets the container id from the podlist using the metrics labels
:param labels
:return str or None
"""
namespace = CadvisorPrometheusScraperMixin._get_container_label(labels, "namespace")
# k8s >= 1.16
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod")
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container")
# k8s < 1.16
if not pod_name:
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod_name")
if not container_name:
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container_name")
return self.pod_list_utils.get_cid_by_name_tuple((namespace, pod_name, container_name))

def _get_entity_id_if_container_metric(self, labels):
"""
Checks the labels indicate a container metric,
Expand All @@ -190,19 +160,19 @@ def _get_entity_id_if_container_metric(self, labels):
# If the pod is static, ContainerStatus is unavailable.
# Return the pod UID so that we can collect metrics from it later on.
return self._get_pod_uid(labels)
return self._get_container_id(labels)
return self.pod_list_utils.get_cid_by_labels(labels)

def _get_pod_uid(self, labels):
"""
Return the id of a pod
:param labels:
:return: str or None
"""
namespace = CadvisorPrometheusScraperMixin._get_container_label(labels, "namespace")
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod")
namespace = get_container_label(labels, "namespace")
pod_name = get_container_label(labels, "pod")
# k8s < 1.16
if not pod_name:
pod_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "pod_name")
pod_name = get_container_label(labels, "pod_name")
return self.pod_list_utils.get_uid_by_name_tuple((namespace, pod_name))

def _get_pod_uid_if_pod_metric(self, labels):
Expand Down Expand Up @@ -246,10 +216,10 @@ def _get_kube_container_name(labels):
:param labels: metric labels: iterable
:return: list
"""
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container")
container_name = get_container_label(labels, "container")
# k8s < 1.16
if not container_name:
container_name = CadvisorPrometheusScraperMixin._get_container_label(labels, "container_name")
container_name = get_container_label(labels, "container_name")
if container_name:
return ["kube_container_name:%s" % container_name]
return []
Expand Down Expand Up @@ -376,7 +346,7 @@ def _process_usage_metric(self, m_name, metric, cache, scraper_config, labels=No

samples = self._sum_values_by_context(metric, self._get_entity_id_if_container_metric)
for c_id, sample in iteritems(samples):
c_name = self._get_container_label(sample[self.SAMPLE_LABELS], 'name')
c_name = get_container_label(sample[self.SAMPLE_LABELS], 'name')
if not c_name:
continue
pod_uid = self._get_pod_uid(sample[self.SAMPLE_LABELS])
Expand Down Expand Up @@ -436,7 +406,7 @@ def _process_limit_metric(self, m_name, metric, cache, scraper_config, pct_m_nam
self.gauge(m_name, limit, tags)

if pct_m_name and limit > 0:
c_name = self._get_container_label(sample[self.SAMPLE_LABELS], 'name')
c_name = get_container_label(sample[self.SAMPLE_LABELS], 'name')
if not c_name:
continue
usage, tags = cache.get(c_name, (None, None))
Expand Down
4 changes: 4 additions & 0 deletions kubelet/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,7 @@ kubernetes.kubelet.docker.operations.duration.sum,gauge,,operation,,The sum of d
kubernetes.kubelet.docker.operations.duration.count,gauge,,,,The count of docker operations,0,kubernetes,k8s.docker.duration,
kubernetes.go_threads,gauge,,,,Number of OS threads created,0,kubernetes,k8s.go.threads,
kubernetes.go_goroutines,gauge,,,,Number of goroutines that currently exist,0,kubernetes,k8s.go.goroutines,
kubernetes.liveness_probe.success.total,count,,,,Cumulative number of successful liveness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.liveness_probe.success.total,
kubernetes.liveness_probe.failure.total,count,,,,Cumulative number of failed liveness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.liveness_probe.failure.total,
kubernetes.readiness_probe.success.total,count,,,,Cumulative number of successful readiness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.readiness_probe.success.total,
kubernetes.readiness_probe.failure.total,count,,,,Cumulative number of failed readiness probe for a container (ALPHA in kubernetes v1.15),-1,kubernetes,k8s.readiness_probe.failure.total,
Loading