From 60299560aa1332c46526db89cd3f6e11680e0df0 Mon Sep 17 00:00:00 2001 From: sukun Date: Thu, 16 Mar 2023 17:11:51 +0530 Subject: [PATCH 01/10] autorelay: add metrics metrics added: relay finder status reservation request outcomes current reservations candidate circuit v2 support current candidates relay addresses updated num relay address scheduled work times --- config/config.go | 6 + dashboards/autorelay/autorelay.json | 852 +++++++++++++++++++++ p2p/host/autorelay/autorelay.go | 6 + p2p/host/autorelay/metrics.go | 294 +++++++ p2p/host/autorelay/metrics_noalloc_test.go | 48 ++ p2p/host/autorelay/options.go | 10 + p2p/host/autorelay/relay_finder.go | 70 +- 7 files changed, 1272 insertions(+), 14 deletions(-) create mode 100644 dashboards/autorelay/autorelay.json create mode 100644 p2p/host/autorelay/metrics.go create mode 100644 p2p/host/autorelay/metrics_noalloc_test.go diff --git a/config/config.go b/config/config.go index 70686f7634..a3bd86f27b 100644 --- a/config/config.go +++ b/config/config.go @@ -354,6 +354,12 @@ func (cfg *Config) NewNode() (host.Host, error) { h.Close() return nil, fmt.Errorf("cannot enable autorelay; relay is not enabled") } + if !cfg.DisableMetrics { + mt := autorelay.WithMetricsTracer( + autorelay.NewMetricsTracer(autorelay.WithRegisterer(cfg.PrometheusRegisterer))) + mtOpts := []autorelay.Option{mt} + cfg.AutoRelayOpts = append(mtOpts, cfg.AutoRelayOpts...) + } ar, err = autorelay.NewAutoRelay(h, cfg.AutoRelayOpts...) if err != nil { diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json new file mode 100644 index 0000000000..4d4810777a --- /dev/null +++ b/dashboards/autorelay/autorelay.json @@ -0,0 +1,852 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.6" + }, + { + "type": "panel", + "id": "piechart", + "name": "Pie chart", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "state-timeline", + "name": "State timeline", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 24, + "panels": [], + "title": "Scheduled Work", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 16, + "panels": [], + "title": "Status", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "mappings": [ + { + "options": { + "0": { + "color": "purple", + "index": 1, + "text": "No" + }, + "1": { + "color": "green", + "index": 0, + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 2, + "options": { + "alignValue": "center", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_status", + "legendFormat": "active", + "range": true, + "refId": "A" + } + ], + "title": "Relay Finder Status", + "type": "state-timeline" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 18, + "panels": [], + "title": "Reservations", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_reservations_opened_total - libp2p_autorelay_reservations_closed_total", + "legendFormat": "current reservations", + "range": true, + "refId": "A" + } + ], + "title": "Current Reservations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "refresh reservation: failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "new reservation: success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "refresh reservation: success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "new reservation: failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "increase(libp2p_autorelay_reservation_requests_outcome_total[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{request_type}} reservation: {{outcome}}", + "range": true, + "refId": "A" + } + ], + "title": "Reservation Requests Outcome", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 20, + "panels": [], + "title": "Candidates", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 2, + "y": 17 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_candidates_total{type=\"added\"} - ignoring(type) libp2p_autorelay_candidates_total{type=\"removed\"}", + "legendFormat": "num candidates", + "range": true, + "refId": "A" + } + ], + "title": "Current Candidates", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "yes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "no" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 8, + "y": 17 + }, + "id": 12, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "increase(libp2p_autorelay_candidates_circuit_v2_support_total[$__range])", + "legendFormat": "{{support}}", + "range": true, + "refId": "A" + } + ], + "title": "Candidates Circuit V2 Support", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "dateTimeFromNow" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 14, + "y": 17 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_scheduled_work_time{work_type=\"old candidate check\"} * 1000\n", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Next Old Candidate Check", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 22, + "panels": [], + "title": "Relay Addresses", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "yellow", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 26 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_relay_addresses_count", + "legendFormat": "num addresses", + "range": true, + "refId": "A" + } + ], + "title": "Relay Addresses", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "update triggered" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 26 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "increase(libp2p_autorelay_relay_addresses_updated_total[$__rate_interval])", + "legendFormat": "update triggered", + "range": true, + "refId": "A" + } + ], + "title": "Relay Addresses Updated", + "type": "timeseries" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "libp2p Autorelay", + "uid": "deQ_uf-4k", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/p2p/host/autorelay/autorelay.go b/p2p/host/autorelay/autorelay.go index fc46a2cc53..580cfaa9a0 100644 --- a/p2p/host/autorelay/autorelay.go +++ b/p2p/host/autorelay/autorelay.go @@ -30,6 +30,8 @@ type AutoRelay struct { host host.Host addrsF basic.AddrsFactory + + metricsTracer MetricsTracer } func NewAutoRelay(bhost *basic.BasicHost, opts ...Option) (*AutoRelay, error) { @@ -47,6 +49,7 @@ func NewAutoRelay(bhost *basic.BasicHost, opts ...Option) (*AutoRelay, error) { r.ctx, r.ctxCancel = context.WithCancel(context.Background()) r.conf = &conf r.relayFinder = newRelayFinder(bhost, conf.peerSource, &conf) + r.metricsTracer = &wrappedMetricsTracer{conf.metricsTracer} bhost.AddrsFactory = r.hostAddrs return r, nil @@ -82,9 +85,12 @@ func (r *AutoRelay) background() { case network.ReachabilityPrivate, network.ReachabilityUnknown: if err := r.relayFinder.Start(); err != nil { log.Errorw("failed to start relay finder", "error", err) + } else { + r.metricsTracer.RelayFinderStatus(true) } case network.ReachabilityPublic: r.relayFinder.Stop() + r.metricsTracer.RelayFinderStatus(false) } r.mx.Lock() r.status = evt.Reachability diff --git a/p2p/host/autorelay/metrics.go b/p2p/host/autorelay/metrics.go new file mode 100644 index 0000000000..06b395af35 --- /dev/null +++ b/p2p/host/autorelay/metrics.go @@ -0,0 +1,294 @@ +package autorelay + +import ( + "github.com/libp2p/go-libp2p/p2p/metricshelper" + "github.com/prometheus/client_golang/prometheus" +) + +const metricNamespace = "libp2p_autorelay" + +var ( + status = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "status", + Help: "relay finder active", + }) + reservationsOpenedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "reservations_opened_total", + Help: "Reservations Opened", + }, + ) + reservationsClosedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "reservations_closed_total", + Help: "Reservations Closed", + }, + ) + reservationRequestsOutcomeTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "reservation_requests_outcome_total", + Help: "Reservation Request Outcome", + }, + []string{"request_type", "outcome"}, + ) + + relayAddressesUpdatedTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "relay_addresses_updated_total", + Help: "Relay Addresses Updated Count", + }, + ) + relayAddressesCount = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "relay_addresses_count", + Help: "Relay Addresses Count", + }, + ) + + candidatesCircuitV2SupportTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "candidates_circuit_v2_support_total", + Help: "Candidiates supporting circuit v2", + }, + []string{"support"}, + ) + candidatesTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: "candidates_total", + Help: "Candidates Total", + }, + []string{"type"}, + ) + + scheduledWorkTime = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "scheduled_work_time", + Help: "Scheduled Work Times", + }, + []string{"work_type"}, + ) + + collectors = []prometheus.Collector{ + status, + reservationsOpenedTotal, + reservationsClosedTotal, + reservationRequestsOutcomeTotal, + relayAddressesUpdatedTotal, + relayAddressesCount, + candidatesCircuitV2SupportTotal, + candidatesTotal, + scheduledWorkTime, + } +) + +// MetricsTracer is the interface for tracking metrics for autorelay +type MetricsTracer interface { + RelayFinderStatus(isActive bool) + + ReservationEnded() + ReservationRequestFinished(isRefresh bool, success bool) + + RelayAddressCount(int) + RelayAddressUpdated() + + CandidateChecked(supportsCircuitV2 bool) + CandidateAdded() + CandidateRemoved() + + ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) +} + +type metricsTracer struct{} + +var _ MetricsTracer = &metricsTracer{} + +type metricsTracerSetting struct { + reg prometheus.Registerer +} + +type MetricsTracerOption func(*metricsTracerSetting) + +func WithRegisterer(reg prometheus.Registerer) MetricsTracerOption { + return func(s *metricsTracerSetting) { + if reg != nil { + s.reg = reg + } + } +} + +func NewMetricsTracer(opts ...MetricsTracerOption) MetricsTracer { + setting := &metricsTracerSetting{reg: prometheus.DefaultRegisterer} + for _, opt := range opts { + opt(setting) + } + metricshelper.RegisterCollectors(setting.reg, collectors...) + + // Initialise these counters to 0 otherwise the first reservation requests aren't handled + // correctly when using promql increse function + reservationRequestsOutcomeTotal.WithLabelValues("refresh", "success") + reservationRequestsOutcomeTotal.WithLabelValues("refresh", "failed") + reservationRequestsOutcomeTotal.WithLabelValues("new", "success") + reservationRequestsOutcomeTotal.WithLabelValues("new", "failed") + candidatesCircuitV2SupportTotal.WithLabelValues("yes") + candidatesCircuitV2SupportTotal.WithLabelValues("no") + return &metricsTracer{} +} + +func (mt *metricsTracer) RelayFinderStatus(isActive bool) { + if isActive { + status.Set(1) + } else { + status.Set(0) + } +} + +func (mt *metricsTracer) ReservationEnded() { + reservationsClosedTotal.Inc() +} + +func (mt *metricsTracer) ReservationRequestFinished(isRefresh bool, success bool) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + + if isRefresh { + *tags = append(*tags, "refresh") + } else { + *tags = append(*tags, "new") + } + + if success { + *tags = append(*tags, "success") + } else { + *tags = append(*tags, "failed") + } + reservationRequestsOutcomeTotal.WithLabelValues(*tags...).Inc() + + if !isRefresh && success { + reservationsOpenedTotal.Inc() + } +} + +func (mt *metricsTracer) RelayAddressUpdated() { + relayAddressesUpdatedTotal.Inc() +} + +func (mt *metricsTracer) RelayAddressCount(cnt int) { + relayAddressesCount.Set(float64(cnt)) +} + +func (mt *metricsTracer) CandidateChecked(supportsCircuitV2 bool) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + if supportsCircuitV2 { + *tags = append(*tags, "yes") + } else { + *tags = append(*tags, "no") + } + candidatesCircuitV2SupportTotal.WithLabelValues(*tags...).Inc() +} + +func (mt *metricsTracer) CandidateAdded() { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + *tags = append(*tags, "added") + candidatesTotal.WithLabelValues(*tags...).Inc() +} + +func (mt *metricsTracer) CandidateRemoved() { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + *tags = append(*tags, "removed") + candidatesTotal.WithLabelValues(*tags...).Inc() +} + +func (mt *metricsTracer) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + + *tags = append(*tags, "allowed peer source call") + scheduledWorkTime.WithLabelValues(*tags...).Set(float64(scheduledWork.nextAllowedCallToPeerSource.Unix())) + *tags = (*tags)[:0] + + *tags = append(*tags, "reservation refresh") + scheduledWorkTime.WithLabelValues(*tags...).Set(float64(scheduledWork.nextRefresh.Unix())) + *tags = (*tags)[:0] + + *tags = append(*tags, "clear backoff") + scheduledWorkTime.WithLabelValues(*tags...).Set(float64(scheduledWork.nextBackoff.Unix())) + *tags = (*tags)[:0] + + *tags = append(*tags, "old candidate check") + scheduledWorkTime.WithLabelValues(*tags...).Set(float64(scheduledWork.nextOldCandidateCheck.Unix())) +} + +// wrappedMetricsTracer wraps MetricsTracer and ignores all calls when mt is nil +type wrappedMetricsTracer struct { + mt MetricsTracer +} + +var _ MetricsTracer = &wrappedMetricsTracer{} + +func (mt *wrappedMetricsTracer) RelayFinderStatus(isActive bool) { + if isActive { + status.Set(1) + } else { + status.Set(2) + } +} + +func (mt *wrappedMetricsTracer) ReservationEnded() { + if mt.mt != nil { + mt.mt.ReservationEnded() + } +} + +func (mt *wrappedMetricsTracer) ReservationRequestFinished(isRefresh bool, success bool) { + if mt.mt != nil { + mt.mt.ReservationRequestFinished(isRefresh, success) + } +} + +func (mt *wrappedMetricsTracer) RelayAddressUpdated() { + if mt.mt != nil { + mt.mt.RelayAddressUpdated() + } +} + +func (mt *wrappedMetricsTracer) RelayAddressCount(cnt int) { + if mt.mt != nil { + mt.mt.RelayAddressCount(cnt) + } +} + +func (mt *wrappedMetricsTracer) CandidateChecked(supportsCircuitV2 bool) { + if mt.mt != nil { + mt.mt.CandidateChecked(supportsCircuitV2) + } +} + +func (mt *wrappedMetricsTracer) CandidateAdded() { + if mt.mt != nil { + mt.mt.CandidateAdded() + } +} + +func (mt *wrappedMetricsTracer) CandidateRemoved() { + if mt.mt != nil { + mt.mt.CandidateRemoved() + } +} + +func (mt *wrappedMetricsTracer) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) { + if mt.mt != nil { + mt.mt.ScheduledWorkUpdated(scheduledWork) + } +} diff --git a/p2p/host/autorelay/metrics_noalloc_test.go b/p2p/host/autorelay/metrics_noalloc_test.go new file mode 100644 index 0000000000..c6abfafff0 --- /dev/null +++ b/p2p/host/autorelay/metrics_noalloc_test.go @@ -0,0 +1,48 @@ +//go:build nocover + +package autorelay + +import ( + "math/rand" + "testing" + "time" +) + +func getRandScheduledWork() scheduledWorkTimes { + randTime := func() time.Time { + return time.Now().Add(time.Duration(rand.Intn(10)) * time.Second) + } + return scheduledWorkTimes{ + leastFrequentInterval: 0, + nextRefresh: randTime(), + nextBackoff: randTime(), + nextOldCandidateCheck: randTime(), + nextAllowedCallToPeerSource: randTime(), + } +} + +func TestMetricsNoAllocNoCover(t *testing.T) { + scheduledWork := []scheduledWorkTimes{} + for i := 0; i < 10; i++ { + scheduledWork = append(scheduledWork, getRandScheduledWork()) + } + tr := NewMetricsTracer() + tests := map[string]func(){ + "RelayFinderStatus": func() { tr.RelayFinderStatus(rand.Intn(2) == 1) }, + "ReservationEnded": func() { tr.ReservationEnded() }, + "ReservationRequestFinished": func() { tr.ReservationRequestFinished(rand.Intn(2) == 1, rand.Intn(2) == 1) }, + "RelayAddressCount": func() { tr.RelayAddressCount(rand.Intn(10)) }, + "RelayAddressUpdated": func() { tr.RelayAddressUpdated() }, + "CandidateChecked": func() { tr.CandidateChecked(rand.Intn(2) == 1) }, + "CandidateAdded": func() { tr.CandidateAdded() }, + "CandidateRemoved": func() { tr.CandidateRemoved() }, + "ScheduledWorkUpdated": func() { tr.ScheduledWorkUpdated(&scheduledWork[rand.Intn(len(scheduledWork))]) }, + } + for method, f := range tests { + allocs := testing.AllocsPerRun(1000, f) + + if allocs > 0 { + t.Fatalf("Alloc Test: %s, got: %0.2f, expected: 0 allocs", method, allocs) + } + } +} diff --git a/p2p/host/autorelay/options.go b/p2p/host/autorelay/options.go index dbacf55e7f..26ba92018f 100644 --- a/p2p/host/autorelay/options.go +++ b/p2p/host/autorelay/options.go @@ -40,6 +40,8 @@ type config struct { // see WithMaxCandidateAge maxCandidateAge time.Duration setMinCandidates bool + // see WithMetricsTracer + metricsTracer MetricsTracer } var defaultConfig = config{ @@ -221,3 +223,11 @@ func WithMinInterval(interval time.Duration) Option { return nil } } + +// WithMetricsTracer configures autorelay to use mt to track metrics +func WithMetricsTracer(mt MetricsTracer) Option { + return func(c *config) error { + c.metricsTracer = mt + return nil + } +} diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index ec46d58213..da2960deed 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -80,6 +80,7 @@ type relayFinder struct { // A channel that triggers a run of `runScheduledWork`. triggerRunScheduledWork chan struct{} + metricsTracer MetricsTracer } func newRelayFinder(host *basic.BasicHost, peerSource PeerSource, conf *config) *relayFinder { @@ -100,6 +101,7 @@ func newRelayFinder(host *basic.BasicHost, peerSource PeerSource, conf *config) triggerRunScheduledWork: make(chan struct{}, 1), relays: make(map[peer.ID]*circuitv2.Reservation), relayUpdated: make(chan struct{}, 1), + metricsTracer: &wrappedMetricsTracer{conf.metricsTracer}, } } @@ -184,6 +186,7 @@ func (rf *relayFinder) background(ctx context.Context) { if push { rf.clearCachedAddrsAndSignalAddressChange() + rf.metricsTracer.ReservationEnded() } case <-rf.candidateFound: rf.notifyMaybeConnectToRelay() @@ -211,6 +214,8 @@ func (rf *relayFinder) clearCachedAddrsAndSignalAddressChange() { rf.cachedAddrs = nil rf.relayMx.Unlock() rf.host.SignalAddressChange() + + rf.metricsTracer.RelayAddressUpdated() } func (rf *relayFinder) runScheduledWork(ctx context.Context, now time.Time, scheduledWork *scheduledWorkTimes, peerSourceRateLimiter chan<- struct{}) time.Time { @@ -262,6 +267,8 @@ func (rf *relayFinder) runScheduledWork(ctx context.Context, now time.Time, sche nextTime = nextTime.Add(1) // avoids an infinite loop } + rf.metricsTracer.ScheduledWorkUpdated(scheduledWork) + return nextTime } @@ -281,10 +288,8 @@ func (rf *relayFinder) clearOldCandidates(now time.Time) time.Time { nextTime = expiry } } else { - deleted = true log.Debugw("deleting candidate due to age", "id", id) - delete(rf.candidates, id) - + rf.removeCandidate(id) } } if deleted { @@ -417,23 +422,30 @@ func (rf *relayFinder) handleNewNode(ctx context.Context, pi peer.AddrInfo) (add supportsV2, err := rf.tryNode(ctx, pi) if err != nil { log.Debugf("node %s not accepted as a candidate: %s", pi.ID, err) + if err == errProtocolNotSupported { + rf.metricsTracer.CandidateChecked(false) + } return false } + rf.metricsTracer.CandidateChecked(true) + rf.candidateMx.Lock() if len(rf.candidates) > rf.conf.maxCandidates { rf.candidateMx.Unlock() return false } log.Debugw("node supports relay protocol", "peer", pi.ID, "supports circuit v2", supportsV2) - rf.candidates[pi.ID] = &candidate{ + rf.addCandidate(&candidate{ added: rf.conf.clock.Now(), ai: pi, supportsRelayV2: supportsV2, - } + }) rf.candidateMx.Unlock() return true } +var errProtocolNotSupported = errors.New("doesn't speak circuit v2") + // tryNode checks if a peer actually supports either circuit v2. // It does not modify any internal state. func (rf *relayFinder) tryNode(ctx context.Context, pi peer.AddrInfo) (supportsRelayV2 bool, err error) { @@ -474,7 +486,7 @@ func (rf *relayFinder) tryNode(ctx context.Context, pi peer.AddrInfo) (supportsR return false, fmt.Errorf("error checking relay protocol support for peer %s: %w", pi.ID, err) } if len(protos) == 0 { - return false, errors.New("doesn't speak circuit v2") + return false, errProtocolNotSupported } return true, nil } @@ -526,7 +538,7 @@ func (rf *relayFinder) maybeConnectToRelay(ctx context.Context) { rf.relayMx.Unlock() if usingRelay { rf.candidateMx.Lock() - delete(rf.candidates, id) + rf.removeCandidate(id) rf.candidateMx.Unlock() rf.notifyMaybeNeedNewCandidates() continue @@ -535,6 +547,7 @@ func (rf *relayFinder) maybeConnectToRelay(ctx context.Context) { if err != nil { log.Debugw("failed to connect to relay", "peer", id, "error", err) rf.notifyMaybeNeedNewCandidates() + rf.metricsTracer.ReservationRequestFinished(false, false) continue } log.Debugw("adding new relay", "id", id) @@ -551,6 +564,8 @@ func (rf *relayFinder) maybeConnectToRelay(ctx context.Context) { default: } + rf.metricsTracer.ReservationRequestFinished(false, true) + if numRelays >= rf.conf.desiredRelays { break } @@ -569,7 +584,7 @@ func (rf *relayFinder) connectToRelay(ctx context.Context, cand *candidate) (*ci if rf.host.Network().Connectedness(id) != network.Connected { if err := rf.host.Connect(ctx, cand.ai); err != nil { rf.candidateMx.Lock() - delete(rf.candidates, cand.ai.ID) + rf.removeCandidate(cand.ai.ID) rf.candidateMx.Unlock() return nil, fmt.Errorf("failed to connect: %w", err) } @@ -586,7 +601,7 @@ func (rf *relayFinder) connectToRelay(ctx context.Context, cand *candidate) (*ci } } rf.candidateMx.Lock() - delete(rf.candidates, id) + rf.removeCandidate(id) rf.candidateMx.Unlock() return rsvp, err } @@ -602,7 +617,12 @@ func (rf *relayFinder) refreshReservations(ctx context.Context, now time.Time) b } p := p - g.Go(func() error { return rf.refreshRelayReservation(ctx, p) }) + g.Go(func() error { + err := rf.refreshRelayReservation(ctx, p) + rf.metricsTracer.ReservationRequestFinished(true, err != nil) + + return err + }) } rf.relayMx.Unlock() @@ -614,19 +634,22 @@ func (rf *relayFinder) refreshRelayReservation(ctx context.Context, p peer.ID) e rsvp, err := circuitv2.Reserve(ctx, rf.host, peer.AddrInfo{ID: p}) rf.relayMx.Lock() - defer rf.relayMx.Unlock() - if err != nil { log.Debugw("failed to refresh relay slot reservation", "relay", p, "error", err) - + _, exists := rf.relays[p] delete(rf.relays, p) // unprotect the connection rf.host.ConnManager().Unprotect(p, autorelayTag) + rf.relayMx.Unlock() + if exists { + rf.metricsTracer.ReservationEnded() + } return err } log.Debugw("refreshed relay slot reservation", "relay", p) rf.relays[p] = rsvp + rf.relayMx.Unlock() return nil } @@ -636,6 +659,23 @@ func (rf *relayFinder) usingRelay(p peer.ID) bool { return ok } +// addCandidates adds a candidate to the candidates set. Assumes caller holds candidateMx mutex +func (rf *relayFinder) addCandidate(cand *candidate) { + _, exists := rf.candidates[cand.ai.ID] + rf.candidates[cand.ai.ID] = cand + if !exists { + rf.metricsTracer.CandidateAdded() + } +} + +func (rf *relayFinder) removeCandidate(id peer.ID) { + _, exists := rf.candidates[id] + if exists { + delete(rf.candidates, id) + rf.metricsTracer.CandidateRemoved() + } +} + // selectCandidates returns an ordered slice of relay candidates. // Callers should attempt to obtain reservations with the candidates in this order. func (rf *relayFinder) selectCandidates() []*candidate { @@ -680,9 +720,10 @@ func (rf *relayFinder) relayAddrs(addrs []ma.Multiaddr) []ma.Multiaddr { } // add relay specific addrs to the list + relayAddrCnt := 0 for p := range rf.relays { addrs := cleanupAddressSet(rf.host.Peerstore().Addrs(p)) - + relayAddrCnt += len(addrs) circuit := ma.StringCast(fmt.Sprintf("/p2p/%s/p2p-circuit", p.Pretty())) for _, addr := range addrs { pub := addr.Encapsulate(circuit) @@ -693,6 +734,7 @@ func (rf *relayFinder) relayAddrs(addrs []ma.Multiaddr) []ma.Multiaddr { rf.cachedAddrs = raddrs rf.cachedAddrsExpiry = rf.conf.clock.Now().Add(30 * time.Second) + rf.metricsTracer.RelayAddressCount(relayAddrCnt) return raddrs } From ca27e0c11957e6e3961153ee834c5c2c95ccc841 Mon Sep 17 00:00:00 2001 From: sukun Date: Tue, 21 Mar 2023 12:39:36 +0530 Subject: [PATCH 02/10] autorelay: fix refresh reservations bug --- dashboards/autorelay/autorelay.json | 28 ++++++++++++++++++++++------ p2p/host/autorelay/metrics.go | 27 +++++++++++++++++++++++---- p2p/host/autorelay/relay_finder.go | 4 +++- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index 4d4810777a..32b582430d 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -11,6 +11,12 @@ ], "__elements": {}, "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, { "type": "grafana", "id": "grafana", @@ -230,9 +236,6 @@ }, "id": 4, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ @@ -241,7 +244,8 @@ "fields": "", "values": false }, - "textMode": "auto" + "showThresholdLabels": false, + "showThresholdMarkers": true }, "pluginVersion": "9.3.6", "targets": [ @@ -255,10 +259,22 @@ "legendFormat": "current reservations", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "libp2p_autorelay_desired_reservations", + "hide": true, + "legendFormat": "__auto", + "range": true, + "refId": "B" } ], "title": "Current Reservations", - "type": "stat" + "type": "gauge" }, { "datasource": { @@ -840,7 +856,7 @@ "list": [] }, "time": { - "from": "now-1h", + "from": "now-5m", "to": "now" }, "timepicker": {}, diff --git a/p2p/host/autorelay/metrics.go b/p2p/host/autorelay/metrics.go index 06b395af35..a83b2df4b5 100644 --- a/p2p/host/autorelay/metrics.go +++ b/p2p/host/autorelay/metrics.go @@ -77,6 +77,14 @@ var ( []string{"work_type"}, ) + desiredReservations = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "desired_reservations", + Help: "Desired Reservations", + }, + ) + collectors = []prometheus.Collector{ status, reservationsOpenedTotal, @@ -87,6 +95,7 @@ var ( candidatesCircuitV2SupportTotal, candidatesTotal, scheduledWorkTime, + desiredReservations, } ) @@ -105,6 +114,8 @@ type MetricsTracer interface { CandidateRemoved() ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) + + DesiredReservations(int) } type metricsTracer struct{} @@ -230,6 +241,10 @@ func (mt *metricsTracer) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) scheduledWorkTime.WithLabelValues(*tags...).Set(float64(scheduledWork.nextOldCandidateCheck.Unix())) } +func (mt *metricsTracer) DesiredReservations(cnt int) { + desiredReservations.Set(float64(cnt)) +} + // wrappedMetricsTracer wraps MetricsTracer and ignores all calls when mt is nil type wrappedMetricsTracer struct { mt MetricsTracer @@ -238,10 +253,8 @@ type wrappedMetricsTracer struct { var _ MetricsTracer = &wrappedMetricsTracer{} func (mt *wrappedMetricsTracer) RelayFinderStatus(isActive bool) { - if isActive { - status.Set(1) - } else { - status.Set(2) + if mt.mt != nil { + mt.mt.RelayFinderStatus(isActive) } } @@ -292,3 +305,9 @@ func (mt *wrappedMetricsTracer) ScheduledWorkUpdated(scheduledWork *scheduledWor mt.mt.ScheduledWorkUpdated(scheduledWork) } } + +func (mt *wrappedMetricsTracer) DesiredReservations(cnt int) { + if mt.mt != nil { + mt.mt.DesiredReservations(cnt) + } +} diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index da2960deed..1b69966f85 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -138,6 +138,8 @@ func (rf *relayFinder) background(ctx context.Context) { bootDelayTimer := rf.conf.clock.InstantTimer(now.Add(rf.conf.bootDelay)) defer bootDelayTimer.Stop() + rf.metricsTracer.DesiredReservations(rf.conf.desiredRelays) + // This is the least frequent event. It's our fallback timer if we don't have any other work to do. leastFrequentInterval := rf.conf.minInterval // Check if leastFrequentInterval is 0 to avoid busy looping @@ -619,7 +621,7 @@ func (rf *relayFinder) refreshReservations(ctx context.Context, now time.Time) b p := p g.Go(func() error { err := rf.refreshRelayReservation(ctx, p) - rf.metricsTracer.ReservationRequestFinished(true, err != nil) + rf.metricsTracer.ReservationRequestFinished(true, err == nil) return err }) From 62914d9fdd16dd0018ce4b6c690e64f32b962dcd Mon Sep 17 00:00:00 2001 From: sukun Date: Tue, 21 Mar 2023 15:45:38 +0530 Subject: [PATCH 03/10] fix max value hack --- dashboards/autorelay/autorelay.json | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index 32b582430d..dd7ee06ab1 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -267,13 +267,32 @@ }, "editorMode": "code", "expr": "libp2p_autorelay_desired_reservations", - "hide": true, + "hide": false, "legendFormat": "__auto", "range": true, - "refId": "B" + "refId": "config-query" } ], "title": "Current Reservations", + "transformations": [ + { + "id": "configFromData", + "options": { + "applyTo": { + "id": "byFrameRefID", + "options": "A" + }, + "configRefId": "config-query", + "mappings": [ + { + "fieldName": "{__name__=\"libp2p_autorelay_desired_reservations\", instance=\"localhost:5001\", job=\"kubo\"}", + "handlerKey": "max", + "reducerId": "lastNotNull" + } + ] + } + } + ], "type": "gauge" }, { @@ -856,13 +875,13 @@ "list": [] }, "time": { - "from": "now-5m", + "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "libp2p Autorelay", "uid": "deQ_uf-4k", - "version": 2, + "version": 5, "weekStart": "" } \ No newline at end of file From 4fce1260d767eb143abf4b11c6dc723c4d4de5e8 Mon Sep 17 00:00:00 2001 From: Sukun Date: Tue, 21 Mar 2023 22:21:36 +0530 Subject: [PATCH 04/10] improve tracking errors in reservation requests --- dashboards/autorelay/autorelay.json | 56 +++++-------------- p2p/host/autorelay/metrics.go | 47 +++++++++++----- p2p/host/autorelay/metrics_noalloc_test.go | 10 +++- p2p/host/autorelay/relay_finder.go | 6 +- p2p/protocol/circuitv2/client/reservation.go | 52 ++++++++++++++--- .../circuitv2/client/reservation_test.go | 23 ++++++-- 6 files changed, 122 insertions(+), 72 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index dd7ee06ab1..e7a981381c 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -21,7 +21,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "9.3.6" + "version": "9.4.1" }, { "type": "panel", @@ -247,7 +247,7 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.3.6", + "pluginVersion": "9.4.1", "targets": [ { "datasource": { @@ -350,22 +350,7 @@ { "matcher": { "id": "byName", - "options": "refresh reservation: failed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "new reservation: success" + "options": "new: success" }, "properties": [ { @@ -380,28 +365,13 @@ { "matcher": { "id": "byName", - "options": "refresh reservation: success" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "new reservation: failed" + "options": "refresh: success" }, "properties": [ { "id": "color", "value": { - "fixedColor": "orange", + "fixedColor": "super-light-green", "mode": "fixed" } } @@ -440,7 +410,7 @@ "format": "time_series", "instant": false, "interval": "", - "legendFormat": "{{request_type}} reservation: {{outcome}}", + "legendFormat": "{{request_type}}: {{outcome}}", "range": true, "refId": "A" } @@ -506,7 +476,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.3.6", + "pluginVersion": "9.4.1", "targets": [ { "datasource": { @@ -667,7 +637,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.3.6", + "pluginVersion": "9.4.1", "targets": [ { "datasource": { @@ -742,7 +712,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.3.6", + "pluginVersion": "9.4.1", "targets": [ { "datasource": { @@ -868,20 +838,22 @@ "type": "timeseries" } ], - "schemaVersion": 37, + "refresh": "", + "revision": 1, + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { - "from": "now-1h", + "from": "now-5m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "libp2p Autorelay", "uid": "deQ_uf-4k", - "version": 5, + "version": 3, "weekStart": "" } \ No newline at end of file diff --git a/p2p/host/autorelay/metrics.go b/p2p/host/autorelay/metrics.go index a83b2df4b5..8a56c705c1 100644 --- a/p2p/host/autorelay/metrics.go +++ b/p2p/host/autorelay/metrics.go @@ -1,7 +1,11 @@ package autorelay import ( + "errors" + "github.com/libp2p/go-libp2p/p2p/metricshelper" + "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/client" + pbv2 "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/pb" "github.com/prometheus/client_golang/prometheus" ) @@ -104,7 +108,7 @@ type MetricsTracer interface { RelayFinderStatus(isActive bool) ReservationEnded() - ReservationRequestFinished(isRefresh bool, success bool) + ReservationRequestFinished(isRefresh bool, err error) RelayAddressCount(int) RelayAddressUpdated() @@ -146,9 +150,7 @@ func NewMetricsTracer(opts ...MetricsTracerOption) MetricsTracer { // Initialise these counters to 0 otherwise the first reservation requests aren't handled // correctly when using promql increse function reservationRequestsOutcomeTotal.WithLabelValues("refresh", "success") - reservationRequestsOutcomeTotal.WithLabelValues("refresh", "failed") reservationRequestsOutcomeTotal.WithLabelValues("new", "success") - reservationRequestsOutcomeTotal.WithLabelValues("new", "failed") candidatesCircuitV2SupportTotal.WithLabelValues("yes") candidatesCircuitV2SupportTotal.WithLabelValues("no") return &metricsTracer{} @@ -166,7 +168,7 @@ func (mt *metricsTracer) ReservationEnded() { reservationsClosedTotal.Inc() } -func (mt *metricsTracer) ReservationRequestFinished(isRefresh bool, success bool) { +func (mt *metricsTracer) ReservationRequestFinished(isRefresh bool, err error) { tags := metricshelper.GetStringSlice() defer metricshelper.PutStringSlice(tags) @@ -175,15 +177,10 @@ func (mt *metricsTracer) ReservationRequestFinished(isRefresh bool, success bool } else { *tags = append(*tags, "new") } - - if success { - *tags = append(*tags, "success") - } else { - *tags = append(*tags, "failed") - } + *tags = append(*tags, getReservationRequestStatus(err)) reservationRequestsOutcomeTotal.WithLabelValues(*tags...).Inc() - if !isRefresh && success { + if !isRefresh && err == nil { reservationsOpenedTotal.Inc() } } @@ -245,6 +242,30 @@ func (mt *metricsTracer) DesiredReservations(cnt int) { desiredReservations.Set(float64(cnt)) } +func getReservationRequestStatus(err error) string { + if err == nil { + return "success" + } + + status := "err other" + var re client.ReservationError + if errors.As(err, &re) { + switch re.Status { + case pbv2.Status_CONNECTION_FAILED: + return "connection failed" + case pbv2.Status_MALFORMED_MESSAGE: + return "malformed message" + case pbv2.Status_RESERVATION_REFUSED: + return "reservation refused" + case pbv2.Status_PERMISSION_DENIED: + return "permission denied" + case pbv2.Status_RESOURCE_LIMIT_EXCEEDED: + return "resource limit exceeded" + } + } + return status +} + // wrappedMetricsTracer wraps MetricsTracer and ignores all calls when mt is nil type wrappedMetricsTracer struct { mt MetricsTracer @@ -264,9 +285,9 @@ func (mt *wrappedMetricsTracer) ReservationEnded() { } } -func (mt *wrappedMetricsTracer) ReservationRequestFinished(isRefresh bool, success bool) { +func (mt *wrappedMetricsTracer) ReservationRequestFinished(isRefresh bool, err error) { if mt.mt != nil { - mt.mt.ReservationRequestFinished(isRefresh, success) + mt.mt.ReservationRequestFinished(isRefresh, err) } } diff --git a/p2p/host/autorelay/metrics_noalloc_test.go b/p2p/host/autorelay/metrics_noalloc_test.go index c6abfafff0..de3492094e 100644 --- a/p2p/host/autorelay/metrics_noalloc_test.go +++ b/p2p/host/autorelay/metrics_noalloc_test.go @@ -6,6 +6,9 @@ import ( "math/rand" "testing" "time" + + "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/client" + pbv2 "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/pb" ) func getRandScheduledWork() scheduledWorkTimes { @@ -26,11 +29,16 @@ func TestMetricsNoAllocNoCover(t *testing.T) { for i := 0; i < 10; i++ { scheduledWork = append(scheduledWork, getRandScheduledWork()) } + errs := []error{ + client.ReservationError{Status: pbv2.Status_MALFORMED_MESSAGE}, + client.ReservationError{Status: pbv2.Status_MALFORMED_MESSAGE}, + nil, + } tr := NewMetricsTracer() tests := map[string]func(){ "RelayFinderStatus": func() { tr.RelayFinderStatus(rand.Intn(2) == 1) }, "ReservationEnded": func() { tr.ReservationEnded() }, - "ReservationRequestFinished": func() { tr.ReservationRequestFinished(rand.Intn(2) == 1, rand.Intn(2) == 1) }, + "ReservationRequestFinished": func() { tr.ReservationRequestFinished(rand.Intn(2) == 1, errs[rand.Intn(len(errs))]) }, "RelayAddressCount": func() { tr.RelayAddressCount(rand.Intn(10)) }, "RelayAddressUpdated": func() { tr.RelayAddressUpdated() }, "CandidateChecked": func() { tr.CandidateChecked(rand.Intn(2) == 1) }, diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index 1b69966f85..727ae4bc60 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -549,7 +549,7 @@ func (rf *relayFinder) maybeConnectToRelay(ctx context.Context) { if err != nil { log.Debugw("failed to connect to relay", "peer", id, "error", err) rf.notifyMaybeNeedNewCandidates() - rf.metricsTracer.ReservationRequestFinished(false, false) + rf.metricsTracer.ReservationRequestFinished(false, err) continue } log.Debugw("adding new relay", "id", id) @@ -566,7 +566,7 @@ func (rf *relayFinder) maybeConnectToRelay(ctx context.Context) { default: } - rf.metricsTracer.ReservationRequestFinished(false, true) + rf.metricsTracer.ReservationRequestFinished(false, nil) if numRelays >= rf.conf.desiredRelays { break @@ -621,7 +621,7 @@ func (rf *relayFinder) refreshReservations(ctx context.Context, now time.Time) b p := p g.Go(func() error { err := rf.refreshRelayReservation(ctx, p) - rf.metricsTracer.ReservationRequestFinished(true, err == nil) + rf.metricsTracer.ReservationRequestFinished(true, err) return err }) diff --git a/p2p/protocol/circuitv2/client/reservation.go b/p2p/protocol/circuitv2/client/reservation.go index 1cd451ad13..dbb9241937 100644 --- a/p2p/protocol/circuitv2/client/reservation.go +++ b/p2p/protocol/circuitv2/client/reservation.go @@ -37,6 +37,27 @@ type Reservation struct { Voucher *proto.ReservationVoucher } +// ReservationError is the error returned on failure to reserve a slot in the relay +type ReservationError struct { + + // Status is the status returned by the relay for rejecting the reservation + // request. It is set to pbv2.Status_CONNECTION_FAILED on other failures + Status pbv2.Status + + // Reason is the reason for reservation failure + Reason string + + err error +} + +func (re ReservationError) Error() string { + return fmt.Sprintf("reservation error: status: %s reason: %s err: %s", pbv2.Status_name[int32(re.Status)], re.Reason, re.err) +} + +func (re ReservationError) Unwrap() error { + return re.err +} + // Reserve reserves a slot in a relay and returns the reservation information. // Clients must reserve slots in order for the relay to relay connections to them. func Reserve(ctx context.Context, h host.Host, ai peer.AddrInfo) (*Reservation, error) { @@ -46,7 +67,7 @@ func Reserve(ctx context.Context, h host.Host, ai peer.AddrInfo) (*Reservation, s, err := h.NewStream(ctx, ai.ID, proto.ProtoIDv2Hop) if err != nil { - return nil, err + return nil, ReservationError{Status: pbv2.Status_CONNECTION_FAILED, Reason: "failed to open stream", err: err} } defer s.Close() @@ -61,33 +82,39 @@ func Reserve(ctx context.Context, h host.Host, ai peer.AddrInfo) (*Reservation, if err := wr.WriteMsg(&msg); err != nil { s.Reset() - return nil, fmt.Errorf("error writing reservation message: %w", err) + return nil, ReservationError{Status: pbv2.Status_CONNECTION_FAILED, Reason: "error writing reservation message", err: err} } msg.Reset() if err := rd.ReadMsg(&msg); err != nil { s.Reset() - return nil, fmt.Errorf("error reading reservation response message: %w", err) + return nil, ReservationError{Status: pbv2.Status_CONNECTION_FAILED, Reason: "error reading reservation response message: %w", err: err} } if msg.GetType() != pbv2.HopMessage_STATUS { - return nil, fmt.Errorf("unexpected relay response: not a status message (%d)", msg.GetType()) + return nil, ReservationError{ + Status: pbv2.Status_MALFORMED_MESSAGE, + Reason: fmt.Sprintf("unexpected relay response: not a status message (%d)", msg.GetType()), + err: err} } if status := msg.GetStatus(); status != pbv2.Status_OK { - return nil, fmt.Errorf("reservation failed: %s (%d)", pbv2.Status_name[int32(status)], status) + return nil, ReservationError{Status: msg.GetStatus(), Reason: "reservation failed"} } rsvp := msg.GetReservation() if rsvp == nil { - return nil, fmt.Errorf("missing reservation info") + return nil, ReservationError{Status: pbv2.Status_MALFORMED_MESSAGE, Reason: "missing reservation info"} } result := &Reservation{} result.Expiration = time.Unix(int64(rsvp.GetExpire()), 0) if result.Expiration.Before(time.Now()) { - return nil, fmt.Errorf("received reservation with expiration date in the past: %s", result.Expiration) + return nil, ReservationError{ + Status: pbv2.Status_MALFORMED_MESSAGE, + Reason: fmt.Sprintf("received reservation with expiration date in the past: %s", result.Expiration), + } } addrs := rsvp.GetAddrs() @@ -105,12 +132,19 @@ func Reserve(ctx context.Context, h host.Host, ai peer.AddrInfo) (*Reservation, if voucherBytes != nil { _, rec, err := record.ConsumeEnvelope(voucherBytes, proto.RecordDomain) if err != nil { - return nil, fmt.Errorf("error consuming voucher envelope: %w", err) + return nil, ReservationError{ + Status: pbv2.Status_MALFORMED_MESSAGE, + Reason: fmt.Sprintf("error consuming voucher envelope: %s", err), + err: err, + } } voucher, ok := rec.(*proto.ReservationVoucher) if !ok { - return nil, fmt.Errorf("unexpected voucher record type: %+T", rec) + return nil, ReservationError{ + Status: pbv2.Status_MALFORMED_MESSAGE, + Reason: fmt.Sprintf("unexpected voucher record type: %+T", rec), + } } result.Voucher = voucher } diff --git a/p2p/protocol/circuitv2/client/reservation_test.go b/p2p/protocol/circuitv2/client/reservation_test.go index 31e2865448..decb3e71de 100644 --- a/p2p/protocol/circuitv2/client/reservation_test.go +++ b/p2p/protocol/circuitv2/client/reservation_test.go @@ -2,6 +2,7 @@ package client_test import ( "context" + "errors" "math" "testing" "time" @@ -22,6 +23,7 @@ func TestReservationFailures(t *testing.T) { name string streamHandler network.StreamHandler err string + status pbv2.Status } testcases := []testcase{ { @@ -36,7 +38,8 @@ func TestReservationFailures(t *testing.T) { Type: pbv2.HopMessage_RESERVE.Enum(), }) }, - err: "unexpected relay response: not a status message", + err: "unexpected relay response: not a status message", + status: pbv2.Status_MALFORMED_MESSAGE, }, { name: "unknown status", @@ -47,7 +50,8 @@ func TestReservationFailures(t *testing.T) { Status: &status, }) }, - err: "reservation failed", + err: "reservation failed", + status: pbv2.Status(1337), }, { name: "invalid time", @@ -60,7 +64,8 @@ func TestReservationFailures(t *testing.T) { Reservation: &pbv2.Reservation{Expire: &expire}, }) }, - err: "received reservation with expiration date in the past", + err: "received reservation with expiration date in the past", + status: pbv2.Status_MALFORMED_MESSAGE, }, { name: "invalid voucher", @@ -76,7 +81,8 @@ func TestReservationFailures(t *testing.T) { }, }) }, - err: "error consuming voucher envelope: failed when unmarshalling the envelope", + err: "error consuming voucher envelope: failed when unmarshalling the envelope", + status: pbv2.Status_MALFORMED_MESSAGE, }, } @@ -98,6 +104,15 @@ func TestReservationFailures(t *testing.T) { } else { require.Error(t, err) require.Contains(t, err.Error(), tc.err) + if tc.status != 0 { + var re client.ReservationError + if !errors.As(err, &re) { + t.Errorf("expected error to be of type %T", re) + } + if re.Status != tc.status { + t.Errorf("expected status %d got %d", tc.status, re.Status) + } + } } }) } From 9f17ffb651372da0b99a9e8746f31b24c76aba91 Mon Sep 17 00:00:00 2001 From: Sukun Date: Wed, 22 Mar 2023 15:15:08 +0530 Subject: [PATCH 05/10] fix config-query in grafana --- dashboards/autorelay/autorelay.json | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index e7a981381c..3a20f99e54 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -216,6 +216,7 @@ "mode": "thresholds" }, "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -224,7 +225,8 @@ "value": null } ] - } + }, + "unit": "none" }, "overrides": [] }, @@ -255,9 +257,11 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", + "exemplar": false, "expr": "libp2p_autorelay_reservations_opened_total - libp2p_autorelay_reservations_closed_total", + "instant": true, "legendFormat": "current reservations", - "range": true, + "range": false, "refId": "A" }, { @@ -266,10 +270,13 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", + "exemplar": false, "expr": "libp2p_autorelay_desired_reservations", + "format": "time_series", "hide": false, - "legendFormat": "__auto", - "range": true, + "instant": true, + "legendFormat": "desired", + "range": false, "refId": "config-query" } ], @@ -285,9 +292,12 @@ "configRefId": "config-query", "mappings": [ { - "fieldName": "{__name__=\"libp2p_autorelay_desired_reservations\", instance=\"localhost:5001\", job=\"kubo\"}", - "handlerKey": "max", - "reducerId": "lastNotNull" + "fieldName": "desired", + "handlerKey": "max" + }, + { + "fieldName": "Time", + "handlerKey": "__ignore" } ] } @@ -847,7 +857,7 @@ "list": [] }, "time": { - "from": "now-5m", + "from": "now-1m", "to": "now" }, "timepicker": {}, From bd3f1f692ab781c86bdb875489565d9207cd5c5a Mon Sep 17 00:00:00 2001 From: Sukun Date: Wed, 22 Mar 2023 22:15:11 +0530 Subject: [PATCH 06/10] add candidate loop state panel --- dashboards/autorelay/autorelay.json | 220 ++++++++++++++++++--- p2p/host/autorelay/metrics.go | 28 +++ p2p/host/autorelay/metrics_noalloc_test.go | 2 + p2p/host/autorelay/relay_finder.go | 10 + 4 files changed, 229 insertions(+), 31 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index 3a20f99e54..77be144342 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -91,19 +91,6 @@ "x": 0, "y": 0 }, - "id": 24, - "panels": [], - "title": "Scheduled Work", - "type": "row" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 1 - }, "id": 16, "panels": [], "title": "Status", @@ -157,7 +144,7 @@ "h": 5, "w": 24, "x": 0, - "y": 2 + "y": 1 }, "id": 2, "options": { @@ -197,7 +184,7 @@ "h": 1, "w": 24, "x": 0, - "y": 7 + "y": 6 }, "id": 18, "panels": [], @@ -234,7 +221,7 @@ "h": 8, "w": 6, "x": 0, - "y": 8 + "y": 7 }, "id": 4, "options": { @@ -393,7 +380,7 @@ "h": 8, "w": 18, "x": 6, - "y": 8 + "y": 7 }, "id": 10, "options": { @@ -434,13 +421,107 @@ "h": 1, "w": 24, "x": 0, - "y": 16 + "y": 15 }, "id": 20, "panels": [], "title": "Candidates", "type": "row" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "purple", + "index": 0, + "text": "peer source rate limited" + }, + "1": { + "color": "blue", + "index": 1, + "text": "waiting on peer chan" + }, + "2": { + "color": "green", + "index": 2, + "text": "waiting for trigger" + }, + "3": { + "color": "light-yellow", + "index": 3, + "text": "stopped" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 28, + "options": { + "alignValue": "center", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "libp2p_autorelay_candidate_loop_state", + "instant": false, + "legendFormat": "state", + "range": true, + "refId": "A" + } + ], + "title": "Candidate Loop State", + "type": "state-timeline" + }, { "datasource": { "type": "prometheus", @@ -467,9 +548,9 @@ }, "gridPos": { "h": 8, - "w": 6, - "x": 2, - "y": 17 + "w": 5, + "x": 1, + "y": 22 }, "id": 6, "options": { @@ -557,9 +638,9 @@ }, "gridPos": { "h": 8, - "w": 6, - "x": 8, - "y": 17 + "w": 5, + "x": 6, + "y": 22 }, "id": 12, "options": { @@ -629,8 +710,8 @@ "gridPos": { "h": 8, "w": 6, - "x": 14, - "y": 17 + "x": 11, + "y": 22 }, "id": 26, "options": { @@ -664,13 +745,90 @@ "title": "Next Old Candidate Check", "type": "stat" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "result": { + "color": "blue", + "index": 1, + "text": "immediately" + }, + "to": 0 + }, + "type": "range" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 17, + "y": 22 + }, + "id": 30, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "round(libp2p_autorelay_scheduled_work_time{work_type=\"allowed peer source call\"} - time()) ", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Next Allowed Call to peer source", + "type": "stat" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 25 + "y": 30 }, "id": 22, "panels": [], @@ -705,7 +863,7 @@ "h": 8, "w": 6, "x": 0, - "y": 26 + "y": 31 }, "id": 8, "options": { @@ -816,7 +974,7 @@ "h": 8, "w": 18, "x": 6, - "y": 26 + "y": 31 }, "id": 14, "options": { @@ -857,13 +1015,13 @@ "list": [] }, "time": { - "from": "now-1m", + "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "libp2p Autorelay", "uid": "deQ_uf-4k", - "version": 3, + "version": 9, "weekStart": "" } \ No newline at end of file diff --git a/p2p/host/autorelay/metrics.go b/p2p/host/autorelay/metrics.go index 8a56c705c1..52da331884 100644 --- a/p2p/host/autorelay/metrics.go +++ b/p2p/host/autorelay/metrics.go @@ -71,6 +71,13 @@ var ( }, []string{"type"}, ) + candLoopState = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "candidate_loop_state", + Help: "Candidate Loop State", + }, + ) scheduledWorkTime = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -98,11 +105,21 @@ var ( relayAddressesCount, candidatesCircuitV2SupportTotal, candidatesTotal, + candLoopState, scheduledWorkTime, desiredReservations, } ) +type candidateLoopState int + +const ( + peerSourceRateLimited candidateLoopState = iota + waitingOnPeerChan + waitingForTrigger + stopped +) + // MetricsTracer is the interface for tracking metrics for autorelay type MetricsTracer interface { RelayFinderStatus(isActive bool) @@ -116,6 +133,7 @@ type MetricsTracer interface { CandidateChecked(supportsCircuitV2 bool) CandidateAdded() CandidateRemoved() + CandidateLoopState(state candidateLoopState) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) @@ -218,6 +236,10 @@ func (mt *metricsTracer) CandidateRemoved() { candidatesTotal.WithLabelValues(*tags...).Inc() } +func (mt *metricsTracer) CandidateLoopState(state candidateLoopState) { + candLoopState.Set(float64(state)) +} + func (mt *metricsTracer) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) { tags := metricshelper.GetStringSlice() defer metricshelper.PutStringSlice(tags) @@ -332,3 +354,9 @@ func (mt *wrappedMetricsTracer) DesiredReservations(cnt int) { mt.mt.DesiredReservations(cnt) } } + +func (mt *wrappedMetricsTracer) CandidateLoopState(state candidateLoopState) { + if mt.mt != nil { + mt.mt.CandidateLoopState(state) + } +} diff --git a/p2p/host/autorelay/metrics_noalloc_test.go b/p2p/host/autorelay/metrics_noalloc_test.go index de3492094e..2a8a7d5461 100644 --- a/p2p/host/autorelay/metrics_noalloc_test.go +++ b/p2p/host/autorelay/metrics_noalloc_test.go @@ -45,6 +45,8 @@ func TestMetricsNoAllocNoCover(t *testing.T) { "CandidateAdded": func() { tr.CandidateAdded() }, "CandidateRemoved": func() { tr.CandidateRemoved() }, "ScheduledWorkUpdated": func() { tr.ScheduledWorkUpdated(&scheduledWork[rand.Intn(len(scheduledWork))]) }, + "DesiredReservations": func() { tr.DesiredReservations(rand.Intn(10)) }, + "CandidateLoopState": func() { tr.CandidateLoopState(candidateLoopState(rand.Intn(10))) }, } for method, f := range tests { allocs := testing.AllocsPerRun(1000, f) diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index 727ae4bc60..4acc9af349 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -291,6 +291,7 @@ func (rf *relayFinder) clearOldCandidates(now time.Time) time.Time { } } else { log.Debugw("deleting candidate due to age", "id", id) + deleted = true rf.removeCandidate(id) } } @@ -337,6 +338,8 @@ func (rf *relayFinder) findNodes(ctx context.Context, peerSourceRateLimiter <-ch rf.candidateMx.Unlock() if peerChan == nil && numCandidates < rf.conf.minCandidates { + rf.metricsTracer.CandidateLoopState(peerSourceRateLimited) + select { case <-peerSourceRateLimiter: peerChan = rf.peerSource(ctx, rf.conf.maxCandidates) @@ -349,6 +352,12 @@ func (rf *relayFinder) findNodes(ctx context.Context, peerSourceRateLimiter <-ch } } + if peerChan == nil { + rf.metricsTracer.CandidateLoopState(waitingForTrigger) + } else { + rf.metricsTracer.CandidateLoopState(waitingOnPeerChan) + } + select { case <-rf.maybeRequestNewCandidates: continue @@ -381,6 +390,7 @@ func (rf *relayFinder) findNodes(ctx context.Context, peerSourceRateLimiter <-ch } }() case <-ctx.Done(): + rf.metricsTracer.CandidateLoopState(stopped) return } } From b78104daa98c367a90389c4c691b5ca095509184 Mon Sep 17 00:00:00 2001 From: sukun Date: Fri, 24 Mar 2023 11:18:32 +0530 Subject: [PATCH 07/10] fix logging --- p2p/host/autorelay/autorelay.go | 6 +++++- p2p/host/autorelay/relay_finder.go | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/p2p/host/autorelay/autorelay.go b/p2p/host/autorelay/autorelay.go index 580cfaa9a0..5900798533 100644 --- a/p2p/host/autorelay/autorelay.go +++ b/p2p/host/autorelay/autorelay.go @@ -2,6 +2,7 @@ package autorelay import ( "context" + "errors" "sync" "github.com/libp2p/go-libp2p/core/event" @@ -83,7 +84,10 @@ func (r *AutoRelay) background() { evt := ev.(event.EvtLocalReachabilityChanged) switch evt.Reachability { case network.ReachabilityPrivate, network.ReachabilityUnknown: - if err := r.relayFinder.Start(); err != nil { + err := r.relayFinder.Start() + if errors.Is(err, errAlreadyRunning) { + log.Debug("tried to start already running relay finder") + } else if err != nil { log.Errorw("failed to start relay finder", "error", err) } else { r.metricsTracer.RelayFinderStatus(true) diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index 4acc9af349..2505063730 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -83,6 +83,8 @@ type relayFinder struct { metricsTracer MetricsTracer } +var errAlreadyRunning = errors.New("relayFinder already running") + func newRelayFinder(host *basic.BasicHost, peerSource PeerSource, conf *config) *relayFinder { if peerSource == nil { panic("Can not create a new relayFinder. Need a Peer Source fn or a list of static relays. Refer to the documentation around `libp2p.EnableAutoRelay`") @@ -754,7 +756,7 @@ func (rf *relayFinder) Start() error { rf.ctxCancelMx.Lock() defer rf.ctxCancelMx.Unlock() if rf.ctxCancel != nil { - return errors.New("relayFinder already running") + return errAlreadyRunning } log.Debug("starting relay finder") ctx, cancel := context.WithCancel(context.Background()) From 43d05407dbc35b76f01adebe62e365899df66104 Mon Sep 17 00:00:00 2001 From: Sukun Date: Wed, 5 Apr 2023 14:20:48 +0530 Subject: [PATCH 08/10] reset metrics on relayfinder stop --- p2p/host/autorelay/metrics.go | 41 ++++++++++++++-------- p2p/host/autorelay/metrics_noalloc_test.go | 7 ++-- p2p/host/autorelay/relay_finder.go | 40 +++++++++++++++++---- 3 files changed, 64 insertions(+), 24 deletions(-) diff --git a/p2p/host/autorelay/metrics.go b/p2p/host/autorelay/metrics.go index 52da331884..8028655b01 100644 --- a/p2p/host/autorelay/metrics.go +++ b/p2p/host/autorelay/metrics.go @@ -124,15 +124,16 @@ const ( type MetricsTracer interface { RelayFinderStatus(isActive bool) - ReservationEnded() + ReservationEnded(cnt int) + ReservationOpened(cnt int) ReservationRequestFinished(isRefresh bool, err error) RelayAddressCount(int) RelayAddressUpdated() CandidateChecked(supportsCircuitV2 bool) - CandidateAdded() - CandidateRemoved() + CandidateAdded(cnt int) + CandidateRemoved(cnt int) CandidateLoopState(state candidateLoopState) ScheduledWorkUpdated(scheduledWork *scheduledWorkTimes) @@ -182,8 +183,12 @@ func (mt *metricsTracer) RelayFinderStatus(isActive bool) { } } -func (mt *metricsTracer) ReservationEnded() { - reservationsClosedTotal.Inc() +func (mt *metricsTracer) ReservationEnded(cnt int) { + reservationsClosedTotal.Add(float64(cnt)) +} + +func (mt *metricsTracer) ReservationOpened(cnt int) { + reservationsOpenedTotal.Add(float64(cnt)) } func (mt *metricsTracer) ReservationRequestFinished(isRefresh bool, err error) { @@ -222,18 +227,18 @@ func (mt *metricsTracer) CandidateChecked(supportsCircuitV2 bool) { candidatesCircuitV2SupportTotal.WithLabelValues(*tags...).Inc() } -func (mt *metricsTracer) CandidateAdded() { +func (mt *metricsTracer) CandidateAdded(cnt int) { tags := metricshelper.GetStringSlice() defer metricshelper.PutStringSlice(tags) *tags = append(*tags, "added") - candidatesTotal.WithLabelValues(*tags...).Inc() + candidatesTotal.WithLabelValues(*tags...).Add(float64(cnt)) } -func (mt *metricsTracer) CandidateRemoved() { +func (mt *metricsTracer) CandidateRemoved(cnt int) { tags := metricshelper.GetStringSlice() defer metricshelper.PutStringSlice(tags) *tags = append(*tags, "removed") - candidatesTotal.WithLabelValues(*tags...).Inc() + candidatesTotal.WithLabelValues(*tags...).Add(float64(cnt)) } func (mt *metricsTracer) CandidateLoopState(state candidateLoopState) { @@ -301,9 +306,15 @@ func (mt *wrappedMetricsTracer) RelayFinderStatus(isActive bool) { } } -func (mt *wrappedMetricsTracer) ReservationEnded() { +func (mt *wrappedMetricsTracer) ReservationEnded(cnt int) { + if mt.mt != nil { + mt.mt.ReservationEnded(cnt) + } +} + +func (mt *wrappedMetricsTracer) ReservationOpened(cnt int) { if mt.mt != nil { - mt.mt.ReservationEnded() + mt.mt.ReservationOpened(cnt) } } @@ -331,15 +342,15 @@ func (mt *wrappedMetricsTracer) CandidateChecked(supportsCircuitV2 bool) { } } -func (mt *wrappedMetricsTracer) CandidateAdded() { +func (mt *wrappedMetricsTracer) CandidateAdded(cnt int) { if mt.mt != nil { - mt.mt.CandidateAdded() + mt.mt.CandidateAdded(cnt) } } -func (mt *wrappedMetricsTracer) CandidateRemoved() { +func (mt *wrappedMetricsTracer) CandidateRemoved(cnt int) { if mt.mt != nil { - mt.mt.CandidateRemoved() + mt.mt.CandidateRemoved(cnt) } } diff --git a/p2p/host/autorelay/metrics_noalloc_test.go b/p2p/host/autorelay/metrics_noalloc_test.go index 2a8a7d5461..a44cb6e459 100644 --- a/p2p/host/autorelay/metrics_noalloc_test.go +++ b/p2p/host/autorelay/metrics_noalloc_test.go @@ -37,13 +37,14 @@ func TestMetricsNoAllocNoCover(t *testing.T) { tr := NewMetricsTracer() tests := map[string]func(){ "RelayFinderStatus": func() { tr.RelayFinderStatus(rand.Intn(2) == 1) }, - "ReservationEnded": func() { tr.ReservationEnded() }, + "ReservationEnded": func() { tr.ReservationEnded(rand.Intn(10)) }, "ReservationRequestFinished": func() { tr.ReservationRequestFinished(rand.Intn(2) == 1, errs[rand.Intn(len(errs))]) }, "RelayAddressCount": func() { tr.RelayAddressCount(rand.Intn(10)) }, "RelayAddressUpdated": func() { tr.RelayAddressUpdated() }, + "ReservationOpened": func() { tr.ReservationOpened(rand.Intn(10)) }, "CandidateChecked": func() { tr.CandidateChecked(rand.Intn(2) == 1) }, - "CandidateAdded": func() { tr.CandidateAdded() }, - "CandidateRemoved": func() { tr.CandidateRemoved() }, + "CandidateAdded": func() { tr.CandidateAdded(rand.Intn(10)) }, + "CandidateRemoved": func() { tr.CandidateRemoved(rand.Intn(10)) }, "ScheduledWorkUpdated": func() { tr.ScheduledWorkUpdated(&scheduledWork[rand.Intn(len(scheduledWork))]) }, "DesiredReservations": func() { tr.DesiredReservations(rand.Intn(10)) }, "CandidateLoopState": func() { tr.CandidateLoopState(candidateLoopState(rand.Intn(10))) }, diff --git a/p2p/host/autorelay/relay_finder.go b/p2p/host/autorelay/relay_finder.go index 2505063730..3133b7a51b 100644 --- a/p2p/host/autorelay/relay_finder.go +++ b/p2p/host/autorelay/relay_finder.go @@ -140,8 +140,6 @@ func (rf *relayFinder) background(ctx context.Context) { bootDelayTimer := rf.conf.clock.InstantTimer(now.Add(rf.conf.bootDelay)) defer bootDelayTimer.Stop() - rf.metricsTracer.DesiredReservations(rf.conf.desiredRelays) - // This is the least frequent event. It's our fallback timer if we don't have any other work to do. leastFrequentInterval := rf.conf.minInterval // Check if leastFrequentInterval is 0 to avoid busy looping @@ -190,7 +188,7 @@ func (rf *relayFinder) background(ctx context.Context) { if push { rf.clearCachedAddrsAndSignalAddressChange() - rf.metricsTracer.ReservationEnded() + rf.metricsTracer.ReservationEnded(1) } case <-rf.candidateFound: rf.notifyMaybeConnectToRelay() @@ -656,7 +654,7 @@ func (rf *relayFinder) refreshRelayReservation(ctx context.Context, p peer.ID) e rf.host.ConnManager().Unprotect(p, autorelayTag) rf.relayMx.Unlock() if exists { - rf.metricsTracer.ReservationEnded() + rf.metricsTracer.ReservationEnded(1) } return err } @@ -678,7 +676,7 @@ func (rf *relayFinder) addCandidate(cand *candidate) { _, exists := rf.candidates[cand.ai.ID] rf.candidates[cand.ai.ID] = cand if !exists { - rf.metricsTracer.CandidateAdded() + rf.metricsTracer.CandidateAdded(1) } } @@ -686,7 +684,7 @@ func (rf *relayFinder) removeCandidate(id peer.ID) { _, exists := rf.candidates[id] if exists { delete(rf.candidates, id) - rf.metricsTracer.CandidateRemoved() + rf.metricsTracer.CandidateRemoved(1) } } @@ -759,6 +757,9 @@ func (rf *relayFinder) Start() error { return errAlreadyRunning } log.Debug("starting relay finder") + + rf.initMetrics() + ctx, cancel := context.WithCancel(context.Background()) rf.ctxCancel = cancel rf.refCount.Add(1) @@ -778,5 +779,32 @@ func (rf *relayFinder) Stop() error { } rf.refCount.Wait() rf.ctxCancel = nil + + rf.resetMetrics() return nil } + +func (rf *relayFinder) initMetrics() { + rf.metricsTracer.DesiredReservations(rf.conf.desiredRelays) + + rf.relayMx.Lock() + rf.metricsTracer.ReservationOpened(len(rf.relays)) + rf.relayMx.Unlock() + + rf.candidateMx.Lock() + rf.metricsTracer.CandidateAdded(len(rf.candidates)) + rf.candidateMx.Unlock() +} + +func (rf *relayFinder) resetMetrics() { + rf.relayMx.Lock() + rf.metricsTracer.ReservationEnded(len(rf.relays)) + rf.relayMx.Unlock() + + rf.candidateMx.Lock() + rf.metricsTracer.CandidateRemoved(len(rf.candidates)) + rf.candidateMx.Unlock() + + rf.metricsTracer.RelayAddressCount(0) + rf.metricsTracer.ScheduledWorkUpdated(&scheduledWorkTimes{}) +} From d5f9c75c90ac1c886c6bd4f7969f300867f533b0 Mon Sep 17 00:00:00 2001 From: Sukun Date: Wed, 5 Apr 2023 14:40:09 +0530 Subject: [PATCH 09/10] update dashboard --- dashboards/autorelay/autorelay.json | 59 +++++++++++++++++++---------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index 77be144342..be2b3aadd7 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -21,7 +21,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "9.4.1" + "version": "9.4.7" }, { "type": "panel", @@ -236,7 +236,7 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.4.1", + "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -567,7 +567,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.1", + "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -689,17 +689,25 @@ "fixedColor": "blue", "mode": "fixed" }, - "mappings": [], + "mappings": [ + { + "options": { + "from": null, + "result": { + "index": 0, + "text": "-" + }, + "to": 0 + }, + "type": "range" + } + ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, @@ -716,9 +724,9 @@ "id": 26, "options": { "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -728,7 +736,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.1", + "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -736,9 +744,11 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", + "exemplar": false, "expr": "libp2p_autorelay_scheduled_work_time{work_type=\"old candidate check\"} * 1000\n", + "instant": true, "legendFormat": "__auto", - "range": true, + "range": false, "refId": "A" } ], @@ -753,20 +763,31 @@ "fieldConfig": { "defaults": { "color": { - "fixedColor": "purple", + "fixedColor": "blue", "mode": "fixed" }, "mappings": [ { "options": { + "from": -86400, "result": { - "color": "blue", - "index": 1, + "index": 0, "text": "immediately" }, "to": 0 }, "type": "range" + }, + { + "options": { + "from": null, + "result": { + "index": 1, + "text": "-" + }, + "to": -86400 + }, + "type": "range" } ], "thresholds": { @@ -803,7 +824,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.1", + "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -880,7 +901,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.1", + "pluginVersion": "9.4.7", "targets": [ { "datasource": { @@ -1022,6 +1043,6 @@ "timezone": "", "title": "libp2p Autorelay", "uid": "deQ_uf-4k", - "version": 9, + "version": 14, "weekStart": "" } \ No newline at end of file From 9da1df8f16f5826c44ad0991c2cc0847cfb73eba Mon Sep 17 00:00:00 2001 From: Sukun Date: Wed, 5 Apr 2023 15:00:26 +0530 Subject: [PATCH 10/10] update dashboard --- dashboards/autorelay/autorelay.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dashboards/autorelay/autorelay.json b/dashboards/autorelay/autorelay.json index be2b3aadd7..26dbcb80e2 100644 --- a/dashboards/autorelay/autorelay.json +++ b/dashboards/autorelay/autorelay.json @@ -692,12 +692,12 @@ "mappings": [ { "options": { - "from": null, + "from": -9223372036854776000, "result": { "index": 0, "text": "-" }, - "to": 0 + "to": -86400 }, "type": "range" } @@ -769,7 +769,7 @@ "mappings": [ { "options": { - "from": -86400, + "from": -86399, "result": { "index": 0, "text": "immediately" @@ -780,7 +780,7 @@ }, { "options": { - "from": null, + "from": -9223372036854776000, "result": { "index": 1, "text": "-" @@ -1043,6 +1043,6 @@ "timezone": "", "title": "libp2p Autorelay", "uid": "deQ_uf-4k", - "version": 14, + "version": 6, "weekStart": "" } \ No newline at end of file