Skip to content

Commit

Permalink
Add some simple device metrics to dashboard (#220)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlCutter committed Sep 19, 2024
1 parent bda5cb7 commit 199c96c
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 1 deletion.
1 change: 1 addition & 0 deletions deployment/live/monitoring/ci/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ inputs = merge(
include.root.locals,
{
alert_lt_num_witness_threshold = 0
num_expected_devices = 5
}
)

1 change: 1 addition & 0 deletions deployment/live/monitoring/dev/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ inputs = merge(
include.root.locals,
{
alert_lt_num_witness_threshold = 0
num_expected_devices = 2
}
)

1 change: 1 addition & 0 deletions deployment/live/monitoring/prod/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ inputs = merge(
{
alert_lt_num_witness_threshold = 10
alert_enable_num_witness = false
num_expected_devices = 15
}
)

35 changes: 35 additions & 0 deletions deployment/modules/monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,41 @@ resource "google_monitoring_dashboard" "witness_dashboard" {
}
}
},
{
  "title": "Devices seen online",
  "xyChart": {
    "dataSets": [{
      "timeSeriesQuery": {
        "prometheusQuery": "count by (witness_id) (max by (instance_id, witness_id) (rate(distributor_update_checkpoint_request{configuration_name='distributor-service-${var.env}'}[$${__interval}]) > 0))"
      },
      "plotType": "STACKED_AREA"
    }],
    "timeshiftDuration": "0s",
    "yAxis": {
      "label": "Devices",
      "scale": "LINEAR"
    }
  }
},
{
  "title": "% online (assuming ${var.num_expected_devices} devices)",
  "xyChart": {
    "dataSets": [{
      "timeSeriesQuery": {
        "prometheusQuery": "count by (instance_id) (max by (instance_id, witness_id) (rate(distributor_update_checkpoint_request{configuration_name='distributor-service-${var.env}'}[$${__interval}]) > 0)) * 100 / ${var.num_expected_devices}"
      },
      "plotType": "STACKED_AREA"
    }],
    "thresholds": [{
      "value": 51
    }],
    "timeshiftDuration": "0s",
    "yAxis": {
      "label": "%",
      "scale": "LINEAR"
    }
  }
},
{
"title": "Witness liveness alert chart",
"alertChart": {
Expand Down
7 changes: 6 additions & 1 deletion deployment/modules/monitoring/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ variable "env" {
type = string
}

# Number of devices expected to be online in this environment.
# The monitoring dashboard interpolates this value into a chart title and uses
# it as the divisor of a PromQL expression that charts the percentage of
# devices online, so it must be strictly positive.
variable "num_expected_devices" {
  description = "Number of devices expected to be online in this environment; used as the denominator of the dashboard's '% online' chart."
  type        = number

  # Reject zero and negative values up front: they would make the
  # percentage query divide by zero or chart a negative percentage.
  validation {
    condition     = var.num_expected_devices > 0
    error_message = "The num_expected_devices value must be greater than zero."
  }
}

variable "alert_lt_num_witness_threshold" {
description = "The lower bound alert threshold for the number of live witnesses, as measured by the distributor_update_checkpoint_success Prometheus metric."
type = number
Expand All @@ -37,4 +42,4 @@ variable "alert_lt_num_witness_threshold" {
variable "alert_enable_num_witness" {
description = "Whether to enable alert_lt_num_witness_threshold."
type = bool
}
}

0 comments on commit 199c96c

Please sign in to comment.