Skip to content

Commit

Permalink
Merge pull request #9 from icey-yu/prometheus
Browse files Browse the repository at this point in the history
feat: prometheus
  • Loading branch information
withchao committed Jul 15, 2024
2 parents 6a6c712 + 4738d17 commit 14c486b
Show file tree
Hide file tree
Showing 15 changed files with 315 additions and 118 deletions.
3 changes: 3 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ ZOOKEEPER_IMAGE=bitnami/zookeeper:3.8
KAFKA_IMAGE=bitnami/kafka:3.5.1
MINIO_IMAGE=minio/minio:RELEASE.2024-01-11T07-46-16Z
ETCD_IMAGE=quay.io/coreos/etcd:v3.5.13
PROMETHEUS_IMAGE=prom/prometheus:v2.45.6
ALERTMANAGER_IMAGE=prom/alertmanager:v0.27.0
GRAFANA_IMAGE=grafana/grafana:11.0.1

OPENIM_WEB_FRONT_IMAGE=openim/openim-web-front:release-v3.5.1
OPENIM_ADMIN_FRONT_IMAGE=openim/openim-admin-front:release-v1.7
Expand Down
4 changes: 0 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,7 @@ deployments/charts/generated-configs/
### OpenIM Config ###
.env
config/config.yaml
config/alertmanager.yml
config/prometheus.yml
config/email.tmpl
config/notification.yaml
config/instance-down-rules.yml

### OpenIM deploy ###
deployments/openim-server/charts
Expand Down
25 changes: 25 additions & 0 deletions config/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
global:
resolve_timeout: 5m
smtp_from: alert@openim.io
smtp_smarthost: smtp.163.com:465
smtp_auth_username: alert@openim.io
smtp_auth_password: YOURAUTHPASSWORD
smtp_require_tls: false
smtp_hello: xxx

templates:
- /etc/alertmanager/email.tmpl

route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: email
receivers:
- name: email
email_configs:
- to: 'alert@example.com'
html: '{{ template "email.to.html" . }}'
headers: { Subject: "[OPENIM-SERVER]Alarm" }
send_resolved: true
16 changes: 16 additions & 0 deletions config/email.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{{ define "email.to.html" }}
{{ range .Alerts }}
<!-- Begin of OpenIM Alert -->
<div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
<h3>OpenIM Alert</h3>
<p><strong>Alert Program:</strong> Prometheus Alert</p>
<p><strong>Severity Level:</strong> {{ .Labels.severity }}</p>
<p><strong>Alert Type:</strong> {{ .Labels.alertname }}</p>
<p><strong>Affected Host:</strong> {{ .Labels.instance }}</p>
<p><strong>Affected Service:</strong> {{ .Labels.job }}</p>
<p><strong>Alert Subject:</strong> {{ .Annotations.summary }}</p>
<p><strong>Trigger Time:</strong> {{ .StartsAt.Format "2006-01-02 15:04:05" }}</p>
</div>
<!-- End of OpenIM Alert -->
{{ end }}
{{ end }}
22 changes: 22 additions & 0 deletions config/instance-down-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
groups:
- name: instance_down
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."

- name: database_insert_failure_alerts
rules:
- alert: DatabaseInsertFailed
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
for: 1m
labels:
severity: critical
annotations:
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
83 changes: 83 additions & 0 deletions config/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['192.168.2.22:19093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "instance-down-rules.yml"
# - "first_rules.yml"
# - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label "job='job_name'"" to any timeseries scraped from this config.
# Monitored information captured by prometheus

# prometheus fetches application services
- job_name: 'node_exporter'
static_configs:
- targets: [ '192.168.2.22:20114' ]
- job_name: 'openimserver-openim-api'
static_configs:
- targets: [ '192.168.2.22:20113' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-msggateway'
static_configs:
- targets: [ '192.168.2.22:20112' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-msgtransfer'
static_configs:
- targets: [ 192.168.2.22:20111, 192.168.2.22:20110, 192.168.2.22:20109, 192.168.2.22:20108 ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-push'
static_configs:
- targets: [ '192.168.2.22:20107' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-auth'
static_configs:
- targets: [ '192.168.2.22:20106' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-conversation'
static_configs:
- targets: [ '192.168.2.22:20105' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-friend'
static_configs:
- targets: [ '192.168.2.22:20104' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-group'
static_configs:
- targets: [ '192.168.2.22:20103' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-msg'
static_configs:
- targets: [ '192.168.2.22:20102' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-third'
static_configs:
- targets: [ '192.168.2.22:20101' ]
labels:
namespace: 'default'
- job_name: 'openimserver-openim-rpc-user'
static_configs:
- targets: [ '192.168.2.22:20100' ]
labels:
namespace: 'default'
45 changes: 45 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,5 +140,50 @@ services:
networks:
- openim

prometheus:
image: ${PROMETHEUS_IMAGE}
container_name: prometheus
restart: always
volumes:
- ./config/prometheus.yml:/etc/prometheus/prometheus.yml
- ./config/instance-down-rules.yml:/etc/prometheus/instance-down-rules.yml
- ${DATA_DIR}/components/prometheus/data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
ports:
- "19091:9090"
networks:
- openim

alertmanager:
image: ${ALERTMANAGER_IMAGE}
container_name: alertmanager
restart: always
volumes:
- ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- ./config/email.tmpl:/etc/alertmanager/email.tmpl
ports:
- "19093:9093"
networks:
- openim

grafana:
image: ${GRAFANA_IMAGE}
container_name: grafana
user: root
restart: always
environment:
- GF_SECURITY_ALLOW_EMBEDDING=true
- GF_SESSION_COOKIE_SAMESITE=none
- GF_SESSION_COOKIE_SECURE=true
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
ports:
- "13000:3000"
volumes:
- ${DATA_DIR:-./}/components/grafana:/var/lib/grafana
networks:
- openim


4 changes: 1 addition & 3 deletions internal/api/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ func Start(ctx context.Context, index int, config *Config) error {
netDone <- struct{}{}
return
}
srv := http.NewServeMux()
srv.Handle(prommetrics.ApiPath, prommetrics.ApiHandler())
if err := http.ListenAndServe(fmt.Sprintf(":%d", prometheusPort), srv); err != nil && err != http.ErrServerClosed {
if err := prommetrics.ApiInit(prometheusPort); err != nil && err != http.ErrServerClosed {
netErr = errs.WrapMsg(err, fmt.Sprintf("api prometheus start err: %d", prometheusPort))
netDone <- struct{}{}
}
Expand Down
15 changes: 3 additions & 12 deletions internal/msgtransfer/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package msgtransfer
import (
"context"
"fmt"
"github.com/openimsdk/open-im-server/v3/pkg/common/prommetrics"
"github.com/openimsdk/open-im-server/v3/pkg/common/storage/cache/redis"
"github.com/openimsdk/open-im-server/v3/pkg/common/storage/database/mgo"
"github.com/openimsdk/tools/db/mongoutil"
Expand All @@ -29,16 +30,12 @@ import (

"github.com/openimsdk/open-im-server/v3/pkg/common/config"
kdisc "github.com/openimsdk/open-im-server/v3/pkg/common/discoveryregister"
"github.com/openimsdk/open-im-server/v3/pkg/common/prommetrics"
"github.com/openimsdk/open-im-server/v3/pkg/common/storage/controller"
"github.com/openimsdk/open-im-server/v3/pkg/rpcclient"
"github.com/openimsdk/tools/errs"
"github.com/openimsdk/tools/log"
"github.com/openimsdk/tools/mw"
"github.com/openimsdk/tools/system/program"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
Expand Down Expand Up @@ -139,14 +136,8 @@ func (m *MsgTransfer) Start(index int, config *Config) error {
netDone <- struct{}{}
return
}
proreg := prometheus.NewRegistry()
proreg.MustRegister(
collectors.NewGoCollector(),
)
proreg.MustRegister(prommetrics.GetGrpcCusMetrics("Transfer", &config.Share)...)
http.Handle("/metrics", promhttp.HandlerFor(proreg, promhttp.HandlerOpts{Registry: proreg}))
err = http.ListenAndServe(fmt.Sprintf(":%d", prometheusPort), nil)
if err != nil && err != http.ErrServerClosed {

if err := prommetrics.TransferInit(prometheusPort); err != nil && err != http.ErrServerClosed {
netErr = errs.WrapMsg(err, "prometheus start error", "prometheusPort", prometheusPort)
netDone <- struct{}{}
}
Expand Down
20 changes: 9 additions & 11 deletions pkg/common/prommetrics/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@ package prommetrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
"strconv"
)

const ApiPath = "/metrics"

var (
apiRegistry = prometheus.NewRegistry()
apiCounter = prometheus.NewCounterVec(
apiCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "api_count",
Help: "Total number of API calls",
Expand All @@ -27,8 +23,14 @@ var (
)
)

func init() {
apiRegistry.MustRegister(apiCounter, httpCounter)
func ApiInit(prometheusPort int) error {
apiRegistry := prometheus.NewRegistry()
cs := append(
baseCollector,
apiCounter,
httpCounter,
)
return Init(apiRegistry, prometheusPort, commonPath, promhttp.HandlerFor(apiRegistry, promhttp.HandlerOpts{}), cs...)
}

func APICall(path string, method string, apiCode int) {
Expand All @@ -44,7 +46,3 @@ func HttpCall(path string, method string, status int) {
// apiRegistry, promhttp.HandlerFor(apiRegistry, promhttp.HandlerOpts{}),
// )
//}

func ApiHandler() http.Handler {
return promhttp.HandlerFor(apiRegistry, promhttp.HandlerOpts{})
}
47 changes: 14 additions & 33 deletions pkg/common/prommetrics/prommetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,24 @@
package prommetrics

import (
gp "github.com/grpc-ecosystem/go-grpc-prometheus"
config2 "github.com/openimsdk/open-im-server/v3/pkg/common/config"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"net/http"
)

func NewGrpcPromObj(cusMetrics []prometheus.Collector) (*prometheus.Registry, *gp.ServerMetrics, error) {
reg := prometheus.NewRegistry()
grpcMetrics := gp.NewServerMetrics()
grpcMetrics.EnableHandlingTimeHistogram()
cusMetrics = append(cusMetrics, grpcMetrics, collectors.NewGoCollector())
reg.MustRegister(cusMetrics...)
return reg, grpcMetrics, nil
}
const commonPath = "/metrics"

func GetGrpcCusMetrics(registerName string, share *config2.Share) []prometheus.Collector {
switch registerName {
case share.RpcRegisterName.MessageGateway:
return []prometheus.Collector{OnlineUserGauge}
case share.RpcRegisterName.Msg:
return []prometheus.Collector{SingleChatMsgProcessSuccessCounter, SingleChatMsgProcessFailedCounter, GroupChatMsgProcessSuccessCounter, GroupChatMsgProcessFailedCounter}
case "Transfer":
return []prometheus.Collector{MsgInsertRedisSuccessCounter, MsgInsertRedisFailedCounter, MsgInsertMongoSuccessCounter, MsgInsertMongoFailedCounter, SeqSetFailedCounter}
case share.RpcRegisterName.Push:
return []prometheus.Collector{MsgOfflinePushFailedCounter}
case share.RpcRegisterName.Auth:
return []prometheus.Collector{UserLoginCounter}
default:
return nil
var (
baseCollector = []prometheus.Collector{
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
collectors.NewGoCollector(),
}
}
)

//func GetGinCusMetrics(name string) []*ginprometheus.Metric {
// switch name {
// case "Api":
// return []*ginprometheus.Metric{ApiCustomCnt}
// default:
// return []*ginprometheus.Metric{ApiCustomCnt}
// }
//}
func Init(registry *prometheus.Registry, prometheusPort int, path string, handler http.Handler, cs ...prometheus.Collector) error {
registry.MustRegister(cs...)
srv := http.NewServeMux()
srv.Handle(path, handler)
return http.ListenAndServe(fmt.Sprintf(":%d", prometheusPort), srv)
}
Loading

0 comments on commit 14c486b

Please sign in to comment.