dragonfly/scheduler/metrics/metrics.go

455 lines
16 KiB
Go

/*
* Copyright 2020 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package metrics
import (
	"net/http"
	"time"

	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"google.golang.org/grpc"

	"d7y.io/dragonfly/v2/pkg/types"
	"d7y.io/dragonfly/v2/scheduler/config"
	"d7y.io/dragonfly/v2/version"
)
// HostTrafficUploadType is the traffic type value that marks upload traffic
// in the host traffic metrics.
var HostTrafficUploadType = "upload"

// HostTrafficDownloadType is the traffic type value that marks download
// traffic in the host traffic metrics.
var HostTrafficDownloadType = "download"
// Prometheus collectors exposed by the scheduler. Every collector below is
// created through promauto and uses types.MetricsNamespace as its namespace
// and types.SchedulerMetricsName as its subsystem. The short descriptions are
// derived from each collector's Name and Help text; where a collector is
// incremented is not visible in this file.
var (
// AnnouncePeerCount counts peer announcements ("announce_peer_total").
AnnouncePeerCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "announce_peer_total",
Help: "Counter of the number of the announcing peer.",
})
// AnnouncePeerFailureCount counts failed peer announcements
// ("announce_peer_failure_total").
AnnouncePeerFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "announce_peer_failure_total",
Help: "Counter of the number of failed of the announcing peer.",
})
// StatPeerCount counts stat peer operations ("stat_peer_total").
StatPeerCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "stat_peer_total",
Help: "Counter of the number of the stat peer.",
})
// StatPeerFailureCount counts failed stat peer operations
// ("stat_peer_failure_total").
StatPeerFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "stat_peer_failure_total",
Help: "Counter of the number of failed of the stat peer.",
})
// LeavePeerCount counts leaving peers ("leave_peer_total").
LeavePeerCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "leave_peer_total",
Help: "Counter of the number of the leaving peer.",
})
// LeavePeerFailureCount counts failures while a peer is leaving
// ("leave_peer_failure_total").
LeavePeerFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "leave_peer_failure_total",
Help: "Counter of the number of failed of the leaving peer.",
})
// ExchangePeerCount counts peer exchanges ("exchange_peer_total").
ExchangePeerCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "exchange_peer_total",
Help: "Counter of the number of the exchanging peer.",
})
// ExchangePeerFailureCount counts failed peer exchanges
// ("exchange_peer_failure_total").
ExchangePeerFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "exchange_peer_failure_total",
Help: "Counter of the number of failed of the exchanging peer.",
})
// RegisterPeerCount counts peer registrations ("register_peer_total"),
// labeled by priority, task_type, task_tag, task_app and host_type.
RegisterPeerCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "register_peer_total",
Help: "Counter of the number of the register peer.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// RegisterPeerFailureCount counts failed peer registrations
// ("register_peer_failure_total"), with the same labels as RegisterPeerCount.
RegisterPeerFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "register_peer_failure_total",
Help: "Counter of the number of failed of the register peer.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerStartedCount counts started peer downloads
// ("download_peer_started_total"), labeled by priority, task_type, task_tag,
// task_app and host_type.
DownloadPeerStartedCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_started_total",
Help: "Counter of the number of the download peer started.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerStartedFailureCount counts failures of the download peer
// started step ("download_peer_started_failure_total"), with the same labels
// as DownloadPeerStartedCount.
DownloadPeerStartedFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_started_failure_total",
Help: "Counter of the number of failed of the download peer started.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerBackToSourceStartedCount counts started back-to-source peer
// downloads ("download_peer_back_to_source_started_total"), labeled by
// priority, task_type, task_tag, task_app and host_type.
DownloadPeerBackToSourceStartedCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_back_to_source_started_total",
Help: "Counter of the number of the download peer back-to-source started.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerBackToSourceStartedFailureCount counts failures of the
// back-to-source download started step
// ("download_peer_back_to_source_started_failure_total"), with the same
// labels as DownloadPeerBackToSourceStartedCount.
DownloadPeerBackToSourceStartedFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_back_to_source_started_failure_total",
Help: "Counter of the number of failed of the download peer back-to-source started.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerCount counts peer downloads under the metric name
// "download_peer_finished_total", labeled by priority, task_type, task_tag,
// task_app and host_type.
DownloadPeerCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_finished_total",
Help: "Counter of the number of the download peer.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerFailureCount counts failed peer downloads
// ("download_peer_finished_failure_total"), with the same labels as
// DownloadPeerCount.
DownloadPeerFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_finished_failure_total",
Help: "Counter of the number of failed of the download peer.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPeerBackToSourceFailureCount counts failed back-to-source peer
// downloads ("download_peer_back_to_source_finished_failure_total"), labeled
// by priority, task_type, task_tag, task_app and host_type.
DownloadPeerBackToSourceFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_back_to_source_finished_failure_total",
Help: "Counter of the number of failed of the download peer back-to-source.",
}, []string{"priority", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPieceCount counts piece downloads
// ("download_piece_finished_total"), labeled by traffic_type, task_type,
// task_tag, task_app and host_type.
DownloadPieceCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_piece_finished_total",
Help: "Counter of the number of the download piece.",
}, []string{"traffic_type", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPieceFailureCount counts failed piece downloads
// ("download_piece_finished_failure_total"), with the same labels as
// DownloadPieceCount.
DownloadPieceFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_piece_finished_failure_total",
Help: "Counter of the number of failed of the download piece.",
}, []string{"traffic_type", "task_type", "task_tag", "task_app", "host_type"})
// DownloadPieceBackToSourceFailureCount counts failed back-to-source piece
// downloads ("download_piece_back_to_source_finished_failure_total"), with
// the same labels as DownloadPieceCount.
DownloadPieceBackToSourceFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_piece_back_to_source_finished_failure_total",
Help: "Counter of the number of failed of the download piece back-to-source.",
}, []string{"traffic_type", "task_type", "task_tag", "task_app", "host_type"})
// StatTaskCount counts stat task operations ("stat_task_total").
StatTaskCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "stat_task_total",
Help: "Counter of the number of the stat task.",
})
// StatTaskFailureCount counts failed stat task operations
// ("stat_task_failure_total").
StatTaskFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "stat_task_failure_total",
Help: "Counter of the number of failed of the stat task.",
})
// AnnounceHostCount counts host announcements ("announce_host_total"),
// labeled by the host's os, platform, platform_family, platform_version and
// kernel_version plus git_version, git_commit, go_version and build_platform.
AnnounceHostCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "announce_host_total",
Help: "Counter of the number of the announce host.",
}, []string{"os", "platform", "platform_family", "platform_version",
"kernel_version", "git_version", "git_commit", "go_version", "build_platform"})
// AnnounceHostFailureCount counts failed host announcements
// ("announce_host_failure_total"), with the same labels as AnnounceHostCount.
AnnounceHostFailureCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "announce_host_failure_total",
Help: "Counter of the number of failed of the announce host.",
}, []string{"os", "platform", "platform_family", "platform_version",
"kernel_version", "git_version", "git_commit", "go_version", "build_platform"})
// LeaveHostCount counts leaving hosts ("leave_host_total").
LeaveHostCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "leave_host_total",
Help: "Counter of the number of the leaving host.",
})
// LeaveHostFailureCount counts failures while a host is leaving
// ("leave_host_failure_total").
LeaveHostFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "leave_host_failure_total",
Help: "Counter of the number of failed of the leaving host.",
})
// SyncProbesCount counts probe synchronizations ("sync_probes_total").
SyncProbesCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "sync_probes_total",
Help: "Counter of the number of the synchronizing probes.",
})
// SyncProbesFailureCount counts failed probe synchronizations
// ("sync_probes_failure_total").
SyncProbesFailureCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "sync_probes_failure_total",
Help: "Counter of the number of failed of the synchronizing probes.",
})
// Traffic is the "traffic" counter, labeled by type, task_type, task_tag,
// task_app and host_type.
// NOTE(review): the unit added per observation (presumably bytes) is not
// visible here; confirm at the call sites.
Traffic = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "traffic",
Help: "Counter of the number of traffic.",
}, []string{"type", "task_type", "task_tag", "task_app", "host_type"})
// HostTraffic is the per-host "host_traffic" counter. It carries the Traffic
// labels plus host_id, host_ip and host_name. The "type" label presumably
// takes HostTrafficUploadType or HostTrafficDownloadType; confirm at the call
// sites.
// NOTE(review): the per-host labels create one series per host and label
// combination, so the series count grows with the number of hosts.
HostTraffic = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "host_traffic",
Help: "Counter of the number of per host traffic.",
}, []string{"type", "task_type", "task_tag", "task_app", "host_type", "host_id", "host_ip", "host_name"})
// DownloadPeerDuration is a summary of peer download time
// ("download_peer_duration_milliseconds"), labeled by task_size_level. It
// tracks the 0.5, 0.9, 0.95 and 0.99 quantiles with the error tolerances
// given in Objectives.
// NOTE(review): the metric name implies observations in milliseconds and the
// label presumably holds TaskSizeLevel.String(); confirm both at the call
// sites.
DownloadPeerDuration = promauto.NewSummaryVec(prometheus.SummaryOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "download_peer_duration_milliseconds",
Help: "Summary of the time each peer downloading.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
}, []string{"task_size_level"})
// ConcurrentScheduleGauge is the gauge of concurrent scheduling
// ("concurrent_schedule_total").
ConcurrentScheduleGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "concurrent_schedule_total",
Help: "Gauge of the number of concurrent of the scheduling.",
})
// VersionGauge exposes the version information of the service as labels
// ("version"). New sets the series for the running binary to 1.
VersionGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: types.MetricsNamespace,
Subsystem: types.SchedulerMetricsName,
Name: "version",
Help: "Version info of the service.",
}, []string{"major", "minor", "git_version", "git_commit", "platform", "build_time", "go_version", "go_tags", "go_gcflags"})
)
// New wires up the scheduler metrics endpoint and returns the HTTP server that
// serves it.
//
// It passes svr to grpc_prometheus.Register, mounts promhttp.Handler() at
// "/metrics" on a fresh mux, and sets the VersionGauge series for the running
// binary to 1. The returned server listens on cfg.Addr once started; New does
// not start it, so the caller is responsible for serving and shutting it down.
//
// cfg must be non-nil because cfg.Addr is read unconditionally.
func New(cfg *config.MetricsConfig, svr *grpc.Server) *http.Server {
	// Bound the time a client may take to send its request headers. Without a
	// limit a client that never finishes its headers keeps the connection open
	// forever (Slowloris). Callers can still override the field on the returned
	// server before serving.
	const readHeaderTimeout = 10 * time.Second

	grpc_prometheus.Register(svr)

	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())

	VersionGauge.WithLabelValues(version.Major, version.Minor, version.GitVersion, version.GitCommit, version.Platform, version.BuildTime, version.GoVersion, version.Gotags, version.Gogcflags).Set(1)

	return &http.Server{
		Addr:              cfg.Addr,
		Handler:           mux,
		ReadHeaderTimeout: readHeaderTimeout,
	}
}
// TaskSizeLevel is the bucket a task size falls into. CalculateSizeLevel maps
// a size to its level; the constants below name every level.
type TaskSizeLevel int

// taskSizeLevelLabels holds the decimal label of every defined level, indexed
// by the level itself.
var taskSizeLevelLabels = [...]string{
	TaskSizeLevel0:  "0",
	TaskSizeLevel1:  "1",
	TaskSizeLevel2:  "2",
	TaskSizeLevel3:  "3",
	TaskSizeLevel4:  "4",
	TaskSizeLevel5:  "5",
	TaskSizeLevel6:  "6",
	TaskSizeLevel7:  "7",
	TaskSizeLevel8:  "8",
	TaskSizeLevel9:  "9",
	TaskSizeLevel10: "10",
	TaskSizeLevel11: "11",
	TaskSizeLevel12: "12",
	TaskSizeLevel13: "13",
	TaskSizeLevel14: "14",
	TaskSizeLevel15: "15",
	TaskSizeLevel16: "16",
	TaskSizeLevel17: "17",
	TaskSizeLevel18: "18",
	TaskSizeLevel19: "19",
	TaskSizeLevel20: "20",
}

// String returns the decimal number of the level, "0" through "20". Any value
// outside the defined levels is reported as "0".
func (t TaskSizeLevel) String() string {
	if t < TaskSizeLevel0 || t > TaskSizeLevel20 {
		return "0"
	}

	return taskSizeLevelLabels[t]
}

// Size levels as assigned by CalculateSizeLevel. Ranges are half-open: the
// lower bound is included and the upper bound is excluded. M, G and T denote
// multiples of 1024*1024, 1024*1024*1024 and 1024*1024*1024*1024.
const (
	// TaskSizeLevel0 represents an unknown size (size <= 0).
	TaskSizeLevel0 TaskSizeLevel = iota

	// TaskSizeLevel1 represents a size above 0 and below 1M.
	TaskSizeLevel1

	// TaskSizeLevel2 represents a size from 1M up to 4M.
	TaskSizeLevel2

	// TaskSizeLevel3 represents a size from 4M up to 8M.
	TaskSizeLevel3

	// TaskSizeLevel4 represents a size from 8M up to 16M.
	TaskSizeLevel4

	// TaskSizeLevel5 represents a size from 16M up to 32M.
	TaskSizeLevel5

	// TaskSizeLevel6 represents a size from 32M up to 64M.
	TaskSizeLevel6

	// TaskSizeLevel7 represents a size from 64M up to 128M.
	TaskSizeLevel7

	// TaskSizeLevel8 represents a size from 128M up to 256M.
	TaskSizeLevel8

	// TaskSizeLevel9 represents a size from 256M up to 512M.
	TaskSizeLevel9

	// TaskSizeLevel10 represents a size from 512M up to 1G.
	TaskSizeLevel10

	// TaskSizeLevel11 represents a size from 1G up to 4G.
	TaskSizeLevel11

	// TaskSizeLevel12 represents a size from 4G up to 8G.
	TaskSizeLevel12

	// TaskSizeLevel13 represents a size from 8G up to 16G.
	TaskSizeLevel13

	// TaskSizeLevel14 represents a size from 16G up to 32G.
	TaskSizeLevel14

	// TaskSizeLevel15 represents a size from 32G up to 64G.
	TaskSizeLevel15

	// TaskSizeLevel16 represents a size from 64G up to 128G.
	TaskSizeLevel16

	// TaskSizeLevel17 represents a size from 128G up to 256G.
	TaskSizeLevel17

	// TaskSizeLevel18 represents a size from 256G up to 512G.
	TaskSizeLevel18

	// TaskSizeLevel19 represents a size from 512G up to 1T.
	TaskSizeLevel19

	// TaskSizeLevel20 represents a size of 1T or more.
	TaskSizeLevel20
)
// CalculateSizeLevel returns the TaskSizeLevel bucket for size (presumably a
// byte count; confirm against callers).
//
// A size of zero or less yields TaskSizeLevel0. Positive sizes are matched
// against ascending upper bounds, each exclusive: below 1M is level 1, below
// 4M is level 2, then doubling up to 1G (level 10), below 4G is level 11, then
// doubling again up to 1T (level 19). Anything of 1T or more is
// TaskSizeLevel20.
func CalculateSizeLevel(size int64) TaskSizeLevel {
	const (
		mib int64 = 1 << 20
		gib int64 = 1 << 30
		tib int64 = 1 << 40
	)

	// Cases are evaluated top to bottom, so each one only has to check its own
	// upper bound.
	switch {
	case size <= 0:
		return TaskSizeLevel0
	case size < mib:
		return TaskSizeLevel1
	case size < 4*mib:
		return TaskSizeLevel2
	case size < 8*mib:
		return TaskSizeLevel3
	case size < 16*mib:
		return TaskSizeLevel4
	case size < 32*mib:
		return TaskSizeLevel5
	case size < 64*mib:
		return TaskSizeLevel6
	case size < 128*mib:
		return TaskSizeLevel7
	case size < 256*mib:
		return TaskSizeLevel8
	case size < 512*mib:
		return TaskSizeLevel9
	case size < gib:
		return TaskSizeLevel10
	case size < 4*gib:
		return TaskSizeLevel11
	case size < 8*gib:
		return TaskSizeLevel12
	case size < 16*gib:
		return TaskSizeLevel13
	case size < 32*gib:
		return TaskSizeLevel14
	case size < 64*gib:
		return TaskSizeLevel15
	case size < 128*gib:
		return TaskSizeLevel16
	case size < 256*gib:
		return TaskSizeLevel17
	case size < 512*gib:
		return TaskSizeLevel18
	case size < tib:
		return TaskSizeLevel19
	default:
		return TaskSizeLevel20
	}
}