From 8210eaf94da62ba11e08a054ad698cb10af05879 Mon Sep 17 00:00:00 2001 From: Liangliang Gu Date: Tue, 29 Jun 2021 15:44:06 +0800 Subject: [PATCH] Support metrics with multiple TiSessions with the same port (#220) (#222) Signed-off-by: marsishandsome --- README.md | 46 +- metrics/grafana/client_java_summary.json | 439 ++++++++++++++++++ .../java/org/tikv/common/MetricsServer.java | 93 ++++ src/main/java/org/tikv/common/TiSession.java | 40 +- src/test/java/org/tikv/raw/MetricsTest.java | 83 ++++ 5 files changed, 664 insertions(+), 37 deletions(-) create mode 100644 metrics/grafana/client_java_summary.json create mode 100644 src/main/java/org/tikv/common/MetricsServer.java create mode 100644 src/test/java/org/tikv/raw/MetricsTest.java diff --git a/README.md b/README.md index 7414a8a0ce..7617101fc9 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,16 @@ The following includes JVM related parameters. - timeout of scan/delete range grpc request - default: 20s +### Metrics Parameter + +#### tikv.metrics.enable +- whether to enable metrics exporting +- default: false + +#### tikv.metrics.port +- the metrics exporting http port +- default: 3140 + ### ThreadPool Parameter The following includes ThreadPool related parameters, which can be passed in through JVM parameters. @@ -123,5 +133,39 @@ The following includes ThreadPool related parameters, which can be passed in thr - default: 20 +## Metrics + +Client Java supports exporting metrics to Prometheus using poll mode and viewing on Grafana. The following steps shows how to enable this function. + +### Step 1: Enable metrics exporting + +- set the config `tikv.metrics.enable` to `true` +- call TiConfiguration.setMetricsEnable(true) + +### Step 2: Set the metrics port + +- set the config `tikv.metrics.port` +- call TiConfiguration.setMetricsPort + +Default port is 3140. + +### Step 3: Config Prometheus + +Add the following config to `conf/prometheus.yml` and restart Prometheus. + +```yaml +- job_name: "tikv-client" + honor_labels: true + static_configs: + - targets: + - '127.0.0.1:3140' + - '127.0.0.2:3140' + - '127.0.0.3:3140' +``` + +### Step 4: Config Grafana + +Import the [Client-Java-Summary dashboard config](/metrics/grafana/client_java_summary.json) to Grafana. + ## License -Apache 2.0 license. See the [LICENSE](./LICENSE) file for details. +Apache 2.0 license. See the [LICENSE](./LICENSE) file for details. \ No newline at end of file diff --git a/metrics/grafana/client_java_summary.json b/metrics/grafana/client_java_summary.json new file mode 100644 index 0000000000..2e3419a8dd --- /dev/null +++ b/metrics/grafana/client_java_summary.json @@ -0,0 +1,439 @@ +{ + "__inputs": [ + { + "name": "DS_TEST-CLUSTER", + "label": "test-cluster", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.1.6" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(client_java_raw_requests_latency_count[1m])) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Client QPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(client_java_raw_requests_latency_sum[1m])) by (type) / sum(rate(client_java_raw_requests_latency_count[1m])) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Client Avg Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(client_java_raw_requests_failure_total[1m])) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Client Failures", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(client_java_grpc_raw_requests_latency_sum[1m])) by (type) / sum(rate(client_java_grpc_raw_requests_latency_count[1m])) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Client gRPC Avg Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 18, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Client-Java-Summary", + "uid": "000000911", + "version": 1 +} \ No newline at end of file diff --git a/src/main/java/org/tikv/common/MetricsServer.java b/src/main/java/org/tikv/common/MetricsServer.java new file mode 100644 index 0000000000..a06aeec908 --- /dev/null +++ b/src/main/java/org/tikv/common/MetricsServer.java @@ -0,0 +1,93 @@ +/* + * Copyright 2021 PingCAP, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.tikv.common; + +import io.prometheus.client.CollectorRegistry; +import io.prometheus.client.exporter.HTTPServer; +import java.net.InetSocketAddress; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.tikv.common.policy.RetryPolicy; +import org.tikv.common.region.RegionManager; +import org.tikv.common.region.RegionStoreClient; +import org.tikv.raw.RawKVClient; + +public class MetricsServer { + private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class); + + private static MetricsServer METRICS_SERVER_INSTANCE = null; + private static int metricsServerRefCount = 0; + + private int port; + private HTTPServer server; + private CollectorRegistry collectorRegistry; + + public static MetricsServer getInstance(TiConfiguration conf) { + if (!conf.isMetricsEnable()) { + return null; + } + + synchronized (MetricsServer.class) { + int port = conf.getMetricsPort(); + if (METRICS_SERVER_INSTANCE != null) { + if (port != METRICS_SERVER_INSTANCE.port) { + throw new IllegalArgumentException( + String.format( + "Do dot support multiple tikv.metrics.port, which are %d and %d", + port, METRICS_SERVER_INSTANCE.port)); + } + } else { + METRICS_SERVER_INSTANCE = new MetricsServer(port); + } + metricsServerRefCount += 1; + return METRICS_SERVER_INSTANCE; + } + } + + private MetricsServer(int port) { + try { + this.collectorRegistry = new CollectorRegistry(); + this.collectorRegistry.register(RawKVClient.RAW_REQUEST_LATENCY); + this.collectorRegistry.register(RawKVClient.RAW_REQUEST_FAILURE); + this.collectorRegistry.register(RawKVClient.RAW_REQUEST_SUCCESS); + this.collectorRegistry.register(RegionStoreClient.GRPC_RAW_REQUEST_LATENCY); + this.collectorRegistry.register(RetryPolicy.GRPC_SINGLE_REQUEST_LATENCY); + this.collectorRegistry.register(RegionManager.GET_REGION_BY_KEY_REQUEST_LATENCY); + this.collectorRegistry.register(PDClient.PD_GET_REGION_BY_KEY_REQUEST_LATENCY); + this.port = port; + this.server = new HTTPServer(new InetSocketAddress(port), this.collectorRegistry, true); + logger.info("http server is up " + this.server.getPort()); + } catch (Exception e) { + logger.error("http server not up"); + throw new RuntimeException(e); + } + } + + public void close() { + synchronized (MetricsServer.class) { + if (metricsServerRefCount == 1) { + if (server != null) { + server.stop(); + logger.info("Metrics server on " + server.getPort() + " is stopped"); + } + METRICS_SERVER_INSTANCE = null; + } + + if (metricsServerRefCount >= 1) { + metricsServerRefCount -= 1; + } + } + } +} diff --git a/src/main/java/org/tikv/common/TiSession.java b/src/main/java/org/tikv/common/TiSession.java index 995ede96e2..dc50b6317a 100644 --- a/src/main/java/org/tikv/common/TiSession.java +++ b/src/main/java/org/tikv/common/TiSession.java @@ -20,9 +20,6 @@ import static org.tikv.common.util.ClientUtils.groupKeysByRegion; import com.google.common.annotations.VisibleForTesting; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.protobuf.ByteString; -import io.prometheus.client.CollectorRegistry; -import io.prometheus.client.exporter.HTTPServer; -import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -38,7 +35,6 @@ import org.tikv.common.event.CacheInvalidateEvent; import org.tikv.common.exception.TiKVException; import org.tikv.common.key.Key; import org.tikv.common.meta.TiTimestamp; -import org.tikv.common.policy.RetryPolicy; import org.tikv.common.region.RegionManager; import org.tikv.common.region.RegionStoreClient; import org.tikv.common.region.RegionStoreClient.RegionStoreClientBuilder; @@ -74,37 +70,14 @@ public class TiSession implements AutoCloseable { private volatile boolean enableGrpcForward; private volatile RegionStoreClient.RegionStoreClientBuilder clientBuilder; private boolean isClosed = false; - private HTTPServer server; - private CollectorRegistry collectorRegistry; + private MetricsServer metricsServer; public TiSession(TiConfiguration conf) { this.conf = conf; this.channelFactory = new ChannelFactory(conf.getMaxFrameSize()); this.client = PDClient.createRaw(conf, channelFactory); this.enableGrpcForward = conf.getEnableGrpcForward(); - if (conf.isMetricsEnable()) { - try { - this.collectorRegistry = new CollectorRegistry(); - this.collectorRegistry.register(RawKVClient.RAW_REQUEST_LATENCY); - this.collectorRegistry.register(RawKVClient.RAW_REQUEST_FAILURE); - this.collectorRegistry.register(RawKVClient.RAW_REQUEST_SUCCESS); - this.collectorRegistry.register(RegionStoreClient.GRPC_RAW_REQUEST_LATENCY); - this.collectorRegistry.register(RetryPolicy.GRPC_SINGLE_REQUEST_LATENCY); - this.collectorRegistry.register(RegionManager.GET_REGION_BY_KEY_REQUEST_LATENCY); - this.collectorRegistry.register(PDClient.PD_GET_REGION_BY_KEY_REQUEST_LATENCY); - this.enableGrpcForward = conf.getEnableGrpcForward(); - this.server = - new HTTPServer( - new InetSocketAddress(conf.getMetricsPort()), this.collectorRegistry, true); - logger.info("http server is up " + this.server.getPort()); - } catch (Exception e) { - logger.error("http server not up"); - throw new RuntimeException(e); - } - } - if (this.enableGrpcForward) { - logger.info("enable grpc forward for high available"); - } + this.metricsServer = MetricsServer.getInstance(conf); logger.info("TiSession initialized in " + conf.getKvMode() + " mode"); } @@ -353,10 +326,6 @@ public class TiSession implements AutoCloseable { return channelFactory; } - public CollectorRegistry getCollectorRegistry() { - return collectorRegistry; - } - /** * This is used for setting call back function to invalidate cache information * @@ -468,9 +437,8 @@ public class TiSession implements AutoCloseable { return; } - if (server != null) { - server.stop(); - logger.info("Metrics server on " + server.getPort() + " is stopped"); + if (metricsServer != null) { + metricsServer.close(); } isClosed = true; diff --git a/src/test/java/org/tikv/raw/MetricsTest.java b/src/test/java/org/tikv/raw/MetricsTest.java new file mode 100644 index 0000000000..aafcecbbf7 --- /dev/null +++ b/src/test/java/org/tikv/raw/MetricsTest.java @@ -0,0 +1,83 @@ +package org.tikv.raw; + +import static org.junit.Assert.assertEquals; + +import com.google.protobuf.ByteString; +import java.util.ArrayList; +import java.util.List; +import org.junit.After; +import org.junit.Test; +import org.tikv.common.TiConfiguration; +import org.tikv.common.TiSession; + +public class MetricsTest { + private List sessionList = new ArrayList<>(); + + @After + public void tearDown() throws Exception { + for (TiSession tiSession : sessionList) { + if (tiSession != null) { + tiSession.close(); + } + } + } + + @Test + public void oneTiSession() throws Exception { + TiConfiguration conf = TiConfiguration.createRawDefault(); + conf.setMetricsEnable(true); + TiSession session = TiSession.create(conf); + sessionList.add(session); + RawKVClient client = session.createRawClient(); + client.put(ByteString.copyFromUtf8("k"), ByteString.copyFromUtf8("v")); + ByteString result = client.get(ByteString.copyFromUtf8("k")); + assertEquals(result.toStringUtf8(), "v"); + client.close(); + session.close(); + } + + @Test + public void twoTiSession() throws Exception { + TiConfiguration conf = TiConfiguration.createRawDefault(); + conf.setMetricsEnable(true); + + TiSession session1 = TiSession.create(conf); + sessionList.add(session1); + RawKVClient client1 = session1.createRawClient(); + client1.put(ByteString.copyFromUtf8("k1"), ByteString.copyFromUtf8("v1")); + + TiSession session2 = TiSession.create(conf); + sessionList.add(session2); + RawKVClient client2 = session2.createRawClient(); + client2.put(ByteString.copyFromUtf8("k2"), ByteString.copyFromUtf8("v2")); + + client1.close(); + session1.close(); + + ByteString result = client2.get(ByteString.copyFromUtf8("k2")); + assertEquals(result.toStringUtf8(), "v2"); + + client2.close(); + session2.close(); + } + + @Test + public void twoTiSessionWithDifferentPort() { + TiConfiguration conf1 = TiConfiguration.createRawDefault(); + conf1.setMetricsEnable(true); + conf1.setMetricsPort(12345); + TiSession session1 = TiSession.create(conf1); + sessionList.add(session1); + + TiConfiguration conf2 = TiConfiguration.createRawDefault(); + conf2.setMetricsEnable(true); + conf2.setMetricsPort(54321); + try { + TiSession.create(conf2); + assertEquals(1, 2); + } catch (IllegalArgumentException e) { + assertEquals( + "Do dot support multiple tikv.metrics.port, which are 54321 and 12345", e.getMessage()); + } + } +}