diff --git a/jmx-metrics/README.md b/jmx-metrics/README.md index d3236c59..4325fdfe 100644 --- a/jmx-metrics/README.md +++ b/jmx-metrics/README.md @@ -71,6 +71,7 @@ mutually exclusive with `otel.jmx.groovy.script`. The currently supported target | [`jvm`](./docs/target-systems/jvm.md) | | [`activemq`](./docs/target-systems/activemq.md) | | [`cassandra`](./docs/target-systems/cassandra.md) | +| [`hbase`](./docs/target-systems/hbase.md) | | [`hadoop`](./docs/target-systems/hadoop.md) | | [`kafka`](./docs/target-systems/kafka.md) | | [`kafka-consumer`](./docs/target-systems/kafka-consumer.md) | @@ -78,6 +79,7 @@ mutually exclusive with `otel.jmx.groovy.script`. The currently supported target | [`solr`](./docs/target-systems/solr.md) | | [`tomcat`](./docs/target-systems/tomcat.md) | + ### JMX Query Helpers - `otel.queryJmx(String objectNameStr)` diff --git a/jmx-metrics/docs/target-systems/hbase.md b/jmx-metrics/docs/target-systems/hbase.md new file mode 100644 index 00000000..dbf4be90 --- /dev/null +++ b/jmx-metrics/docs/target-systems/hbase.md @@ -0,0 +1,359 @@ +# HBase Metrics + +The JMX Metric Gatherer provides built-in HBase metric gathering capabilities. +These metrics are sourced from: https://hbase.apache.org/book.html#hbase_metrics + +### Metrics + +* Name: `hbase.master.region_server.count` +* Description: The number of region servers. +* Unit: `{servers}` +* Labels: `state` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.master.regions_in_transition.count` +* Description: The number of regions that are in transition. +* Unit: `{regions}` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.master.regions_in_transition.over_threshold` +* Description: The number of regions that have been in transition longer than a threshold time. +* Unit: `{regions}` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.master.regions_in_transition.oldest_age` +* Description: The age of the longest region in transition. 
+* Unit: `ms` +* Instrument Type: longValue + + +* Name: `hbase.region_server.region.count` +* Description: The number of regions hosted by the region server. +* Unit: `{regions}` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.disk.store_file.count` +* Description: The number of store files on disk currently managed by the region server. +* Unit: `{files}` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.disk.store_file.size` +* Description: Aggregate size of the store files on disk. +* Unit: `By` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.write_ahead_log.count` +* Description: The number of write ahead logs not yet archived. +* Unit: `{logs}` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.request.count` +* Description: The number of requests received. +* Unit: `{requests}` +* Labels: `region_server`, `state` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.queue.length` +* Description: The number of RPC handlers actively servicing requests. +* Unit: `{handlers}` +* Labels: `region_server`, `state` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.blocked_update.time` +* Description: Amount of time updates have been blocked so the memstore can be flushed. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.block_cache.operation.count` +* Description: Number of block cache hits/misses. +* Unit: `{operations}` +* Labels: `region_server`, `state` +* Instrument Type: longValue + + +* Name: `hbase.region_server.files.local` +* Description: Percent of store file data that can be read from the local. 
+* Unit: `%` +* Labels: `region_server` +* Instrument Type: doubleValue + + +* Name: `hbase.region_server.operation.append.latency.p99` +* Description: Append operation 99th Percentile latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.append.latency.max` +* Description: Append operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.append.latency.min` +* Description: Append operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.append.latency.mean` +* Description: Append operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.append.latency.median` +* Description: Append operation median latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.delete.latency.p99` +* Description: Delete operation 99th Percentile latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.delete.latency.max` +* Description: Delete operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.delete.latency.min` +* Description: Delete operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.delete.latency.mean` +* Description: Delete operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.delete.latency.median` +* Description: Delete operation median latency. 
+* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.put.latency.p99` +* Description: Put operation 99th Percentile latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.put.latency.max` +* Description: Put operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.put.latency.min` +* Description: Put operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.put.latency.mean` +* Description: Put operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.put.latency.median` +* Description: Put operation median latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.get.latency.p99` +* Description: Get operation 99th Percentile latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.get.latency.max` +* Description: Get operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.get.latency.min` +* Description: Get operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.get.latency.mean` +* Description: Get operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.get.latency.median` +* Description: Get operation median latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.replay.latency.p99` +* Description: Replay operation 99th Percentile latency. 
+* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.replay.latency.max` +* Description: Replay operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.replay.latency.min` +* Description: Replay operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.replay.latency.mean` +* Description: Replay operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.replay.latency.median` +* Description: Replay operation median latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.increment.latency.p99` +* Description: Increment operation 99th Percentile latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.increment.latency.max` +* Description: Increment operation max latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.increment.latency.min` +* Description: Increment operation minimum latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.increment.latency.mean` +* Description: Increment operation mean latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operation.increment.latency.median` +* Description: Increment operation median latency. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longValue + + +* Name: `hbase.region_server.operations.slow` +* Description: Number of operations that took over 1000ms to complete. 
+* Unit: `{operations}` +* Labels: `region_server`, `operation` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.open_connection.count` +* Description: The number of open connections at the RPC layer. +* Unit: `{connections}` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.active_handler.count` +* Description: The number of RPC handlers actively servicing requests. +* Unit: `{handlers}` +* Labels: `region_server` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.queue.request.count` +* Description: The number of currently enqueued requests. +* Unit: `{requests}` +* Labels: `region_server`, `state` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.authentication.count` +* Description: Number of client connection authentication failures/successes. +* Unit: `{authentication requests}` +* Labels: `region_server`, `state` +* Instrument Type: longUpDownCounter + + +* Name: `hbase.region_server.gc.time` +* Description: Time spent in garbage collection. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longCounter + + +* Name: `hbase.region_server.gc.young_gen.time` +* Description: Time spent in garbage collection of the young generation. +* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longCounter + + +* Name: `hbase.region_server.gc.old_gen.time` +* Description: Time spent in garbage collection of the old generation. 
+* Unit: `ms` +* Labels: `region_server` +* Instrument Type: longCounter diff --git a/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java b/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java new file mode 100644 index 00000000..41280538 --- /dev/null +++ b/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java @@ -0,0 +1,409 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +package io.opentelemetry.contrib.jmxmetrics.target_systems; + +import static org.assertj.core.api.Assertions.entry; + +import io.opentelemetry.contrib.jmxmetrics.AbstractIntegrationTest; +import java.time.Duration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.utility.MountableFile; + +class HbaseIntegrationTest extends AbstractIntegrationTest { + + HbaseIntegrationTest() { + super(/* configFromStdin= */ false, "target-systems/hbase.properties"); + } + + @Container + GenericContainer hbase = + new GenericContainer<>("dajobe/hbase") + .withNetwork(Network.SHARED) + .withEnv("LOCAL_JMX", "no") + .withCopyFileToContainer( + MountableFile.forClasspathResource("hbase/hbase-env.sh", 0400), + "/opt/hbase/conf/hbase-env.sh") + .withNetworkAliases("hbase") + .withExposedPorts(9900) + .withStartupTimeout(Duration.ofMinutes(2)) + .waitingFor(Wait.forListeningPort()); + + @Test + void endToEnd() { + waitAndAssertMetrics( + metric -> + assertSumWithAttributes( + metric, + "hbase.master.region_server.count", + "The number of region servers.", + "{servers}", + attrs -> attrs.contains(entry("state", "dead")), + attrs -> attrs.contains(entry("state", "live"))), 
+ metric -> + assertSum( + metric, + "hbase.master.regions_in_transition.count", + "The number of regions that are in transition.", + "{regions}", + /* isMonotonic= */ false), + metric -> + assertSum( + metric, + "hbase.master.regions_in_transition.over_threshold", + "The number of regions that have been in transition longer than a threshold time.", + "{regions}", + /* isMonotonic= */ false), + metric -> + assertGauge( + metric, + "hbase.master.regions_in_transition.oldest_age", + "The age of the longest region in transition.", + "ms"), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.region.count", + "The number of regions hosted by the region server.", + "{regions}", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.disk.store_file.count", + "The number of store files on disk currently managed by the region server.", + "{files}", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.disk.store_file.size", + "Aggregate size of the store files on disk.", + "By", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.write_ahead_log.count", + "The number of write ahead logs not yet archived.", + "{logs}", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.request.count", + "The number of requests received.", + "{requests}", + attrs -> attrs.contains(entry("state", "write")), + attrs -> attrs.contains(entry("state", "read"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.queue.length", + "The number of RPC handlers actively servicing requests.", + "{handlers}", + attrs -> attrs.contains(entry("state", "flush")), + attrs -> attrs.contains(entry("state", "compaction"))), + metric -> + assertGaugeWithAttributes( + metric, + 
"hbase.region_server.blocked_update.time", + "Amount of time updates have been blocked so the memstore can be flushed.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.request.count", + "The number of requests received.", + "{requests}", + attrs -> attrs.contains(entry("state", "write")), + attrs -> attrs.contains(entry("state", "read"))), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.files.local", + "Percent of store file data that can be read from the local.", + "%", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.p99", + "Append operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.max", + "Append operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.min", + "Append operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.mean", + "Append operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.median", + "Append operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.p99", + "Delete operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.max", + "Delete operation max latency.", + "ms", + attrs 
-> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.min", + "Delete operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.mean", + "Delete operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.median", + "Delete operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.p99", + "Put operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.max", + "Put operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.min", + "Put operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.mean", + "Put operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.median", + "Put operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.p99", + "Get operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.max", + "Get operation max latency.", + "ms", + attrs -> 
attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.min", + "Get operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.mean", + "Get operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.median", + "Get operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.p99", + "Replay operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.max", + "Replay operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.min", + "Replay operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.mean", + "Replay operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.median", + "Replay operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.p99", + "Increment operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.max", + "Increment operation max latency.", + "ms", + attrs 
-> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.min", + "Increment operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.mean", + "Increment operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.median", + "Increment operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.operations.slow", + "Number of operations that took over 1000ms to complete.", + "{operations}", + attrs -> attrs.contains(entry("operation", "delete")), + attrs -> attrs.contains(entry("operation", "append")), + attrs -> attrs.contains(entry("operation", "get")), + attrs -> attrs.contains(entry("operation", "put")), + attrs -> attrs.contains(entry("operation", "increment"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.open_connection.count", + "The number of open connections at the RPC layer.", + "{connections}", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.active_handler.count", + "The number of RPC handlers actively servicing requests.", + "{handlers}", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.queue.request.count", + "The number of currently enqueued requests.", + "{requests}", + attrs -> attrs.contains(entry("state", "replication")), + attrs -> attrs.contains(entry("state", "user")), + attrs -> attrs.contains(entry("state", "priority"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.authentication.count", + "Number of client connection 
authentication failures/successes.", + "{authentication requests}", + attrs -> attrs.contains(entry("state", "successes")), + attrs -> attrs.contains(entry("state", "failures"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.time", + "Time spent in garbage collection.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.young_gen.time", + "Time spent in garbage collection of the young generation.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.old_gen.time", + "Time spent in garbage collection of the old generation.", + "ms", + attrs -> attrs.containsKey("region_server"))); + } +} diff --git a/jmx-metrics/src/integrationTest/resources/hbase/hbase-env.sh b/jmx-metrics/src/integrationTest/resources/hbase/hbase-env.sh new file mode 100644 index 00000000..72b4974d --- /dev/null +++ b/jmx-metrics/src/integrationTest/resources/hbase/hbase-env.sh @@ -0,0 +1,30 @@ +# +#/** +# * Licensed to the Apache Software Foundation (ASF) under one +# * or more contributor license agreements. See the NOTICE file +# * distributed with this work for additional information +# * regarding copyright ownership. The ASF licenses this file +# * to you under the Apache License, Version 2.0 (the +# * "License"); you may not use this file except in compliance +# * with the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ + +# Set environment variables here. 
+ +# This script sets variables multiple times over the course of starting an hbase process, +# so try to keep things idempotent unless you want to take an even deeper look +# into the startup scripts (bin/hbase, etc.) + +# The java implementation to use. Java 1.8+ required. +export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +export HBASE_OPTS="$HBASE_OPTS -XX:+UseConcMarkSweepGC" +export HBASE_JMX_BASE="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false" +export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS $HBASE_JMX_BASE -Dcom.sun.management.jmxremote.rmi.port=9900 -Dcom.sun.management.jmxremote.port=9900" \ No newline at end of file diff --git a/jmx-metrics/src/integrationTest/resources/target-systems/hbase.properties b/jmx-metrics/src/integrationTest/resources/target-systems/hbase.properties new file mode 100644 index 00000000..19965aa8 --- /dev/null +++ b/jmx-metrics/src/integrationTest/resources/target-systems/hbase.properties @@ -0,0 +1,7 @@ +otel.jmx.interval.milliseconds = 3000 +otel.metrics.exporter = otlp +otel.jmx.service.url = service:jmx:rmi:///jndi/rmi://hbase:9900/jmxrmi +otel.jmx.target.system = hbase + +# these will be overridden by cmd line +otel.exporter.otlp.endpoint = http://host.testcontainers.internal diff --git a/jmx-metrics/src/main/groovy/io/opentelemetry/contrib/jmxmetrics/JmxConfig.java b/jmx-metrics/src/main/groovy/io/opentelemetry/contrib/jmxmetrics/JmxConfig.java index f2114b4f..bb809a51 100644 --- a/jmx-metrics/src/main/groovy/io/opentelemetry/contrib/jmxmetrics/JmxConfig.java +++ b/jmx-metrics/src/main/groovy/io/opentelemetry/contrib/jmxmetrics/JmxConfig.java @@ -36,6 +36,7 @@ class JmxConfig { Arrays.asList( "activemq", "cassandra", + "hbase", "hadoop", "jvm", "kafka", diff --git a/jmx-metrics/src/main/resources/target-systems/hbase.groovy b/jmx-metrics/src/main/resources/target-systems/hbase.groovy new file mode 100644 index 00000000..7c03f75a --- /dev/null +++ 
b/jmx-metrics/src/main/resources/target-systems/hbase.groovy @@ -0,0 +1,249 @@ +/* + * Copyright The OpenTelemetry Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +def beanMasterServer = otel.mbeans("Hadoop:service=HBase,name=Master,sub=Server") +otel.instrument(beanMasterServer, "hbase.master.region_server.count", + "The number of region servers.", "{servers}", + ["numDeadRegionServers":["state" : {"dead"}], "numRegionServers": ["state" : {"live"}]], + otel.&longUpDownCounterCallback) + +def beanMasterAssignmentManager = otel.mbean("Hadoop:service=HBase,name=Master,sub=AssignmentManager") +otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.count", + "The number of regions that are in transition.", "{regions}", + "ritCount", otel.&longUpDownCounterCallback) +otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.over_threshold", + "The number of regions that have been in transition longer than a threshold time.", "{regions}", + "ritCountOverThreshold", otel.&longUpDownCounterCallback) +otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.oldest_age", + "The age of the longest region in transition.", "ms", + "ritOldestAge", otel.&longValueCallback) + +def beanRegionServerServer = otel.mbean("Hadoop:service=HBase,name=RegionServer,sub=Server") +otel.instrument(beanRegionServerServer, "hbase.region_server.region.count", + "The number of regions hosted by the 
region server.", "{regions}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "regionCount", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.disk.store_file.count", + "The number of store files on disk currently managed by the region server.", "{files}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "storeFileCount", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.disk.store_file.size", + "Aggregate size of the store files on disk.", "By", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "storeFileSize", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.write_ahead_log.count", + "The number of write ahead logs not yet archived.", "{logs}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "hlogFileCount", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.request.count", + "The number of requests received.", "{requests}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + ["writeRequestCount":["state" : {"write"}], "readRequestCount": ["state" : {"read"}]], + otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.queue.length", + "The number of RPC handlers actively servicing requests.", "{handlers}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + ["flushQueueLength":["state" : {"flush"}], "compactionQueueLength": ["state" : {"compaction"}]], + otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.blocked_update.time", + "Amount of time updates have been blocked so the memstore can be flushed.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "updatesBlockedTime", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, 
"hbase.region_server.block_cache.operation.count", + "Number of block cache hits/misses.", "{operations}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + ["blockCacheMissCount":["state" : {"miss"}], "blockCacheHitCount": ["state" : {"hit"}]], + otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.files.local", + "Percent of store file data that can be read from the local.", "%", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "percentFilesLocal", otel.&doubleValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.append.latency.p99", + "Append operation 99th Percentile latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Append_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.append.latency.max", + "Append operation max latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Append_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.append.latency.min", + "Append operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Append_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.append.latency.mean", + "Append operation mean latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Append_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.append.latency.median", + "Append operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Append_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.delete.latency.p99", + "Delete operation 99th Percentile latency.", "ms", + ["region_server" : { 
mbean -> mbean.getProperty("tag.Hostname") }], + "Delete_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.delete.latency.max", + "Delete operation max latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Delete_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.delete.latency.min", + "Delete operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Delete_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.delete.latency.mean", + "Delete operation mean latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Delete_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.delete.latency.median", + "Delete operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Delete_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.put.latency.p99", + "Put operation 99th Percentile latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Put_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.put.latency.max", + "Put operation max latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Put_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.put.latency.min", + "Put operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Put_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.put.latency.mean", + "Put operation mean latency.", "ms", + ["region_server" : { mbean -> 
mbean.getProperty("tag.Hostname") }], + "Put_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.put.latency.median", + "Put operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Put_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.get.latency.p99", + "Get operation 99th Percentile latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Get_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.get.latency.max", + "Get operation max latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Get_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.get.latency.min", + "Get operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Get_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.get.latency.mean", + "Get operation mean latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Get_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.get.latency.median", + "Get operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Get_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.replay.latency.p99", + "Replay operation 99th Percentile latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Replay_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.replay.latency.max", + "Replay operation max latency.", "ms", + ["region_server" : { mbean -> 
mbean.getProperty("tag.Hostname") }], + "Replay_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.replay.latency.min", + "Replay operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Replay_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.replay.latency.mean", + "Replay operation mean latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Replay_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.replay.latency.median", + "Replay operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Replay_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment.latency.p99", + "Increment operation 99th Percentile latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Increment_99th_percentile", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment.latency.max", + "Increment operation max latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Increment_max", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment.latency.min", + "Increment operation minimum latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Increment_min", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment.latency.mean", + "Increment operation mean latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Increment_mean", otel.&longValueCallback) +otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment.latency.median", + "Increment 
operation median latency.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "Increment_median", otel.&longValueCallback) + +otel.instrument(beanRegionServerServer, "hbase.region_server.operations.slow", + "Number of operations that took over 1000ms to complete.", "{operations}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + [ + "slowDeleteCount":["operation" : {"delete"}], + "slowAppendCount": ["operation" : {"append"}], + "slowGetCount": ["operation" : {"get"}], + "slowPutCount": ["operation" : {"put"}], + "slowIncrementCount": ["operation" : {"increment"}] + ], + otel.&longUpDownCounterCallback) + +def beanRegionServerIPC = otel.mbean("Hadoop:service=HBase,name=RegionServer,sub=IPC") +otel.instrument(beanRegionServerIPC, "hbase.region_server.open_connection.count", + "The number of open connections at the RPC layer.", "{connections}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "numOpenConnections", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerIPC, "hbase.region_server.active_handler.count", + "The number of RPC handlers actively servicing requests.", "{handlers}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "numActiveHandler", otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerIPC, "hbase.region_server.queue.request.count", + "The number of currently enqueued requests.", "{requests}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + [ + "numCallsInReplicationQueue":["state" : {"replication"}], + "numCallsInGeneralQueue": ["state" : {"user"}], + "numCallsInPriorityQueue": ["state" : {"priority"}] + ], + otel.&longUpDownCounterCallback) +otel.instrument(beanRegionServerIPC, "hbase.region_server.authentication.count", + "Number of client connection authentication failures/successes.", "{authentication requests}", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + 
["authenticationSuccesses":["state" : {"successes"}], "authenticationFailures": ["state" : {"failures"}]], + otel.&longUpDownCounterCallback) + +def beanJVMMetrics = otel.mbean("Hadoop:service=HBase,name=JvmMetrics") +otel.instrument(beanJVMMetrics, "hbase.region_server.gc.time", + "Time spent in garbage collection.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "GcTimeMillis", otel.&longCounterCallback) +otel.instrument(beanJVMMetrics, "hbase.region_server.gc.young_gen.time", + "Time spent in garbage collection of the young generation.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "GcTimeMillisParNew", otel.&longCounterCallback) +otel.instrument(beanJVMMetrics, "hbase.region_server.gc.old_gen.time", + "Time spent in garbage collection of the old generation.", "ms", + ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], + "GcTimeMillisConcurrentMarkSweep", otel.&longCounterCallback) \ No newline at end of file diff --git a/jmx-metrics/src/test/java/io/opentelemetry/contrib/jmxmetrics/JmxConfigTest.java b/jmx-metrics/src/test/java/io/opentelemetry/contrib/jmxmetrics/JmxConfigTest.java index aa5201d6..c7b9587f 100644 --- a/jmx-metrics/src/test/java/io/opentelemetry/contrib/jmxmetrics/JmxConfigTest.java +++ b/jmx-metrics/src/test/java/io/opentelemetry/contrib/jmxmetrics/JmxConfigTest.java @@ -20,6 +20,7 @@ class JmxConfigTest { .containsOnly( "activemq", "cassandra", + "hbase", "hadoop", "jvm", "kafka", @@ -123,7 +124,7 @@ class JmxConfigTest { assertThatThrownBy(config::validate) .isInstanceOf(ConfigurationException.class) .hasMessage( - "[jvm, unavailabletargetsystem] must specify targets from [activemq, cassandra, hadoop, jvm, " + "[jvm, unavailabletargetsystem] must specify targets from [activemq, cassandra, hbase, hadoop, jvm, " + "kafka, kafka-consumer, kafka-producer, solr, tomcat]"); }