diff --git a/services/src/main/java/io/grpc/services/CallMetricRecorder.java b/services/src/main/java/io/grpc/services/CallMetricRecorder.java
index 995a17a89a..8570a989f2 100644
--- a/services/src/main/java/io/grpc/services/CallMetricRecorder.java
+++ b/services/src/main/java/io/grpc/services/CallMetricRecorder.java
@@ -135,7 +135,7 @@ public final class CallMetricRecorder {
    * @since 1.47.0
    */
   public CallMetricRecorder recordCpuUtilizationMetric(double value) {
-    if (disabled || !MetricRecorderHelper.isUtilizationValid(value)) {
+    if (disabled || !MetricRecorderHelper.isCpuUtilizationValid(value)) {
       return this;
     }
     cpuUtilizationMetric = value;
diff --git a/services/src/main/java/io/grpc/services/MetricRecorder.java b/services/src/main/java/io/grpc/services/MetricRecorder.java
index 5e6376d87f..3027d49546 100644
--- a/services/src/main/java/io/grpc/services/MetricRecorder.java
+++ b/services/src/main/java/io/grpc/services/MetricRecorder.java
@@ -64,11 +64,11 @@ public final class MetricRecorder {
   }
 
   /**
-   * Update the CPU utilization metrics data in the range [0, 1]. Values outside the valid range are
-   * ignored.
+   * Update the CPU utilization metrics data in the range [0, inf). Values outside the valid range
+   * are ignored.
    */
   public void setCpuUtilizationMetric(double value) {
-    if (!MetricRecorderHelper.isUtilizationValid(value)) {
+    if (!MetricRecorderHelper.isCpuUtilizationValid(value)) {
       return;
     }
     cpuUtilization = value;
diff --git a/services/src/main/java/io/grpc/services/MetricRecorderHelper.java b/services/src/main/java/io/grpc/services/MetricRecorderHelper.java
index 5cd917f5e0..94a811f4f1 100644
--- a/services/src/main/java/io/grpc/services/MetricRecorderHelper.java
+++ b/services/src/main/java/io/grpc/services/MetricRecorderHelper.java
@@ -29,6 +29,15 @@ final class MetricRecorderHelper {
     return utilization >= 0.0 && utilization <= 1.0;
   }
 
+  /**
+   * Return true if the cpu utilization value is in the range [0, inf) and false otherwise.
+   * Cpu utilization is deliberately not capped at 1.0: users occasionally exceed 100% cpu and
+   * get a runaway effect where the backend with highest qps gets more and more qps sent to it.
+   */
+  static boolean isCpuUtilizationValid(double utilization) {
+    return utilization >= 0.0;
+  }
+
   /**
    * Return true if the qps value is in the range [0, inf) and false otherwise.
    */
diff --git a/services/src/test/java/io/grpc/services/CallMetricRecorderTest.java b/services/src/test/java/io/grpc/services/CallMetricRecorderTest.java
index 89b07e7208..03f29a05ef 100644
--- a/services/src/test/java/io/grpc/services/CallMetricRecorderTest.java
+++ b/services/src/test/java/io/grpc/services/CallMetricRecorderTest.java
@@ -72,7 +72,7 @@ public class CallMetricRecorderTest {
     recorder.recordUtilizationMetric("util1", 1.001);
 
     MetricReport dump = recorder.finalizeAndDump2();
-    Truth.assertThat(dump.getCpuUtilization()).isEqualTo(0);
+    Truth.assertThat(dump.getCpuUtilization()).isEqualTo(1.001);
     Truth.assertThat(dump.getMemoryUtilization()).isEqualTo(0);
     Truth.assertThat(dump.getQps()).isEqualTo(0);
     Truth.assertThat(dump.getUtilizationMetrics()).isEmpty();
diff --git a/xds/src/test/java/io/grpc/xds/orca/OrcaServiceImplTest.java b/xds/src/test/java/io/grpc/xds/orca/OrcaServiceImplTest.java
index 4afa731bdc..01e9c906bf 100644
--- a/xds/src/test/java/io/grpc/xds/orca/OrcaServiceImplTest.java
+++ b/xds/src/test/java/io/grpc/xds/orca/OrcaServiceImplTest.java
@@ -245,7 +245,7 @@ public class OrcaServiceImplTest {
   public void testApis() throws Exception {
     ImmutableMap firstUtilization = ImmutableMap.of("util", 0.1);
     OrcaLoadReport goldenReport = OrcaLoadReport.newBuilder()
-        .setCpuUtilization(random.nextDouble())
+        .setCpuUtilization(random.nextDouble() * 10)
         .setMemUtilization(random.nextDouble())
         .putAllUtilization(firstUtilization)
         .putUtilization("queue", 1.0)
@@ -276,7 +276,6 @@ public class OrcaServiceImplTest {
     assertThat(reports.next()).isEqualTo(goldenReport);
 
     defaultTestService.setCpuUtilizationMetric(-0.001);
-    defaultTestService.setCpuUtilizationMetric(1.001);
     defaultTestService.setMemoryUtilizationMetric(-0.001);
     defaultTestService.setMemoryUtilizationMetric(1.001);
     defaultTestService.setQpsMetric(-0.001);