rls: Reduce RLS channel logging

The channel log is shared by many components and is poorly suited to
the noise of per-RPC events. This commit restricts RLS usage of the
logger to no more frequent than cache entry events. This may still be
too frequent, but should substantially improve the signal-to-noise and
we can do further rework as needed.

Many of the log entries were poor because they lacked enough context.
They weren't even clear they were from RLS. The cache entry events now
regularly include the request key in the logs, allowing you to follow
events for specific keys. I would have preferred using the hash code,
but NumberFormat is annoying and toString() may be acceptable given its
convenience.

This commit reverts much of eba699ad. Those logs have not proven to be
helpful as they produce more output than can be reasonably stored.
This commit is contained in:
Eric Anderson 2024-11-27 11:37:45 -08:00 committed by GitHub
parent ebb43a69e7
commit 7f9c1f39f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 19 additions and 50 deletions

View File

@ -278,9 +278,8 @@ final class CachingRlsLbClient {
@GuardedBy("lock")
private CachedRouteLookupResponse asyncRlsCall(
RouteLookupRequest request, @Nullable BackoffPolicy backoffPolicy) {
logger.log(ChannelLogLevel.DEBUG, "Making an async call to RLS");
if (throttler.shouldThrottle()) {
logger.log(ChannelLogLevel.DEBUG, "Request is throttled");
logger.log(ChannelLogLevel.DEBUG, "[RLS Entry {0}] Throttled RouteLookup", request);
// Cache updated, but no need to call updateBalancingState because no RPCs were queued waiting
// on this result
return CachedRouteLookupResponse.backoffEntry(createBackOffEntry(
@ -288,27 +287,29 @@ final class CachingRlsLbClient {
}
final SettableFuture<RouteLookupResponse> response = SettableFuture.create();
io.grpc.lookup.v1.RouteLookupRequest routeLookupRequest = REQUEST_CONVERTER.convert(request);
logger.log(ChannelLogLevel.DEBUG, "Sending RouteLookupRequest: {0}", routeLookupRequest);
logger.log(ChannelLogLevel.DEBUG,
"[RLS Entry {0}] Starting RouteLookup: {1}", request, routeLookupRequest);
rlsStub.withDeadlineAfter(callTimeoutNanos, TimeUnit.NANOSECONDS)
.routeLookup(
routeLookupRequest,
new StreamObserver<io.grpc.lookup.v1.RouteLookupResponse>() {
@Override
public void onNext(io.grpc.lookup.v1.RouteLookupResponse value) {
logger.log(ChannelLogLevel.DEBUG, "Received RouteLookupResponse: {0}", value);
logger.log(ChannelLogLevel.DEBUG,
"[RLS Entry {0}] RouteLookup succeeded: {1}", request, value);
response.set(RESPONSE_CONVERTER.reverse().convert(value));
}
@Override
public void onError(Throwable t) {
logger.log(ChannelLogLevel.DEBUG, "Error looking up route:", t);
logger.log(ChannelLogLevel.DEBUG,
"[RLS Entry {0}] RouteLookup failed: {1}", request, t);
response.setException(t);
throttler.registerBackendResponse(true);
}
@Override
public void onCompleted() {
logger.log(ChannelLogLevel.DEBUG, "routeLookup call completed");
throttler.registerBackendResponse(false);
}
});
@ -323,13 +324,10 @@ final class CachingRlsLbClient {
*/
@CheckReturnValue
final CachedRouteLookupResponse get(final RouteLookupRequest request) {
logger.log(ChannelLogLevel.DEBUG, "Acquiring lock to get cached entry");
synchronized (lock) {
logger.log(ChannelLogLevel.DEBUG, "Acquired lock to get cached entry");
final CacheEntry cacheEntry;
cacheEntry = linkedHashLruCache.read(request);
if (cacheEntry == null) {
logger.log(ChannelLogLevel.DEBUG, "No cache entry found, making a new RLS request");
PendingCacheEntry pendingEntry = pendingCallCache.get(request);
if (pendingEntry != null) {
return CachedRouteLookupResponse.pendingResponse(pendingEntry);
@ -339,15 +337,12 @@ final class CachingRlsLbClient {
if (cacheEntry instanceof DataCacheEntry) {
// cache hit, initiate async-refresh if entry is staled
logger.log(ChannelLogLevel.DEBUG, "Cache hit for the request");
DataCacheEntry dataEntry = ((DataCacheEntry) cacheEntry);
if (dataEntry.isStaled(ticker.read())) {
logger.log(ChannelLogLevel.DEBUG, "Cache entry is stale");
dataEntry.maybeRefresh();
}
return CachedRouteLookupResponse.dataEntry((DataCacheEntry) cacheEntry);
}
logger.log(ChannelLogLevel.DEBUG, "Cache hit for a backup entry");
return CachedRouteLookupResponse.backoffEntry((BackoffCacheEntry) cacheEntry);
}
}
@ -409,8 +404,8 @@ final class CachingRlsLbClient {
RouteLookupRequest request, RouteLookupResponse routeLookupResponse) {
logger.log(
ChannelLogLevel.DEBUG,
"Transition to data cache: routeLookupResponse={0}",
routeLookupResponse);
"[RLS Entry {0}] Transition to data cache: routeLookupResponse={1}",
request, routeLookupResponse);
DataCacheEntry entry = new DataCacheEntry(request, routeLookupResponse);
// Constructor for DataCacheEntry causes updateBalancingState, but the picks can't happen until
// this cache update because the lock is held
@ -421,18 +416,19 @@ final class CachingRlsLbClient {
@GuardedBy("lock")
private BackoffCacheEntry createBackOffEntry(
RouteLookupRequest request, Status status, @Nullable BackoffPolicy backoffPolicy) {
logger.log(ChannelLogLevel.DEBUG, "Transition to back off: status={0}", status);
if (backoffPolicy == null) {
backoffPolicy = backoffProvider.get();
}
long delayNanos = backoffPolicy.nextBackoffNanos();
logger.log(
ChannelLogLevel.DEBUG,
"[RLS Entry {0}] Transition to back off: status={1}, delayNanos={2}",
request, status, delayNanos);
BackoffCacheEntry entry = new BackoffCacheEntry(request, status, backoffPolicy);
// Lock is held, so the task can't execute before the assignment
entry.scheduledFuture = scheduledExecutorService.schedule(
() -> refreshBackoffEntry(entry), delayNanos, TimeUnit.NANOSECONDS);
linkedHashLruCache.cacheAndClean(request, entry);
logger.log(ChannelLogLevel.DEBUG, "BackoffCacheEntry created with a delay of {0} nanos",
delayNanos);
return entry;
}
@ -443,7 +439,8 @@ final class CachingRlsLbClient {
// Future was previously cancelled
return;
}
logger.log(ChannelLogLevel.DEBUG, "Calling RLS for transition to pending");
logger.log(ChannelLogLevel.DEBUG,
"[RLS Entry {0}] Calling RLS for transition to pending", entry.request);
linkedHashLruCache.invalidate(entry.request);
asyncRlsCall(entry.request, entry.backoffPolicy);
}
@ -659,10 +656,10 @@ final class CachingRlsLbClient {
synchronized (lock) { // Lock is already held, but ErrorProne can't tell
if (pendingCallCache.containsKey(request)) {
// pending already requested
logger.log(ChannelLogLevel.DEBUG,
"A pending refresh request already created, no need to proceed with refresh");
return;
}
logger.log(ChannelLogLevel.DEBUG,
"[RLS Entry {0}] Cache entry is stale, refreshing", request);
asyncRlsCall(request, /* backoffPolicy= */ null);
}
}
@ -943,13 +940,10 @@ final class CachingRlsLbClient {
@Override
public void onStatusChanged(ConnectivityState newState) {
logger.log(ChannelLogLevel.DEBUG, "LB status changed to: {0}", newState);
if (prevState == ConnectivityState.TRANSIENT_FAILURE
&& newState == ConnectivityState.READY) {
logger.log(ChannelLogLevel.DEBUG, "Transitioning from TRANSIENT_FAILURE to READY");
logger.log(ChannelLogLevel.DEBUG, "Acquiring lock force refresh backoff cache entries");
synchronized (lock) {
logger.log(ChannelLogLevel.DEBUG, "Lock acquired for refreshing backoff cache entries");
for (CacheEntry value : linkedHashLruCache.values()) {
if (value instanceof BackoffCacheEntry) {
refreshBackoffEntry((BackoffCacheEntry) value);
@ -983,31 +977,22 @@ final class CachingRlsLbClient {
RouteLookupRequest request =
requestFactory.create(serviceName, methodName, args.getHeaders());
final CachedRouteLookupResponse response = CachingRlsLbClient.this.get(request);
logger.log(ChannelLogLevel.DEBUG,
"Got route lookup cache entry for service={0}, method={1}, headers={2}:\n {3}",
new Object[]{serviceName, methodName, args.getHeaders(), response});
if (response.getHeaderData() != null && !response.getHeaderData().isEmpty()) {
logger.log(ChannelLogLevel.DEBUG, "Updating RLS metadata from the RLS response headers");
Metadata headers = args.getHeaders();
headers.discardAll(RLS_DATA_KEY);
headers.put(RLS_DATA_KEY, response.getHeaderData());
}
String defaultTarget = lbPolicyConfig.getRouteLookupConfig().defaultTarget();
logger.log(ChannelLogLevel.DEBUG, "defaultTarget = {0}", defaultTarget);
boolean hasFallback = defaultTarget != null && !defaultTarget.isEmpty();
if (response.hasData()) {
logger.log(ChannelLogLevel.DEBUG, "RLS response has data, proceed with selecting a picker");
ChildPolicyWrapper childPolicyWrapper = response.getChildPolicyWrapper();
SubchannelPicker picker =
(childPolicyWrapper != null) ? childPolicyWrapper.getPicker() : null;
if (picker == null) {
logger.log(ChannelLogLevel.DEBUG,
"Child policy wrapper didn't return a picker, returning PickResult with no results");
return PickResult.withNoResult();
}
// Happy path
logger.log(ChannelLogLevel.DEBUG, "Returning PickResult");
PickResult pickResult = picker.pickSubchannel(args);
if (pickResult.hasResult()) {
helper.getMetricRecorder().addLongCounter(TARGET_PICKS_COUNTER, 1,
@ -1017,20 +1002,15 @@ final class CachingRlsLbClient {
}
return pickResult;
} else if (response.hasError()) {
logger.log(ChannelLogLevel.DEBUG, "RLS response has errors");
if (hasFallback) {
logger.log(ChannelLogLevel.DEBUG, "Using RLS fallback");
return useFallback(args);
}
logger.log(ChannelLogLevel.DEBUG, "No RLS fallback, returning PickResult with an error");
helper.getMetricRecorder().addLongCounter(FAILED_PICKS_COUNTER, 1,
Arrays.asList(helper.getChannelTarget(), lookupService), Collections.emptyList());
return PickResult.withError(
convertRlsServerStatus(response.getStatus(),
lbPolicyConfig.getRouteLookupConfig().lookupService()));
} else {
logger.log(ChannelLogLevel.DEBUG,
"RLS response had no data, return a PickResult with no data");
return PickResult.withNoResult();
}
}
@ -1067,13 +1047,11 @@ final class CachingRlsLbClient {
private void startFallbackChildPolicy() {
String defaultTarget = lbPolicyConfig.getRouteLookupConfig().defaultTarget();
logger.log(ChannelLogLevel.DEBUG, "starting fallback to {0}", defaultTarget);
logger.log(ChannelLogLevel.DEBUG, "Acquiring lock to start fallback child policy");
synchronized (lock) {
logger.log(ChannelLogLevel.DEBUG, "Acquired lock for starting fallback child policy");
if (fallbackChildPolicyWrapper != null) {
return;
}
logger.log(ChannelLogLevel.DEBUG, "starting fallback to {0}", defaultTarget);
fallbackChildPolicyWrapper = refCountedChildPolicyWrapperFactory.createOrGet(defaultTarget);
}
}
@ -1081,7 +1059,6 @@ final class CachingRlsLbClient {
// GuardedBy CachingRlsLbClient.lock
void close() {
synchronized (lock) { // Lock is already held, but ErrorProne can't tell
logger.log(ChannelLogLevel.DEBUG, "Closing RLS picker");
if (fallbackChildPolicyWrapper != null) {
refCountedChildPolicyWrapperFactory.release(fallbackChildPolicyWrapper);
}

View File

@ -50,12 +50,11 @@ final class RlsLoadBalancer extends LoadBalancer {
@Override
public Status acceptResolvedAddresses(ResolvedAddresses resolvedAddresses) {
logger.log(ChannelLogLevel.DEBUG, "Received resolution result: {0}", resolvedAddresses);
LbPolicyConfiguration lbPolicyConfiguration =
(LbPolicyConfiguration) resolvedAddresses.getLoadBalancingPolicyConfig();
checkNotNull(lbPolicyConfiguration, "Missing RLS LB config");
if (!lbPolicyConfiguration.equals(this.lbPolicyConfiguration)) {
logger.log(ChannelLogLevel.DEBUG, "A new RLS LB config received");
logger.log(ChannelLogLevel.DEBUG, "A new RLS LB config received: {0}", lbPolicyConfiguration);
boolean needToConnect = this.lbPolicyConfiguration == null
|| !this.lbPolicyConfiguration.getRouteLookupConfig().lookupService().equals(
lbPolicyConfiguration.getRouteLookupConfig().lookupService());
@ -80,22 +79,18 @@ final class RlsLoadBalancer extends LoadBalancer {
// not required.
this.lbPolicyConfiguration = lbPolicyConfiguration;
}
logger.log(ChannelLogLevel.DEBUG, "RLS LB accepted resolved addresses successfully");
return Status.OK;
}
@Override
public void requestConnection() {
logger.log(ChannelLogLevel.DEBUG, "connection requested from RLS LB");
if (routeLookupClient != null) {
logger.log(ChannelLogLevel.DEBUG, "requesting a connection from the routeLookupClient");
routeLookupClient.requestConnection();
}
}
@Override
public void handleNameResolutionError(final Status error) {
logger.log(ChannelLogLevel.DEBUG, "Received resolution error: {0}", error);
class ErrorPicker extends SubchannelPicker {
@Override
public PickResult pickSubchannel(PickSubchannelArgs args) {
@ -116,14 +111,11 @@ final class RlsLoadBalancer extends LoadBalancer {
routeLookupClient = null;
lbPolicyConfiguration = null;
}
logger.log(ChannelLogLevel.DEBUG,
"Updating balancing state to TRANSIENT_FAILURE with an error picker");
helper.updateBalancingState(ConnectivityState.TRANSIENT_FAILURE, new ErrorPicker());
}
@Override
public void shutdown() {
logger.log(ChannelLogLevel.DEBUG, "Rls lb shutdown");
if (routeLookupClient != null) {
logger.log(ChannelLogLevel.DEBUG, "closing the routeLookupClient because of RLS LB shutdown");
routeLookupClient.close();