-
Notifications
You must be signed in to change notification settings - Fork 1.1k
chore(spanner): add LatencyTracker interface and default implementation #12729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| /* | ||
| * Copyright 2026 Google LLC | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.google.cloud.spanner.spi.v1; | ||
|
|
||
| import com.google.api.core.BetaApi; | ||
| import com.google.api.core.InternalApi; | ||
| import com.google.common.base.Preconditions; | ||
| import java.time.Duration; | ||
| import java.util.concurrent.TimeUnit; | ||
| import javax.annotation.concurrent.GuardedBy; | ||
|
|
||
| /** | ||
| * Implementation of {@link LatencyTracker} using Exponentially Weighted Moving Average (EWMA). | ||
| * | ||
| * <p>Formula: $S_{i+1} = \alpha * new\_latency + (1 - \alpha) * S_i$ | ||
| * | ||
| * <p>This class is thread-safe. | ||
| */ | ||
| @InternalApi | ||
| @BetaApi | ||
| public class EwmaLatencyTracker implements LatencyTracker { | ||
|
|
||
| public static final double DEFAULT_ALPHA = 0.05; | ||
|
|
||
| private final double alpha; | ||
| private final Object lock = new Object(); | ||
|
|
||
| @GuardedBy("lock") | ||
| private double score; | ||
|
|
||
| @GuardedBy("lock") | ||
| private boolean initialized = false; | ||
|
|
||
| /** Creates a new tracker with the default alpha value of 0.05. */ | ||
| public EwmaLatencyTracker() { | ||
| this(DEFAULT_ALPHA); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a new tracker with the specified alpha value. | ||
| * | ||
| * @param alpha the smoothing factor, must be in the range (0, 1] | ||
| */ | ||
| public EwmaLatencyTracker(double alpha) { | ||
| Preconditions.checkArgument(alpha > 0.0 && alpha <= 1.0, "alpha must be in (0, 1]"); | ||
| this.alpha = alpha; | ||
| } | ||
|
|
||
| @Override | ||
| public double getScore() { | ||
| synchronized (lock) { | ||
| return initialized ? score : Double.MAX_VALUE; | ||
| } | ||
|
olavloite marked this conversation as resolved.
|
||
| } | ||
|
|
||
| @Override | ||
| public void update(Duration latency) { | ||
| long latencyMicros; | ||
| try { | ||
| latencyMicros = TimeUnit.MICROSECONDS.convert(latency.toNanos(), TimeUnit.NANOSECONDS); | ||
| } catch (ArithmeticException e) { | ||
| // Duration is too large to fit in nanoseconds (292+ years). | ||
| // Use Long.MAX_VALUE to give it the lowest possible priority. | ||
| latencyMicros = Long.MAX_VALUE; | ||
| } | ||
| synchronized (lock) { | ||
| if (!initialized) { | ||
| score = latencyMicros; | ||
| initialized = true; | ||
| } else { | ||
| score = alpha * latencyMicros + (1 - alpha) * score; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void recordError(Duration penalty) { | ||
| // Treat the error as a sample with high latency (penalty) | ||
| update(penalty); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| /* | ||
| * Copyright 2026 Google LLC | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.google.cloud.spanner.spi.v1; | ||
|
|
||
| import com.google.api.core.BetaApi; | ||
| import com.google.api.core.InternalApi; | ||
| import java.time.Duration; | ||
|
|
||
| /** | ||
| * Interface for tracking latency scores of Spanner servers. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: The abstraction is attached to the wrong identity unless you are very careful in the follow-up work. The doc wants a score “for a given spanner server”, but this branch introduces a generic tracker with no ownership model. In the current routing code, CachedTablet instances are reused across cache updates and can change serverAddress in place. If someone later stores the EWMA on CachedTablet, the latency history from the old server will bleed into the new one after a cache update. The stable identity in this codebase is the per-address endpoint cached, not the tablet object.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ack |
||
| * | ||
| * <p>Implementations must be thread-safe as instances may be shared across multiple concurrent | ||
| * operations. | ||
| */ | ||
| @InternalApi | ||
| @BetaApi | ||
| public interface LatencyTracker { | ||
|
|
||
| /** | ||
| * Returns the current latency score. | ||
| * | ||
| * @return the latency score, where lower is better. | ||
| */ | ||
| double getScore(); | ||
|
|
||
| /** | ||
| * Updates the latency score with a new observation. | ||
| * | ||
| * @param latency the observed latency. | ||
| */ | ||
| void update(Duration latency); | ||
|
|
||
| /** | ||
| * Records an error and applies a latency penalty. | ||
| * | ||
| * @param penalty the penalty to apply. | ||
| */ | ||
| void recordError(Duration penalty); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| /* | ||
| * Copyright 2026 Google LLC | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.google.cloud.spanner.spi.v1; | ||
|
|
||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertThrows; | ||
|
|
||
| import java.time.Duration; | ||
| import org.junit.Test; | ||
| import org.junit.runner.RunWith; | ||
| import org.junit.runners.JUnit4; | ||
|
|
||
| @RunWith(JUnit4.class) | ||
| public class EwmaLatencyTrackerTest { | ||
|
|
||
| @Test | ||
| public void testInitialization() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(); | ||
| tracker.update(Duration.ofNanos(100 * 1000)); | ||
| assertEquals(100.0, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
|
olavloite marked this conversation as resolved.
|
||
| @Test | ||
| public void testUninitializedScore() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(); | ||
| assertEquals(Double.MAX_VALUE, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
| @Test | ||
| public void testOverflowScore() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(); | ||
| tracker.update(Duration.ofSeconds(Long.MAX_VALUE)); | ||
| assertEquals((double) Long.MAX_VALUE, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
| @Test | ||
| public void testEwmaCalculation() { | ||
| double alpha = 0.5; | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(alpha); | ||
|
|
||
| tracker.update(Duration.ofNanos(100 * 1000)); // Initial score = 100 | ||
| assertEquals(100.0, tracker.getScore(), 0.001); | ||
|
|
||
| tracker.update(Duration.ofNanos(200 * 1000)); // Score = 0.5 * 200 + 0.5 * 100 = 150 | ||
| assertEquals(150.0, tracker.getScore(), 0.001); | ||
|
|
||
| tracker.update(Duration.ofNanos(300 * 1000)); // Score = 0.5 * 300 + 0.5 * 150 = 225 | ||
| assertEquals(225.0, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
| @Test | ||
| public void testDefaultAlpha() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(); | ||
| tracker.update(Duration.ofNanos(100 * 1000)); | ||
| tracker.update(Duration.ofNanos(200 * 1000)); | ||
|
|
||
| double expected = | ||
| EwmaLatencyTracker.DEFAULT_ALPHA * 200 + (1 - EwmaLatencyTracker.DEFAULT_ALPHA) * 100; | ||
| assertEquals(expected, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
| @Test | ||
| public void testRecordError() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(0.5); | ||
| tracker.update(Duration.ofNanos(100 * 1000)); | ||
|
|
||
| tracker.recordError(Duration.ofNanos(10000 * 1000)); // Score = 0.5 * 10000 + 0.5 * 100 = 5050 | ||
| assertEquals(5050.0, tracker.getScore(), 0.001); | ||
| } | ||
|
|
||
| @Test | ||
| public void testInvalidAlpha() { | ||
| assertThrows(IllegalArgumentException.class, () -> new EwmaLatencyTracker(0.0)); | ||
| assertThrows(IllegalArgumentException.class, () -> new EwmaLatencyTracker(1.1)); | ||
| assertThrows(IllegalArgumentException.class, () -> new EwmaLatencyTracker(-0.1)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAlphaOne() { | ||
| EwmaLatencyTracker tracker = new EwmaLatencyTracker(1.0); | ||
| tracker.update(Duration.ofNanos(100 * 1000)); | ||
| assertEquals(100.0, tracker.getScore(), 0.001); | ||
|
|
||
| tracker.update(Duration.ofNanos(200 * 1000)); | ||
| assertEquals(200.0, tracker.getScore(), 0.001); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getScore() returns Double.MAX_VALUE until the tracker has seen traffic. That means a new endpoint, or one recreated after eviction, will always lose against any sampled endpoint that has historical data. In other words, it never gets traffic, so it never learns. We should be probing / low-rate exploration so a replica can “come back to the game”; this implementation bakes in starvation unless some separate mechanism guarantees exploration.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, that is a good point. We will fix this in a follow-up PR in combination in the
ReplicaSelectorby allowing some of the traffic to just choose a random endpoint.