xds: Align PriorityLB child selection with A56

The PriorityLB predates A56. tryNextPriority() now matches
ChoosePriority() from the gRFC.

The biggest change is waiting on CONNECTING children instead of failing
after the failOverTimer fires. The failOverTimer should be used to start
lower priorities more eagerly, but shouldn't cause the overall
connectivity state to become TRANSIENT_FAILURE on its own. The prior
behavior of creating the "Connection timeout for priority" failing
picker was particularly strange, because it didn't update child's
connectivity state. This previous behavior was creating errors because
of the failOverTimer with no way to diagnose what was going wrong.

b/428517222
This commit is contained in:
Eric Anderson 2025-07-22 10:00:52 -07:00
parent 6ff8ecac09
commit c4256add4d
2 changed files with 37 additions and 21 deletions

View File

@ -154,7 +154,8 @@ final class PriorityLoadBalancer extends LoadBalancer {
ChildLbState child =
new ChildLbState(priority, priorityConfigs.get(priority).ignoreReresolution);
children.put(priority, child);
updateOverallState(priority, CONNECTING, new FixedResultPicker(PickResult.withNoResult()));
// Child is created in CONNECTING with pending failOverTimer
updateOverallState(priority, child.connectivityState, child.picker);
// Calling the child's updateResolvedAddresses() can result in tryNextPriority() being
// called recursively. We need to be sure to be done with processing here before it is
// called.
@ -173,21 +174,23 @@ final class PriorityLoadBalancer extends LoadBalancer {
}
return Status.OK;
}
if (child.failOverTimer != null && child.failOverTimer.isPending()) {
if (child.failOverTimer.isPending()) {
updateOverallState(priority, child.connectivityState, child.picker);
return Status.OK; // Give priority i time to connect.
}
if (priority.equals(currentPriority) && child.connectivityState != TRANSIENT_FAILURE) {
// If the current priority is not changed into TRANSIENT_FAILURE, keep using it.
}
for (int i = 0; i < priorityNames.size(); i++) {
String priority = priorityNames.get(i);
ChildLbState child = children.get(priority);
if (child.connectivityState.equals(CONNECTING)) {
updateOverallState(priority, child.connectivityState, child.picker);
return Status.OK;
}
}
// TODO(zdapeng): Include error details of each priority.
logger.log(XdsLogLevel.DEBUG, "All priority failed");
String lastPriority = priorityNames.get(priorityNames.size() - 1);
SubchannelPicker errorPicker = children.get(lastPriority).picker;
updateOverallState(lastPriority, TRANSIENT_FAILURE, errorPicker);
ChildLbState child = children.get(lastPriority);
updateOverallState(lastPriority, child.connectivityState, child.picker);
return Status.OK;
}
@ -231,10 +234,7 @@ final class PriorityLoadBalancer extends LoadBalancer {
// The child is deactivated.
return;
}
picker = new FixedResultPicker(PickResult.withError(
Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority)));
logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority);
currentPriority = null; // reset currentPriority to guarantee failover happen
Status status = tryNextPriority();
if (!status.isOk()) {
// A child had a problem with the addresses/config. Request it to be refreshed

View File

@ -376,6 +376,7 @@ public class PriorityLoadBalancerTest {
assertThat(fooBalancers).hasSize(2);
assertThat(fooHelpers).hasSize(2);
LoadBalancer balancer1 = Iterables.getLast(fooBalancers);
Helper helper1 = Iterables.getLast(fooHelpers);
// p1 timeout, and fails over to p2
fakeClock.forwardTime(10, TimeUnit.SECONDS);
@ -423,14 +424,20 @@ public class PriorityLoadBalancerTest {
LoadBalancer balancer3 = Iterables.getLast(fooBalancers);
Helper helper3 = Iterables.getLast(fooHelpers);
// p3 timeout then the channel should go to TRANSIENT_FAILURE
// p3 timeout then the channel should stay in CONNECTING
fakeClock.forwardTime(10, TimeUnit.SECONDS);
assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout");
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
// p3 fails then the picker should have error status updated
// p3 fails then the picker should still be waiting on p1
helper3.updateBalancingState(
TRANSIENT_FAILURE,
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo"))));
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
// p1 fails then the picker should have error status updated to p3
helper1.updateBalancingState(
TRANSIENT_FAILURE,
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar"))));
assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo");
// p2 gets back to READY
@ -642,6 +649,7 @@ public class PriorityLoadBalancerTest {
assertThat(fooBalancers).hasSize(2);
assertThat(fooHelpers).hasSize(2);
LoadBalancer balancer1 = Iterables.getLast(fooBalancers);
Helper helper1 = Iterables.getLast(fooHelpers);
// p1 timeout, and fails over to p2
fakeClock.forwardTime(10, TimeUnit.SECONDS);
@ -677,14 +685,20 @@ public class PriorityLoadBalancerTest {
LoadBalancer balancer3 = Iterables.getLast(fooBalancers);
Helper helper3 = Iterables.getLast(fooHelpers);
// p3 timeout then the channel should go to TRANSIENT_FAILURE
// p3 timeout then the channel should stay in CONNECTING
fakeClock.forwardTime(10, TimeUnit.SECONDS);
assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout");
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
// p3 fails then the picker should have error status updated
// p3 fails then the picker should still be waiting on p1
helper3.updateBalancingState(
TRANSIENT_FAILURE,
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo"))));
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
// p1 fails then the picker should have error status updated to p3
helper1.updateBalancingState(
TRANSIENT_FAILURE,
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar"))));
assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo");
// p2 gets back to IDLE
@ -863,15 +877,17 @@ public class PriorityLoadBalancerTest {
}
private void assertCurrentPickerPicksSubchannel(Subchannel expectedSubchannelToPick) {
assertLatestConnectivityState(READY);
PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
assertThat(pickResult.getSubchannel()).isEqualTo(expectedSubchannelToPick);
assertCurrentPicker(READY, PickResult.withSubchannel(expectedSubchannelToPick));
}
private void assertCurrentPickerIsBufferPicker() {
assertLatestConnectivityState(IDLE);
assertCurrentPicker(IDLE, PickResult.withNoResult());
}
private void assertCurrentPicker(ConnectivityState state, PickResult result) {
assertLatestConnectivityState(state);
PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
assertThat(pickResult).isEqualTo(PickResult.withNoResult());
assertThat(pickResult).isEqualTo(result);
}
private Object newChildConfig(LoadBalancerProvider provider, Object config) {