From 907b0030575622a5bb7fb32fb630ada10c51755d Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 21 Aug 2025 12:32:22 -0400 Subject: [PATCH 01/25] Enabling PPAF dynamically. --- .../implementation/GlobalEndpointManager.java | 7 +++++++ .../implementation/RxDocumentClientImpl.java | 18 +++++++++++++++++- .../implementation/UserAgentContainer.java | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index ee7952048db0..7f0894a62ac7 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -46,6 +46,7 @@ public class GlobalEndpointManager implements AutoCloseable { private volatile boolean isClosed; private volatile DatabaseAccount latestDatabaseAccount; private final AtomicBoolean hasThinClientReadLocations = new AtomicBoolean(false); + private final AtomicBoolean lastRecordedPerPartitionAutomaticFailoverEnabled = new AtomicBoolean(false); private final ReentrantReadWriteLock.WriteLock databaseAccountWriteLock; @@ -53,6 +54,8 @@ public class GlobalEndpointManager implements AutoCloseable { private volatile Throwable latestDatabaseRefreshError; + private volatile Function perPartitionAutomaticFailoverConfigModifier; + public void setLatestDatabaseRefreshError(Throwable latestDatabaseRefreshError) { this.latestDatabaseRefreshError = latestDatabaseRefreshError; } @@ -419,4 +422,8 @@ private List getEffectivePreferredRegions() { this.databaseAccountReadLock.unlock(); } } + + public void setPerPartitionAutomaticFailoverConfigModifier(Function perPartitionAutomaticFailoverConfigModifier) { + this.perPartitionAutomaticFailoverConfigModifier = perPartitionAutomaticFailoverConfigModifier; + } } diff 
--git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index fa48e6dcc037..019b0f04fa88 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -122,6 +122,7 @@ import java.util.NoSuchElementException; import java.util.Set; import java.util.UUID; +import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; @@ -276,6 +277,7 @@ public class RxDocumentClientImpl implements AsyncDocumentClient, IAuthorization private List operationPolicies; private final AtomicReference cachedCosmosAsyncClientSnapshot; private CosmosEndToEndOperationLatencyPolicyConfig ppafEnforcedE2ELatencyPolicyConfigForReads; + private Function perPartitionAutomaticFailoverConfigModifier; public RxDocumentClientImpl(URI serviceEndpoint, String masterKeyOrResourceToken, @@ -739,6 +741,13 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func this.globalEndpointManager, this.reactorHttpClient); + this.perPartitionAutomaticFailoverConfigModifier = (databaseAccount -> { + this.initializePerPartitionFailover(databaseAccount); + this.addUserAgentSuffix(this.userAgentContainer, EnumSet.allOf(UserAgentFeatureFlags.class)); + return null; + }); + + this.globalEndpointManager.setPerPartitionAutomaticFailoverConfigModifier(this.perPartitionAutomaticFailoverConfigModifier); this.globalEndpointManager.init(); DatabaseAccount databaseAccountSnapshot = this.initializeGatewayConfigurationReader(); @@ -804,7 +813,7 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func && readConsistencyStrategy != ReadConsistencyStrategy.SESSION && 
!sessionCapturingOverrideEnabled); this.sessionContainer.setDisableSessionCapturing(updatedDisableSessionCapturing); - this.initializePerPartitionFailover(databaseAccountSnapshot); +// this.initializePerPartitionFailover(databaseAccountSnapshot); this.addUserAgentSuffix(this.userAgentContainer, EnumSet.allOf(UserAgentFeatureFlags.class)); } catch (Exception e) { logger.error("unexpected failure in initializing client.", e); @@ -7794,6 +7803,13 @@ private void initializePerPartitionCircuitBreaker() { logger.warn("Per-Partition Circuit Breaker is enabled by default when Per-Partition Automatic Failover is enabled."); System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": true}"); } + } else { + PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig = Configs.getPartitionLevelCircuitBreakerConfig(); + + if (partitionLevelCircuitBreakerConfig != null && !partitionLevelCircuitBreakerConfig.isPartitionLevelCircuitBreakerEnabled()) { + logger.warn("Per-Partition Circuit Breaker is enabled by default when Per-Partition Automatic Failover is enabled."); + System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": false}"); + } } this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker.resetCircuitBreakerConfig(); diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java index 767d38e8f124..13f17a705760 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java @@ -17,7 +17,7 @@ public class UserAgentContainer { private final int maxSuffixLength; private final String baseUserAgent; private String suffix; - private String userAgent; + private volatile 
String userAgent; public final static String AZSDK_USERAGENT_PREFIX = "azsdk-java-"; public final static String BASE_USER_AGENT_STRING = Utils.getUserAgent( From 88d24c02a54d0a2f9b49c7ce15de35fbc207833d Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 21 Aug 2025 13:18:41 -0400 Subject: [PATCH 02/25] Ensure atomicity when opt-ins for PPAF, PPCB and hedging are being modified. --- .../cosmos/implementation/GlobalEndpointManager.java | 8 ++++++++ .../cosmos/implementation/RxDocumentClientImpl.java | 10 ++++------ .../cosmos/implementation/UserAgentContainer.java | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index 7f0894a62ac7..17ae26831dd9 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -375,6 +375,14 @@ private Mono getDatabaseAccountAsync(URI serviceEndpoint) { Collection thinClientReadLocations = databaseAccount.getThinClientReadableLocations(); this.hasThinClientReadLocations.set(thinClientReadLocations != null && !thinClientReadLocations.isEmpty()); + Boolean currentPerPartitionAutomaticFailoverEnabled = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); + + if (currentPerPartitionAutomaticFailoverEnabled != null && this.lastRecordedPerPartitionAutomaticFailoverEnabled.get() != currentPerPartitionAutomaticFailoverEnabled) { + this.lastRecordedPerPartitionAutomaticFailoverEnabled.set(currentPerPartitionAutomaticFailoverEnabled); + if (this.perPartitionAutomaticFailoverConfigModifier != null) { + this.perPartitionAutomaticFailoverConfigModifier.apply(databaseAccount); + } + } this.setLatestDatabaseRefreshError(null); } finally { diff --git 
a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 019b0f04fa88..e1911406d720 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -7817,12 +7817,10 @@ private void initializePerPartitionCircuitBreaker() { } private void enableAvailabilityStrategyForReads() { - if (this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover.isPerPartitionAutomaticFailoverEnabled()) { - this.ppafEnforcedE2ELatencyPolicyConfigForReads = this.evaluatePpafEnforcedE2eLatencyPolicyCfgForReads( - this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover, - this.connectionPolicy - ); - } + this.ppafEnforcedE2ELatencyPolicyConfigForReads = this.evaluatePpafEnforcedE2eLatencyPolicyCfgForReads( + this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover, + this.connectionPolicy + ); } public boolean useThinClient() { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java index 13f17a705760..e913a15a1638 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java @@ -17,7 +17,7 @@ public class UserAgentContainer { private final int maxSuffixLength; private final String baseUserAgent; private String suffix; - private volatile String userAgent; + private String userAgent; public final static String AZSDK_USERAGENT_PREFIX = "azsdk-java-"; public final static String BASE_USER_AGENT_STRING = Utils.getUserAgent( @@ -39,7 +39,7 @@ public String getSuffix() { return this.suffix; 
} - public void setFeatureEnabledFlagsAsSuffix(Set userAgentFeatureFlags) { + public synchronized void setFeatureEnabledFlagsAsSuffix(Set userAgentFeatureFlags) { if (userAgentFeatureFlags == null || userAgentFeatureFlags.isEmpty()) { return; } From 740cded9c05ef4437eec6bd7cb9b8dc0dd52f4a8 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 21 Aug 2025 19:32:46 -0400 Subject: [PATCH 03/25] Ensure atomicity when opt-ins for PPAF, PPCB and hedging are being modified. --- ...PerPartitionAutomaticFailoverE2ETests.java | 374 +++++++++++++++++- .../directconnectivity/ReflectionUtils.java | 9 + .../implementation/GlobalEndpointManager.java | 7 +- .../implementation/RxDocumentClientImpl.java | 1 - .../implementation/UserAgentContainer.java | 3 +- ...nagerForPerPartitionAutomaticFailover.java | 6 + ...tManagerForPerPartitionCircuitBreaker.java | 5 + 7 files changed, 392 insertions(+), 13 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java index 4bd5006e53ba..966a707f50fe 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java @@ -7,6 +7,7 @@ import com.azure.cosmos.implementation.ConnectionPolicy; import com.azure.cosmos.implementation.DatabaseAccount; import com.azure.cosmos.implementation.DatabaseAccountLocation; +import com.azure.cosmos.implementation.DatabaseAccountManagerInternal; import com.azure.cosmos.implementation.ForbiddenException; import com.azure.cosmos.implementation.GlobalEndpointManager; import com.azure.cosmos.implementation.GoneException; @@ -65,6 +66,7 @@ import org.testng.annotations.Factory; import org.testng.annotations.Test; import reactor.core.publisher.Mono; +import 
reactor.core.publisher.Flux; import java.net.SocketTimeoutException; import java.net.URI; @@ -80,6 +82,7 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; import static org.assertj.core.api.Assertions.assertThat; @@ -156,6 +159,103 @@ public PerPartitionAutomaticFailoverE2ETests(CosmosClientBuilder clientBuilder) super(clientBuilder); } + @DataProvider(name = "ppafDynamicEnablement503Only") + public Object[][] ppafDynamicEnablement503Only() { + + // When PPAF is disabled -> expect no success, single region contacted (no failover) + ExpectedResponseCharacteristics expectedWhenDisabled = new ExpectedResponseCharacteristics() + .setExpectedMinRetryCount(0) + .setShouldFinalResponseHaveSuccess(false) + .setExpectedRegionsContactedCount(1); + + // When PPAF is enabled -> expect success after at least one retry, two regions contacted (failover from the unhealthy region to a healthy one) + ExpectedResponseCharacteristics expectedWhenEnabled = new ExpectedResponseCharacteristics() + .setExpectedMinRetryCount(1) + .setShouldFinalResponseHaveSuccess(true) + .setExpectedRegionsContactedCount(2); + + return new Object[][]{ + { + "Dynamic enablement: CREATE with SERVICE_UNAVAILABLE/503", + OperationType.Create, + HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.CREATED, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + }, + { + "Dynamic enablement: REPLACE with SERVICE_UNAVAILABLE/503", + OperationType.Replace, + HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.OK, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + }, + { + "Dynamic enablement: UPSERT with SERVICE_UNAVAILABLE/503", + OperationType.Upsert, +
HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.OK, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + }, + { + "Dynamic enablement: DELETE with SERVICE_UNAVAILABLE/503", + OperationType.Delete, + HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.NOT_MODIFIED, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + }, + { + "Dynamic enablement: PATCH with SERVICE_UNAVAILABLE/503", + OperationType.Patch, + HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.OK, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + }, + { + "Dynamic enablement: BATCH with SERVICE_UNAVAILABLE/503", + OperationType.Batch, + HttpConstants.StatusCodes.SERVICE_UNAVAILABLE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_503, + HttpConstants.StatusCodes.OK, + expectedWhenDisabled, + expectedWhenEnabled, + false, + false, + false, + ALL_CONNECTION_MODES + } + }; + } + @BeforeClass(groups = {"multi-region"}) public void beforeClass() { CosmosAsyncClient cosmosAsyncClient = getClientBuilder().buildAsyncClient(); @@ -1145,6 +1245,255 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( } } } + @Test(groups = {"multi-region"}, dataProvider = "ppafDynamicEnablement503Only") + public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamicEnablement( + String testType, + OperationType operationType, + int errorStatusCodeToMockFromPartitionInUnhealthyRegion, + int errorSubStatusCodeToMockFromPartitionInUnhealthyRegion, + int successStatusCode, + ExpectedResponseCharacteristics expectedResponseCharacteristicsWhenPpafIsDisabled, + ExpectedResponseCharacteristics 
expectedResponseCharacteristicsWhenPpafIsEnabled, + boolean shouldThrowNetworkError, + boolean shouldThrowReadTimeoutExceptionWhenNetworkError, + boolean shouldUseE2ETimeout, + Set allowedConnectionModes) { + + ConnectionPolicy connectionPolicy = COSMOS_CLIENT_BUILDER_ACCESSOR.getConnectionPolicy(getClientBuilder()); + ConnectionMode connectionMode = connectionPolicy.getConnectionMode(); + + if (!allowedConnectionModes.contains(connectionMode)) { + throw new SkipException(String.format("Test with type : %s not eligible for specified connection mode %s.", testType, connectionMode)); + } + + if (connectionMode == ConnectionMode.DIRECT) { + TransportClient transportClientMock = Mockito.mock(TransportClient.class); + List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; + Map readableRegionNameToEndpoint = this.accountLevelLocationReadableLocationContext.regionNameToEndpoint; + Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); + + try { + CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); + + if (operationType.equals(OperationType.Batch) && shouldUseE2ETimeout) { + cosmosClientBuilder.endToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY); + } + + CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); + cosmosAsyncClientValueHolder.v = asyncClient; + + CosmosAsyncContainer asyncContainer = asyncClient + .getDatabase(this.sharedDatabase.getId()) + .getContainer(this.sharedSinglePartitionContainer.getId()); + + RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); + + // Swap owner on GlobalEndpointManager to return database accounts with toggled PPAF enablement + GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + + 
AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.FALSE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + StoreClient storeClient = ReflectionUtils.getStoreClient(rxDocumentClient); + ReplicatedResourceClient replicatedResourceClient = ReflectionUtils.getReplicatedResourceClient(storeClient); + ConsistencyReader consistencyReader = ReflectionUtils.getConsistencyReader(replicatedResourceClient); + StoreReader storeReader = ReflectionUtils.getStoreReader(consistencyReader); + + ConsistencyWriter consistencyWriter = ReflectionUtils.getConsistencyWriter(replicatedResourceClient); + Utils.ValueHolder> partitionKeyRangesForContainer + = getPartitionKeyRangesForContainer(asyncContainer, rxDocumentClient).block(); + + assertThat(partitionKeyRangesForContainer).isNotNull(); + assertThat(partitionKeyRangesForContainer.v).isNotNull(); + assertThat(partitionKeyRangesForContainer.v.size()).isGreaterThanOrEqualTo(1); + + PartitionKeyRange partitionKeyRangeWithIssues = partitionKeyRangesForContainer.v.get(0); + + assertThat(preferredRegions).isNotNull(); + assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); + + String regionWithIssues = preferredRegions.get(0); + RegionalRoutingContext regionalRoutingContextWithIssues = new RegionalRoutingContext(new URI(readableRegionNameToEndpoint.get(regionWithIssues))); + + ReflectionUtils.setTransportClient(storeReader, transportClientMock); + ReflectionUtils.setTransportClient(consistencyWriter, transportClientMock); + + setupTransportClientToReturnSuccessResponse(transportClientMock, constructStoreResponse(operationType, successStatusCode)); + + CosmosException cosmosException = createCosmosException( + errorStatusCodeToMockFromPartitionInUnhealthyRegion, + errorSubStatusCodeToMockFromPartitionInUnhealthyRegion); + + 
setupTransportClientToThrowCosmosException( + transportClientMock, + partitionKeyRangeWithIssues, + regionalRoutingContextWithIssues, + cosmosException); + + TestItem testItem = TestItem.createNewItem(); + + Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); + + OperationInvocationParamsWrapper operationInvocationParamsWrapper = new OperationInvocationParamsWrapper(); + operationInvocationParamsWrapper.asyncContainer = asyncContainer; + operationInvocationParamsWrapper.createdTestItem = testItem; + operationInvocationParamsWrapper.itemRequestOptions = shouldUseE2ETimeout ? new CosmosItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosItemRequestOptions(); + operationInvocationParamsWrapper.patchItemRequestOptions = shouldUseE2ETimeout ? new CosmosPatchItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosPatchItemRequestOptions(); + + // Phase 1: PPAF disabled -> expect failure + ppafEnabledRef.set(Boolean.FALSE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + ResponseWrapper responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); + + // Phase 2: PPAF enabled -> expect success + ppafEnabledRef.set(Boolean.TRUE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + ResponseWrapper responseWithPpafEnabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + this.validateExpectedResponseCharacteristics.accept(responseWithPpafEnabled, expectedResponseCharacteristicsWhenPpafIsEnabled); + + // Phase 3: PPAF disabled -> expect failure again + ppafEnabledRef.set(Boolean.FALSE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + 
this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); + } catch (Exception e) { + Assertions.fail("The test ran into an exception {}", e); + } finally { + safeClose(cosmosAsyncClientValueHolder.v); + } + } + + if (connectionMode == ConnectionMode.GATEWAY) { + HttpClient mockedHttpClient = Mockito.mock(HttpClient.class); + List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; + Map readableRegionNameToEndpoint = this.accountLevelLocationReadableLocationContext.regionNameToEndpoint; + Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); + + try { + CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); + + if (operationType.equals(OperationType.Batch) && shouldUseE2ETimeout) { + cosmosClientBuilder.endToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY); + } + + CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); + cosmosAsyncClientValueHolder.v = asyncClient; + + CosmosAsyncContainer asyncContainer = asyncClient + .getDatabase(this.sharedDatabase.getId()) + .getContainer(this.sharedSinglePartitionContainer.getId()); + + // populates collection cache and pkrange cache + asyncContainer.getFeedRanges().block(); + + RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); + RxStoreModel rxStoreModel = ReflectionUtils.getGatewayProxy(rxDocumentClient); + + GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + + AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.FALSE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + 
ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + DatabaseAccount databaseAccountForResponses = globalEndpointManager.getLatestDatabaseAccount(); + if (databaseAccountForResponses == null) { + // Ensure we have an initial snapshot + globalEndpointManager.refreshLocationAsync(null, true).block(); + databaseAccountForResponses = globalEndpointManager.getLatestDatabaseAccount(); + } + + assertThat(preferredRegions).isNotNull(); + assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); + + String regionWithIssues = preferredRegions.get(0); + URI locationEndpointWithIssues = new URI(readableRegionNameToEndpoint.get(regionWithIssues) + "dbs/" + this.sharedDatabase.getId() + "/colls/" + this.sharedSinglePartitionContainer.getId() + "/docs"); + + ReflectionUtils.setGatewayHttpClient(rxStoreModel, mockedHttpClient); + + setupHttpClientToReturnSuccessResponse(mockedHttpClient, operationType, databaseAccountForResponses, successStatusCode); + + CosmosException cosmosException = createCosmosException( + errorStatusCodeToMockFromPartitionInUnhealthyRegion, + errorSubStatusCodeToMockFromPartitionInUnhealthyRegion); + + setupHttpClientToThrowCosmosException( + mockedHttpClient, + locationEndpointWithIssues, + cosmosException, + shouldThrowNetworkError, + shouldThrowReadTimeoutExceptionWhenNetworkError, + shouldUseE2ETimeout); + + TestItem testItem = TestItem.createNewItem(); + + Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); + + OperationInvocationParamsWrapper operationInvocationParamsWrapper = new OperationInvocationParamsWrapper(); + operationInvocationParamsWrapper.asyncContainer = asyncContainer; + operationInvocationParamsWrapper.createdTestItem = testItem; + operationInvocationParamsWrapper.itemRequestOptions = shouldUseE2ETimeout ? 
new CosmosItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosItemRequestOptions(); + operationInvocationParamsWrapper.patchItemRequestOptions = shouldUseE2ETimeout ? new CosmosPatchItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosPatchItemRequestOptions(); + + // Phase 1: PPAF disabled -> expect failure + ppafEnabledRef.set(Boolean.FALSE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + ResponseWrapper responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); + + // Phase 2: PPAF enabled -> expect success + ppafEnabledRef.set(Boolean.TRUE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + ResponseWrapper responseWithPpafEnabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + this.validateExpectedResponseCharacteristics.accept(responseWithPpafEnabled, expectedResponseCharacteristicsWhenPpafIsEnabled); + + // Phase 3: PPAF disabled -> expect failure again + ppafEnabledRef.set(Boolean.FALSE); + globalEndpointManager.refreshLocationAsync(null, true).block(); + responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); + this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); + } catch (Exception e) { + Assertions.fail("The test ran into an exception {}", e); + } finally { + safeClose(cosmosAsyncClientValueHolder.v); + } + } + } + + private static class DelegatingDatabaseAccountManagerInternal implements DatabaseAccountManagerInternal { + private final DatabaseAccountManagerInternal delegate; + private final AtomicReference ppafEnabledRef; + + DelegatingDatabaseAccountManagerInternal(DatabaseAccountManagerInternal delegate, AtomicReference
ppafEnabledRef) { + this.delegate = delegate; + this.ppafEnabledRef = ppafEnabledRef; + } + + @Override + public Flux getDatabaseAccountFromEndpoint(URI endpoint) { + return delegate.getDatabaseAccountFromEndpoint(endpoint) + .map(dbAccount -> { + Boolean enabled = ppafEnabledRef.get(); + if (enabled != null) { + dbAccount.setIsPerPartitionFailoverBehaviorEnabled(enabled); + } + return dbAccount; + }); + } + + @Override + public ConnectionPolicy getConnectionPolicy() { + return delegate.getConnectionPolicy(); + } + + @Override + public URI getServiceEndpoint() { + return delegate.getServiceEndpoint(); + } + } private void setupTransportClientToThrowCosmosException( TransportClient transportClientMock, @@ -1330,9 +1679,10 @@ private StoreResponse constructStoreResponse(OperationType operationType, int st private static class AccountLevelLocationContext { - private final List serviceOrderedReadableRegions; - private final List serviceOrderedWriteableRegions; - private final Map regionNameToEndpoint; + private final List serviceOrderedReadableRegions; + @SuppressWarnings("unused") + private final List serviceOrderedWriteableRegions; + private final Map regionNameToEndpoint; public AccountLevelLocationContext( List serviceOrderedReadableRegions, @@ -1704,7 +2054,8 @@ private static class FakeBatchResponse { private String retryAfterMilliseconds; - public int getStatusCode() { + @SuppressWarnings("unused") + public int getStatusCode() { return statusCode; } @@ -1713,7 +2064,8 @@ public FakeBatchResponse setStatusCode(int statusCode) { return this; } - public int getSubStatusCode() { + @SuppressWarnings("unused") + public int getSubStatusCode() { return subStatusCode; } @@ -1722,7 +2074,8 @@ public FakeBatchResponse setSubStatusCode(int subStatusCode) { return this; } - public double getRequestCharge() { + @SuppressWarnings("unused") + public double getRequestCharge() { return requestCharge; } @@ -1731,7 +2084,8 @@ public FakeBatchResponse setRequestCharge(double 
requestCharge) { return this; } - public String geteTag() { + @SuppressWarnings("unused") + public String geteTag() { return eTag; } @@ -1740,7 +2094,8 @@ public FakeBatchResponse seteTag(String eTag) { return this; } - public Object getResourceBody() { + @SuppressWarnings("unused") + public Object getResourceBody() { return resourceBody; } @@ -1749,7 +2104,8 @@ public FakeBatchResponse setResourceBody(Object resourceBody) { return this; } - public String getRetryAfterMilliseconds() { + @SuppressWarnings("unused") + public String getRetryAfterMilliseconds() { return retryAfterMilliseconds; } diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/ReflectionUtils.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/ReflectionUtils.java index 2fe1cef5a869..7678cb677f9b 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/ReflectionUtils.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/ReflectionUtils.java @@ -10,6 +10,7 @@ import com.azure.cosmos.CosmosClientBuilder; import com.azure.cosmos.implementation.ApiType; import com.azure.cosmos.implementation.AsyncDocumentClient; +import com.azure.cosmos.implementation.DatabaseAccountManagerInternal; import com.azure.cosmos.implementation.ClientSideRequestStatistics; import com.azure.cosmos.implementation.Configs; import com.azure.cosmos.implementation.ConnectionPolicy; @@ -247,6 +248,14 @@ public static GlobalEndpointManager getGlobalEndpointManager(RxDocumentClientImp return get(GlobalEndpointManager.class, rxDocumentClient, "globalEndpointManager"); } + public static DatabaseAccountManagerInternal getGlobalEndpointManagerOwner(GlobalEndpointManager globalEndpointManager) { + return get(DatabaseAccountManagerInternal.class, globalEndpointManager, "owner"); + } + + public static void 
setGlobalEndpointManagerOwner(GlobalEndpointManager globalEndpointManager, DatabaseAccountManagerInternal newOwner) { + set(globalEndpointManager, newOwner, "owner"); + } + public static void setThinProxy(RxDocumentClientImpl client, RxStoreModel storeModel) { set(client, storeModel, "thinProxy"); } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index 17ae26831dd9..dd66d69a7425 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -21,6 +21,7 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.Objects; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Function; @@ -377,9 +378,11 @@ private Mono getDatabaseAccountAsync(URI serviceEndpoint) { this.hasThinClientReadLocations.set(thinClientReadLocations != null && !thinClientReadLocations.isEmpty()); Boolean currentPerPartitionAutomaticFailoverEnabled = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); - if (currentPerPartitionAutomaticFailoverEnabled != null && this.lastRecordedPerPartitionAutomaticFailoverEnabled.get() != currentPerPartitionAutomaticFailoverEnabled) { - this.lastRecordedPerPartitionAutomaticFailoverEnabled.set(currentPerPartitionAutomaticFailoverEnabled); + if (!Objects.equals(currentPerPartitionAutomaticFailoverEnabled, this.lastRecordedPerPartitionAutomaticFailoverEnabled.get())) { + this.lastRecordedPerPartitionAutomaticFailoverEnabled.set(Boolean.TRUE.equals(currentPerPartitionAutomaticFailoverEnabled)); + if (this.perPartitionAutomaticFailoverConfigModifier != null) { + logger.warn("Per partition automatic failover enabled: {}, applying 
modifier", currentPerPartitionAutomaticFailoverEnabled); this.perPartitionAutomaticFailoverConfigModifier.apply(databaseAccount); } } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index e1911406d720..7e50f17b4adc 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -122,7 +122,6 @@ import java.util.NoSuchElementException; import java.util.Set; import java.util.UUID; -import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java index e913a15a1638..6bd5ee911ee5 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java @@ -4,6 +4,7 @@ package com.azure.cosmos.implementation; import java.text.Normalizer; +import java.util.Locale; import java.util.Set; import java.util.regex.Pattern; @@ -50,7 +51,7 @@ public synchronized void setFeatureEnabledFlagsAsSuffix(Set Date: Thu, 21 Aug 2025 19:37:44 -0400 Subject: [PATCH 04/25] Clear state (PPAF + PPCB) when PPAF config modifier callback is invoked. 
--- .../cosmos/implementation/RxDocumentClientImpl.java | 10 ++++++++-- ...ndpointManagerForPerPartitionAutomaticFailover.java | 3 ++- ...onEndpointManagerForPerPartitionCircuitBreaker.java | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 7e50f17b4adc..883ce9c6649b 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -7805,8 +7805,8 @@ private void initializePerPartitionCircuitBreaker() { } else { PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig = Configs.getPartitionLevelCircuitBreakerConfig(); - if (partitionLevelCircuitBreakerConfig != null && !partitionLevelCircuitBreakerConfig.isPartitionLevelCircuitBreakerEnabled()) { - logger.warn("Per-Partition Circuit Breaker is enabled by default when Per-Partition Automatic Failover is enabled."); + if (partitionLevelCircuitBreakerConfig != null && partitionLevelCircuitBreakerConfig.isPartitionLevelCircuitBreakerEnabled()) { + logger.warn("Per-Partition Circuit Breaker is disabled by default when Per-Partition Automatic Failover is disabled."); System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": false}"); } } @@ -7820,6 +7820,12 @@ private void enableAvailabilityStrategyForReads() { this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover, this.connectionPolicy ); + + if (this.ppafEnforcedE2ELatencyPolicyConfigForReads != null) { + logger.warn("Per-Partition Automatic Failover enforced E2E Latency Policy for reads is enabled."); + } else { + logger.warn("Per-Partition Automatic Failover enforced E2E Latency Policy for reads is disabled."); + } } public 
boolean useThinClient() { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionAutomaticFailover/GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionAutomaticFailover/GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover.java index f1e338c86530..c276bfebf5da 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionAutomaticFailover/GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionAutomaticFailover/GlobalPartitionEndpointManagerForPerPartitionAutomaticFailover.java @@ -325,8 +325,9 @@ public boolean isPerPartitionAutomaticFailoverApplicable(RxDocumentServiceReques return false; } - public void resetPerPartitionAutomaticFailoverEnabled(boolean isPerPartitionAutomaticFailoverEnabled) { + public synchronized void resetPerPartitionAutomaticFailoverEnabled(boolean isPerPartitionAutomaticFailoverEnabled) { this.isPerPartitionAutomaticFailoverEnabled.set(isPerPartitionAutomaticFailoverEnabled); + this.clear(); } public void clear() { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/GlobalPartitionEndpointManagerForPerPartitionCircuitBreaker.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/GlobalPartitionEndpointManagerForPerPartitionCircuitBreaker.java index f187a5a22bc8..b829294b5027 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/GlobalPartitionEndpointManagerForPerPartitionCircuitBreaker.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/GlobalPartitionEndpointManagerForPerPartitionCircuitBreaker.java @@ -563,9 +563,11 @@ public synchronized void 
resetCircuitBreakerConfig() { this.locationSpecificHealthContextTransitionHandler = new LocationSpecificHealthContextTransitionHandler(this.consecutiveExceptionBasedCircuitBreaker); + + this.clear(); } - public synchronized void clear() { + private void clear() { this.partitionKeyRangeToLocationSpecificUnavailabilityInfo.clear(); this.regionalRoutingContextToRegion.clear(); } From a033bc8cf98c1e8660a8155323d6315cbf9f2748 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Wed, 3 Sep 2025 11:40:15 -0400 Subject: [PATCH 05/25] Updated CHANGELOG.md --- .../implementation/RxDocumentClientImpl.java | 16 ++++++++++++---- .../implementation/UserAgentFeatureFlags.java | 5 ++++- ...tManagerForPerPartitionAutomaticFailover.java | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 883ce9c6649b..c77b2e7f0e93 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -276,7 +276,7 @@ public class RxDocumentClientImpl implements AsyncDocumentClient, IAuthorization private List operationPolicies; private final AtomicReference cachedCosmosAsyncClientSnapshot; private CosmosEndToEndOperationLatencyPolicyConfig ppafEnforcedE2ELatencyPolicyConfigForReads; - private Function perPartitionAutomaticFailoverConfigModifier; + private Function perPartitionFailoverConfigModifier; public RxDocumentClientImpl(URI serviceEndpoint, String masterKeyOrResourceToken, @@ -740,13 +740,14 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func this.globalEndpointManager, this.reactorHttpClient); - this.perPartitionAutomaticFailoverConfigModifier = (databaseAccount -> { + 
this.perPartitionFailoverConfigModifier + = (databaseAccount -> { this.initializePerPartitionFailover(databaseAccount); this.addUserAgentSuffix(this.userAgentContainer, EnumSet.allOf(UserAgentFeatureFlags.class)); return null; }); - this.globalEndpointManager.setPerPartitionAutomaticFailoverConfigModifier(this.perPartitionAutomaticFailoverConfigModifier); + this.globalEndpointManager.setPerPartitionAutomaticFailoverConfigModifier(this.perPartitionFailoverConfigModifier); this.globalEndpointManager.init(); DatabaseAccount databaseAccountSnapshot = this.initializeGatewayConfigurationReader(); @@ -812,7 +813,6 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func && readConsistencyStrategy != ReadConsistencyStrategy.SESSION && !sessionCapturingOverrideEnabled); this.sessionContainer.setDisableSessionCapturing(updatedDisableSessionCapturing); -// this.initializePerPartitionFailover(databaseAccountSnapshot); this.addUserAgentSuffix(this.userAgentContainer, EnumSet.allOf(UserAgentFeatureFlags.class)); } catch (Exception e) { logger.error("unexpected failure in initializing client.", e); @@ -1407,6 +1407,14 @@ private void addUserAgentSuffix(UserAgentContainer userAgentContainer, Set Date: Wed, 3 Sep 2025 11:46:49 -0400 Subject: [PATCH 06/25] Updated CHANGELOG.md --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 64a10ec8a1c1..5ab4b853974e 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,6 +3,7 @@ ### 4.74.0-beta.1 (Unreleased) #### Features Added +* Enabled `CosmosClient` to support per-partition automatic failover dynamically without the need to restart the application. 
- See [PR 46477](https://github.com/Azure/azure-sdk-for-java/pull/46477) #### Breaking Changes From 8126742374a98c1a45346345d60ebba8b4b46764 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 4 Sep 2025 17:04:17 -0400 Subject: [PATCH 07/25] Adding gateway response from non-responsive region in Gateway mode. --- ...PerPartitionAutomaticFailoverE2ETests.java | 456 +++++++++++++++++- .../implementation/RxDocumentClientImpl.java | 4 + .../implementation/RxGatewayStoreModel.java | 4 + 3 files changed, 452 insertions(+), 12 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java index 966a707f50fe..48b26fdceea0 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java @@ -42,6 +42,7 @@ import com.azure.cosmos.models.CosmosBatch; import com.azure.cosmos.models.CosmosBatchResponse; import com.azure.cosmos.models.CosmosChangeFeedRequestOptions; +import com.azure.cosmos.models.CosmosItemIdentity; import com.azure.cosmos.models.CosmosItemRequestOptions; import com.azure.cosmos.models.CosmosItemResponse; import com.azure.cosmos.models.CosmosPatchItemRequestOptions; @@ -159,6 +160,87 @@ public PerPartitionAutomaticFailoverE2ETests(CosmosClientBuilder clientBuilder) super(clientBuilder); } + // Non-write dynamic enablement scenarios: READ and QUERY (with flavors) under SERVER_GENERATED_GONE and RESPONSE_DELAY + @DataProvider(name = "ppafNonWriteDynamicEnablementScenarios") + public Object[][] ppafNonWriteDynamicEnablementScenarios() { + + Set onlyDirect = new HashSet<>(); + onlyDirect.add(ConnectionMode.DIRECT); + + Set onlyGateway = new HashSet<>(); + onlyGateway.add(ConnectionMode.GATEWAY); + + return new Object[][]{ + // GONE 
(DIRECT only) + { + "Dynamic non-write: READ with SERVER_GENERATED_GONE (DIRECT)", + OperationType.Read, + QueryFlavor.NONE, + FaultKind.SERVER_GENERATED_GONE, + HttpConstants.StatusCodes.OK, + onlyDirect + }, + { + "Dynamic non-write: QUERY (readAll) with SERVER_GENERATED_GONE (DIRECT)", + OperationType.Query, + QueryFlavor.READ_ALL, + FaultKind.SERVER_GENERATED_GONE, + HttpConstants.StatusCodes.OK, + onlyDirect + }, + { + "Dynamic non-write: QUERY (readMany) with SERVER_GENERATED_GONE (DIRECT)", + OperationType.Query, + QueryFlavor.READ_MANY, + FaultKind.SERVER_GENERATED_GONE, + HttpConstants.StatusCodes.OK, + onlyDirect + }, + { + "Dynamic non-write: QUERY (queryItems) with SERVER_GENERATED_GONE (DIRECT)", + OperationType.Query, + QueryFlavor.QUERY_ITEMS, + FaultKind.SERVER_GENERATED_GONE, + HttpConstants.StatusCodes.OK, + onlyDirect + }, + + // RESPONSE_DELAY (GATEWAY only) + { + "Dynamic non-write: READ with RESPONSE_DELAY (GATEWAY)", + OperationType.Read, + QueryFlavor.NONE, + FaultKind.RESPONSE_DELAY, + HttpConstants.StatusCodes.OK, + onlyGateway + }, + { + "Dynamic non-write: QUERY (readAll) with RESPONSE_DELAY (GATEWAY)", + OperationType.Query, + QueryFlavor.READ_ALL, + FaultKind.RESPONSE_DELAY, + HttpConstants.StatusCodes.OK, + onlyGateway + }, + { + "Dynamic non-write: QUERY (readMany) with RESPONSE_DELAY (GATEWAY)", + OperationType.Query, + QueryFlavor.READ_MANY, + FaultKind.RESPONSE_DELAY, + HttpConstants.StatusCodes.OK, + onlyGateway + }, + { + "Dynamic non-write: QUERY (queryItems) with RESPONSE_DELAY (GATEWAY)", + OperationType.Query, + QueryFlavor.QUERY_ITEMS, + FaultKind.RESPONSE_DELAY, + HttpConstants.StatusCodes.OK, + onlyGateway + } + }; + } + @DataProvider(name = "ppafDynamicEnablement503Only") public Object[][] ppafDynamicEnablement503Only() { @@ -1245,6 +1327,34 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( } } } + + /** + * Verifies per-partition automatic failover (PPAF) dynamic enablement by toggling + * 
DatabaseAccount#isPerPartitionFailoverBehaviorEnabled at runtime via a reflected override + * of GlobalEndpointManager.owner (DatabaseAccountManagerInternal). + * + *

<p>Test strategy</p>
+ * <ul>
+ *   <li>Build a CosmosAsyncClient from the provided builder.</li>
+ *   <li>Use ReflectionUtils to obtain GlobalEndpointManager from the underlying RxDocumentClient.</li>
+ *   <li>Replace its private owner with a delegating DatabaseAccountManagerInternal that injects
+ *       DatabaseAccount#setIsPerPartitionFailoverBehaviorEnabled(enabledRef.get()).</li>
+ *   <li>Mock transport (DIRECT) or HttpClient (GATEWAY) to simulate a 503 on the primary region
+ *       and success elsewhere, mirroring the base PPAF test.</li>
+ *   <li>Run in phases:</li>
+ * </ul>
+ * <ol>
+ *   <li>PPAF disabled — expect failure characteristics (no success).</li>
+ *   <li>PPAF enabled — expect success characteristics (routes to healthy).</li>
+ *   <li>PPAF disabled again — expect failure again (toggle verified).</li>
+ * </ol>
+ *
+ * <p>After each toggle, call refreshLocationAsync(forceRefresh=true) so GlobalEndpointManager
+ * observes the updated DatabaseAccount flags immediately.</p>
+ *
+ * <p>Expectations are provided by the data provider: when disabled, the request should not succeed;
+ * when enabled, it should succeed. Works for both DIRECT and GATEWAY connection modes.</p>

+ */ @Test(groups = {"multi-region"}, dataProvider = "ppafDynamicEnablement503Only") public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamicEnablement( String testType, @@ -1266,6 +1376,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic throw new SkipException(String.format("Test with type : %s not eligible for specified connection mode %s.", testType, connectionMode)); } + // DIRECT flow: swap transport client, inject error for primary region/PK range, and verify phase-by-phase if (connectionMode == ConnectionMode.DIRECT) { TransportClient transportClientMock = Mockito.mock(TransportClient.class); List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; @@ -1288,7 +1399,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); - // Swap owner on GlobalEndpointManager to return database accounts with toggled PPAF enablement + // Swap GlobalEndpointManager.owner to a delegating wrapper that toggles PPAF flag on DatabaseAccount GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); @@ -1317,6 +1428,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic String regionWithIssues = preferredRegions.get(0); RegionalRoutingContext regionalRoutingContextWithIssues = new RegionalRoutingContext(new URI(readableRegionNameToEndpoint.get(regionWithIssues))); + // Redirect all store calls through our mocked transport client ReflectionUtils.setTransportClient(storeReader, transportClientMock); ReflectionUtils.setTransportClient(consistencyWriter, transportClientMock); @@ -1342,19 +1454,19 @@ public void 
testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic operationInvocationParamsWrapper.itemRequestOptions = shouldUseE2ETimeout ? new CosmosItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosItemRequestOptions(); operationInvocationParamsWrapper.patchItemRequestOptions = shouldUseE2ETimeout ? new CosmosPatchItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosPatchItemRequestOptions(); - // Phase 1: PPAF disabled -> expect failure + // Phase 1: PPAF disabled -> expect characteristics provided for DISABLED ppafEnabledRef.set(Boolean.FALSE); globalEndpointManager.refreshLocationAsync(null, true).block(); ResponseWrapper responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); - // Phase 2: PPAF enabled -> expect success + // Phase 2: PPAF enabled -> expect characteristics provided for ENABLED ppafEnabledRef.set(Boolean.TRUE); globalEndpointManager.refreshLocationAsync(null, true).block(); ResponseWrapper responseWithPpafEnabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); this.validateExpectedResponseCharacteristics.accept(responseWithPpafEnabled, expectedResponseCharacteristicsWhenPpafIsEnabled); - // Phase 3: PPAF disabled -> expect failure again + // Phase 3: PPAF disabled again -> confirm behavior reverts ppafEnabledRef.set(Boolean.FALSE); globalEndpointManager.refreshLocationAsync(null, true).block(); responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); @@ -1366,6 +1478,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic } } + // GATEWAY flow: swap RxGatewayStoreModel HttpClient, inject 503 on primary region and verify phases if (connectionMode == ConnectionMode.GATEWAY) { HttpClient 
mockedHttpClient = Mockito.mock(HttpClient.class); List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; @@ -1386,12 +1499,13 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic .getDatabase(this.sharedDatabase.getId()) .getContainer(this.sharedSinglePartitionContainer.getId()); - // populates collection cache and pkrange cache + // Populate collection and PK range caches to ensure routing is initialized asyncContainer.getFeedRanges().block(); RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); RxStoreModel rxStoreModel = ReflectionUtils.getGatewayProxy(rxDocumentClient); + // Swap GlobalEndpointManager.owner to a delegating wrapper that toggles PPAF flag on DatabaseAccount GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); @@ -1412,6 +1526,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic String regionWithIssues = preferredRegions.get(0); URI locationEndpointWithIssues = new URI(readableRegionNameToEndpoint.get(regionWithIssues) + "dbs/" + this.sharedDatabase.getId() + "/colls/" + this.sharedSinglePartitionContainer.getId() + "/docs"); + // Redirect gateway calls through our mocked HttpClient ReflectionUtils.setGatewayHttpClient(rxStoreModel, mockedHttpClient); setupHttpClientToReturnSuccessResponse(mockedHttpClient, operationType, databaseAccountForResponses, successStatusCode); @@ -1438,19 +1553,19 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic operationInvocationParamsWrapper.itemRequestOptions = shouldUseE2ETimeout ? 
new CosmosItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosItemRequestOptions(); operationInvocationParamsWrapper.patchItemRequestOptions = shouldUseE2ETimeout ? new CosmosPatchItemRequestOptions().setCosmosEndToEndOperationLatencyPolicyConfig(THREE_SEC_E2E_TIMEOUT_POLICY) : new CosmosPatchItemRequestOptions(); - // Phase 1: PPAF disabled -> expect failure + // Phase 1: PPAF disabled -> expect characteristics provided for DISABLED ppafEnabledRef.set(Boolean.FALSE); globalEndpointManager.refreshLocationAsync(null, true).block(); ResponseWrapper responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); this.validateExpectedResponseCharacteristics.accept(responseWithPpafDisabled, expectedResponseCharacteristicsWhenPpafIsDisabled); - // Phase 2: PPAF enabled -> expect success + // Phase 2: PPAF enabled -> expect characteristics provided for ENABLED ppafEnabledRef.set(Boolean.TRUE); globalEndpointManager.refreshLocationAsync(null, true).block(); ResponseWrapper responseWithPpafEnabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); this.validateExpectedResponseCharacteristics.accept(responseWithPpafEnabled, expectedResponseCharacteristicsWhenPpafIsEnabled); - // Phase 2: PPAF disabled -> expect failure again + // Phase 3: PPAF disabled again -> confirm behavior reverts ppafEnabledRef.set(Boolean.FALSE); globalEndpointManager.refreshLocationAsync(null, true).block(); responseWithPpafDisabled = dataPlaneOperation.apply(operationInvocationParamsWrapper); @@ -1463,6 +1578,250 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic } } + /** + * Validates hedging and failover behavior for non-write operations (READ/QUERY flavors) under dynamic PPAF enablement. + * + * Semantics: + * - Inject 10+ consecutive faults on the first preferred region. 
+ * - With PPAF enabled, during this window, the operation should hedge to second region and succeed + * (expect 2 contacted regions and >=1 retry). + * - After the window, subsequent calls should go directly to the second region with 0 retries and 1 contacted region. + * + * Faults: + * - SERVER_GENERATED_GONE (DIRECT only): modeled via 410/1002 from primary region on a given PKRange. + * - RESPONSE_DELAY (GATEWAY only): modeled via delayed error/timeout from primary region's URI. + */ + @Test(groups = {"multi-region"}, dataProvider = "ppafNonWriteDynamicEnablementScenarios") + public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( + String testType, + OperationType operationType, + QueryFlavor queryFlavor, + FaultKind faultKind, + int successStatusCode, + Set allowedConnectionModes) { + + ConnectionPolicy connectionPolicy = COSMOS_CLIENT_BUILDER_ACCESSOR.getConnectionPolicy(getClientBuilder()); + ConnectionMode connectionMode = connectionPolicy.getConnectionMode(); + + if (!allowedConnectionModes.contains(connectionMode)) { + throw new SkipException(String.format("Test with type : %s not eligible for specified connection mode %s.", testType, connectionMode)); + } + + final int consecutiveFaults = 10; + + if (connectionMode == ConnectionMode.DIRECT) { + + // Expected during hedging window + ExpectedResponseCharacteristics expectedDuringWindow = new ExpectedResponseCharacteristics() + .setExpectedMinRetryCount(1) + .setShouldFinalResponseHaveSuccess(true) + .setExpectedRegionsContactedCount(2); + + // Expected after failover is established + ExpectedResponseCharacteristics expectedAfterWindow = new ExpectedResponseCharacteristics() + .setExpectedMinRetryCount(0) + .setExpectedMaxRetryCount(0) + .setShouldFinalResponseHaveSuccess(true) + // QUERY_ITEMS is mapped to CosmosAsyncContainer#queryItems whose query string requires a query plan which goes to the non-failed over region + 
.setExpectedRegionsContactedCount(queryFlavor.equals(QueryFlavor.QUERY_ITEMS) ? 2 : 1); + + TransportClient transportClientMock = Mockito.mock(TransportClient.class); + List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; + Map readableRegionNameToEndpoint = this.accountLevelLocationReadableLocationContext.regionNameToEndpoint; + Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); + + try { + CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); + CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); + cosmosAsyncClientValueHolder.v = asyncClient; + + CosmosAsyncContainer asyncContainer = asyncClient + .getDatabase(this.sharedDatabase.getId()) + .getContainer(this.sharedSinglePartitionContainer.getId()); + + RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); + GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + + Mockito.when(transportClientMock.getGlobalEndpointManager()).thenReturn(globalEndpointManager); + + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + StoreClient storeClient = ReflectionUtils.getStoreClient(rxDocumentClient); + ReplicatedResourceClient replicatedResourceClient = ReflectionUtils.getReplicatedResourceClient(storeClient); + ConsistencyReader consistencyReader = ReflectionUtils.getConsistencyReader(replicatedResourceClient); + StoreReader storeReader = ReflectionUtils.getStoreReader(consistencyReader); + ConsistencyWriter consistencyWriter = 
ReflectionUtils.getConsistencyWriter(replicatedResourceClient); + + Utils.ValueHolder> partitionKeyRangesForContainer + = getPartitionKeyRangesForContainer(asyncContainer, rxDocumentClient).block(); + assertThat(partitionKeyRangesForContainer).isNotNull(); + assertThat(partitionKeyRangesForContainer.v).isNotNull(); + assertThat(partitionKeyRangesForContainer.v.size()).isGreaterThanOrEqualTo(1); + PartitionKeyRange partitionKeyRangeWithIssues = partitionKeyRangesForContainer.v.get(0); + + assertThat(preferredRegions).isNotNull(); + assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); + String regionWithIssues = preferredRegions.get(0); + RegionalRoutingContext regionalRoutingContextWithIssues = new RegionalRoutingContext(new URI(readableRegionNameToEndpoint.get(regionWithIssues))); + + ReflectionUtils.setTransportClient(storeReader, transportClientMock); + ReflectionUtils.setTransportClient(consistencyWriter, transportClientMock); + + // Success response when routed to healthy region + setupTransportClientToReturnSuccessResponse(transportClientMock, constructStoreResponse(operationType, successStatusCode)); + + if (faultKind != FaultKind.SERVER_GENERATED_GONE) { + throw new SkipException("DIRECT path only supports SERVER_GENERATED_GONE for this test."); + } + + CosmosException cosmosException = createCosmosException( + HttpConstants.StatusCodes.GONE, + HttpConstants.SubStatusCodes.SERVER_GENERATED_410); + + // Inject fault for first region and PK range + setupTransportClientToThrowCosmosException( + transportClientMock, + partitionKeyRangeWithIssues, + regionalRoutingContextWithIssues, + cosmosException); + + TestItem testItem = TestItem.createNewItem(); + + // Choose operation and query flavor + Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); + OperationInvocationParamsWrapper params = new OperationInvocationParamsWrapper(); + params.asyncContainer = asyncContainer; + params.createdTestItem = testItem; + applyQueryFlavor(params, 
queryFlavor, testItem); + + DatabaseAccount dbAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); + + if (dbAccountSnapshot == null) { + globalEndpointManager.refreshLocationAsync(null, true).block(); + } else { + globalEndpointManager.refreshLocationAsync(dbAccountSnapshot, true).block(); + } + + // Hedging window: perform consecutiveFaults attempts, all should succeed via hedging + for (int i = 0; i < consecutiveFaults; i++) { + ResponseWrapper response = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(response, expectedDuringWindow); + } + + // After window: direct to healthy region, expect single region & no retry + ResponseWrapper postWindow = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(postWindow, expectedAfterWindow); + } catch (Exception e) { + Assertions.fail("The test ran into an exception {}", e); + } finally { + safeClose(cosmosAsyncClientValueHolder.v); + } + } + + if (connectionMode == ConnectionMode.GATEWAY) { + + // Expected during hedging window + ExpectedResponseCharacteristics expectedDuringWindow = new ExpectedResponseCharacteristics() + // response delay is injected in the first preferred region, so retries are not expected + .setExpectedMinRetryCount(0) + .setExpectedMaxRetryCount(0) + .setShouldFinalResponseHaveSuccess(true) + .setExpectedRegionsContactedCount(2); + + // Expected after failover is established + ExpectedResponseCharacteristics expectedAfterWindow = new ExpectedResponseCharacteristics() + .setExpectedMinRetryCount(0) + .setExpectedMaxRetryCount(0) + .setShouldFinalResponseHaveSuccess(true) + // QUERY_ITEMS is mapped to CosmosAsyncContainer#queryItems whose query string requires a query plan which goes to the non-failed over region + .setExpectedRegionsContactedCount(queryFlavor.equals(QueryFlavor.QUERY_ITEMS) ? 
2 : 1); + + HttpClient mockedHttpClient = Mockito.mock(HttpClient.class); + List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; + Map readableRegionNameToEndpoint = this.accountLevelLocationReadableLocationContext.regionNameToEndpoint; + Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); + + try { + CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); + CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); + cosmosAsyncClientValueHolder.v = asyncClient; + + CosmosAsyncContainer asyncContainer = asyncClient + .getDatabase(this.sharedDatabase.getId()) + .getContainer(this.sharedSinglePartitionContainer.getId()); + asyncContainer.getFeedRanges().block(); + + RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); + RxStoreModel rxStoreModel = ReflectionUtils.getGatewayProxy(rxDocumentClient); + + GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + assertThat(preferredRegions).isNotNull(); + assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); + String regionWithIssues = preferredRegions.get(0); + URI locationEndpointWithIssues = new URI(readableRegionNameToEndpoint.get(regionWithIssues) + "dbs/" + this.sharedDatabase.getId() + "/colls/" + this.sharedSinglePartitionContainer.getId() + "/docs"); + + DatabaseAccount dbAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); + + if (dbAccountSnapshot == null) { + 
globalEndpointManager.refreshLocationAsync(null, true).block(); + } else { + globalEndpointManager.refreshLocationAsync(dbAccountSnapshot, true).block(); + } + + if (faultKind != FaultKind.RESPONSE_DELAY) { + throw new SkipException("GATEWAY path only supports RESPONSE_DELAY for this test."); + } + + ReflectionUtils.setGatewayHttpClient(rxStoreModel, mockedHttpClient); + + // Success path for healthy region + setupHttpClientToReturnSuccessResponse(mockedHttpClient, operationType, dbAccountSnapshot, successStatusCode); + + // Simulate response delay/timeout for primary region only; we return an error Mono after a delay + CosmosException delayedTimeout = createCosmosException(HttpConstants.StatusCodes.REQUEST_TIMEOUT, HttpConstants.SubStatusCodes.GATEWAY_ENDPOINT_READ_TIMEOUT); + Mockito.when( + mockedHttpClient.send( + Mockito.argThat(argument -> { + URI uri = argument.uri(); + return uri.toString().contains(locationEndpointWithIssues.toString()); + }), Mockito.any(Duration.class))) + .thenReturn(Mono.delay(Duration.ofSeconds(10)).flatMap(aLong -> Mono.error(delayedTimeout))); + + TestItem testItem = TestItem.createNewItem(); + + Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); + OperationInvocationParamsWrapper params = new OperationInvocationParamsWrapper(); + params.asyncContainer = asyncContainer; + params.createdTestItem = testItem; + applyQueryFlavor(params, queryFlavor, testItem); + + // Hedging window + for (int i = 0; i < consecutiveFaults; i++) { + ResponseWrapper response = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(response, expectedDuringWindow); + } + + // After window + ResponseWrapper postWindow = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(postWindow, expectedAfterWindow); + } catch (Exception e) { + Assertions.fail("The test ran into an exception {}", e); + } finally { + safeClose(cosmosAsyncClientValueHolder.v); + } + } + } + + private 
static class DelegatingDatabaseAccountManagerInternal implements DatabaseAccountManagerInternal { private final DatabaseAccountManagerInternal delegate; private final AtomicReference ppafEnabledRef; @@ -1865,13 +2224,28 @@ private Function> resolveDa CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; CosmosQueryRequestOptions queryRequestOptions = paramsWrapper.queryRequestOptions == null ? new CosmosQueryRequestOptions() : paramsWrapper.queryRequestOptions; queryRequestOptions = paramsWrapper.feedRangeForQuery == null ? queryRequestOptions.setFeedRange(FeedRange.forFullRange()) : queryRequestOptions.setFeedRange(paramsWrapper.feedRangeForQuery); + String sql = paramsWrapper.querySql != null ? paramsWrapper.querySql : "SELECT * FROM c"; try { + // If applyQueryFlavor requested readAllItems or readMany, use those operations instead of query + if (paramsWrapper.readAllPartitionKey != null) { + FeedResponse readAllResponse = asyncContainer + .readAllItems(paramsWrapper.readAllPartitionKey, TestObject.class) + .byPage() + .blockLast(); + return new ResponseWrapper<>(readAllResponse); + } - FeedResponse queryItemResponse = asyncContainer.queryItems( - "SELECT * FROM C", - queryRequestOptions, - TestObject.class) + if (paramsWrapper.readManyIdentities != null && !paramsWrapper.readManyIdentities.isEmpty()) { + FeedResponse readManyResponse = asyncContainer + .readMany(paramsWrapper.readManyIdentities, TestObject.class) + .block(); + return new ResponseWrapper<>(readManyResponse); + } + + // Fallback: regular queryItems + FeedResponse queryItemResponse = asyncContainer + .queryItems(sql, queryRequestOptions, TestObject.class) .byPage() .blockLast(); @@ -2007,6 +2381,11 @@ private static class OperationInvocationParamsWrapper { public CosmosItemRequestOptions patchItemRequestOptions; public FeedRange feedRangeToDrainForChangeFeed; public FeedRange feedRangeForQuery; + public String querySql; + // For QueryFlavor.READ_ALL + public PartitionKey 
readAllPartitionKey; + // For QueryFlavor.READ_MANY + public List readManyIdentities; } private static class ExpectedResponseCharacteristics { @@ -2115,6 +2494,59 @@ public FakeBatchResponse setRetryAfterMilliseconds(String retryAfterMilliseconds } } + private enum FaultKind { + SERVER_GENERATED_GONE, + RESPONSE_DELAY + } + + private enum QueryFlavor { + NONE, // Not a query + READ_ALL, // readAllItems scoped to a single partition key + READ_MANY, // readMany with explicit item identities + QUERY_ITEMS // Arbitrary filter + } + + private void applyQueryFlavor(OperationInvocationParamsWrapper params, QueryFlavor flavor, TestItem seed) { + if (flavor == QueryFlavor.NONE) { + // Do not set CosmosQueryRequestOptions explicitly + params.querySql = null; + params.readAllPartitionKey = null; + params.readManyIdentities = null; + return; + } + + // Do not set CosmosQueryRequestOptions explicitly; default behavior will be used + + switch (flavor) { + case READ_ALL: + // Map to readAllItems on the container using the seed's partition key + String pk = seed != null ? seed.getId() : UUID.randomUUID().toString(); + params.readAllPartitionKey = new PartitionKey(pk); + params.querySql = null; + params.readManyIdentities = null; + break; + case READ_MANY: + // Map to readMany with a single identity built from the seed + String id = seed != null ? 
seed.getId() : UUID.randomUUID().toString(); + PartitionKey pkValue = new PartitionKey(id); + List identities = new ArrayList<>(); + identities.add(new CosmosItemIdentity(pkValue, id)); + params.readManyIdentities = identities; + params.readAllPartitionKey = null; + params.querySql = null; + break; + case QUERY_ITEMS: + params.querySql = "SELECT * FROM c WHERE IS_DEFINED(c.mypk)"; + params.readAllPartitionKey = null; + params.readManyIdentities = null; + break; + default: + params.querySql = "SELECT * FROM c"; + params.readAllPartitionKey = null; + params.readManyIdentities = null; + } + } + private HttpResponse createResponse(int statusCode, OperationType operationType, ResourceType resourceType, DatabaseAccount databaseAccount, TestPojo testPojo) { HttpResponse httpResponse = new HttpResponse() { @Override diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index c77b2e7f0e93..5edd9594f98f 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -7213,6 +7213,7 @@ private Mono> wrapPointOperationWithAvailabilityStrat return Mono .firstWithValue(monoList) .flatMap(nonTransientResult -> { + logger.warn("L7216 - mergeContext - nonTransientResult: {}", nonTransientResult); diagnosticsFactory.merge(nonNullRequestOptions); if (nonTransientResult.isError()) { return Mono.error(nonTransientResult.exception); @@ -7991,6 +7992,7 @@ public CosmosDiagnostics getMostRecentlyCreatedDiagnostics() { } public void merge(RequestOptions requestOptions) { + logger.warn("L7995 - merge - ScopedDiagnosticsFactory - merge(RequestOptions requestOptions)"); CosmosDiagnosticsContext knownCtx = null; if (requestOptions != null) { @@ -8026,6 +8028,8 @@ public void 
merge(CosmosDiagnosticsContext knownCtx) { } for (CosmosDiagnostics diagnostics : this.createdDiagnostics) { + logger.warn("L8031 - merge - (in loop) ScopedDiagnosticsFactory - merging diagnostics: {}", diagnostics); + logger.warn("L8032 - merge - (in loop) ScopedDiagnosticsFactory - merging diagnostics {} - - is empty : {}", diagnostics, diagnosticsAccessor.isNotEmpty(diagnostics)); if (diagnostics.getDiagnosticsContext() == null && diagnosticsAccessor.isNotEmpty(diagnostics)) { if (this.shouldCaptureAllFeedDiagnostics && diagnosticsAccessor.getFeedResponseDiagnostics(diagnostics) != null) { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java index 676860013631..4df3e5b946b1 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java @@ -541,6 +541,8 @@ private Mono toDocumentServiceResponse(Mono toDocumentServiceResponse(Mono Date: Sun, 7 Sep 2025 17:25:47 -0400 Subject: [PATCH 08/25] Fixing PerPartitionAutomaticFailoverE2ETests. 
--- ...PerPartitionAutomaticFailoverE2ETests.java | 270 +++++++++++++----- .../implementation/RxDocumentClientImpl.java | 16 ++ .../implementation/UserAgentContainer.java | 5 +- 3 files changed, 212 insertions(+), 79 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java index 48b26fdceea0..f4c892b57fd7 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java @@ -52,6 +52,17 @@ import com.azure.cosmos.models.FeedResponse; import com.azure.cosmos.models.PartitionKey; import com.azure.cosmos.rx.TestSuiteBase; +import com.azure.cosmos.test.faultinjection.CosmosFaultInjectionHelper; +import com.azure.cosmos.test.faultinjection.FaultInjectionCondition; +import com.azure.cosmos.test.faultinjection.FaultInjectionConditionBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionConnectionType; +import com.azure.cosmos.test.faultinjection.FaultInjectionEndpointBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionOperationType; +import com.azure.cosmos.test.faultinjection.FaultInjectionResultBuilders; +import com.azure.cosmos.test.faultinjection.FaultInjectionRule; +import com.azure.cosmos.test.faultinjection.FaultInjectionRuleBuilder; +import com.azure.cosmos.test.faultinjection.FaultInjectionServerErrorResult; +import com.azure.cosmos.test.faultinjection.FaultInjectionServerErrorType; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import io.netty.buffer.ByteBuf; @@ -83,6 +94,7 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ThreadLocalRandom; import 
java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; @@ -1579,17 +1591,48 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic } /** - * Validates hedging and failover behavior for non-write operations (READ/QUERY flavors) under dynamic PPAF enablement. + * Validates dynamic Per-Partition Automatic Failover (PPAF) hedging behavior for non-write operations + * (point Read and Query variants). * - * Semantics: - * - Inject 10+ consecutive faults on the first preferred region. - * - With PPAF enabled, during this window, the operation should hedge to second region and succeed - * (expect 2 contacted regions and >=1 retry). - * - After the window, subsequent calls should go directly to the second region with 0 retries and 1 contacted region. + *

Fault models:

+ *
    + *
  • DIRECT: SERVER_GENERATED_GONE (HTTP 410 / substatus 21005) for a targeted partition key range + * in the first preferred region.
  • + *
  • GATEWAY: RESPONSE_DELAY injected (via fault injection rules) for the first preferred region + * (applied to read item, query plan, and query operations).
  • + *
* - * Faults: - * - SERVER_GENERATED_GONE (DIRECT only): modeled via 410/1002 from primary region on a given PKRange. - * - RESPONSE_DELAY (GATEWAY only): modeled via delayed error/timeout from primary region's URI. + *

QueryFlavor mapping:

+ *
    + *
  • NONE: point read (readItem).
  • + *
  • READ_ALL: readAllItems.
  • + *
  • READ_MANY: readMany with supplied identities.
  • + *
  • QUERY_ITEMS: queryItems (requires query plan; may still contact original region post-stabilization).
  • + *
+ * + *

Phases asserted:

+ *
    + *
  1. Hedging window (multiple consecutive injected faults): + *
      + *
    • DIRECT (410): expect >=1 retry and 2 contacted regions.
    • + *
    • GATEWAY (delay): expect 0 retries and 2 contacted regions (hedged).
    • + *
    + *
  2. + *
  3. Post-window stabilization: + *
      + *
    • Routes directly to healthy region (1 contacted region) except QUERY_ITEMS + * which may still require original region for query plan (thus 2).
    • + *
    • Expect 0 retries.
    • + *
    + *
  4. + *
+ * + *

Behavior is parameterized by the ppafNonWriteDynamicEnablementScenarios data provider: + * test type description, operationType (Read/Query), queryFlavor, faultKind, expected success + * status code, and allowed connection modes.

+ * + *

Dynamic enablement is achieved by overriding GlobalEndpointManager's owner to + * inject the PPAF flag into DatabaseAccount snapshots.

*/ @Test(groups = {"multi-region"}, dataProvider = "ppafNonWriteDynamicEnablementScenarios") public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( @@ -1609,20 +1652,20 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( final int consecutiveFaults = 10; + // ===================== DIRECT MODE PATH ===================== if (connectionMode == ConnectionMode.DIRECT) { - // Expected during hedging window + // Build expectations (hedging window vs stabilized post-window) ExpectedResponseCharacteristics expectedDuringWindow = new ExpectedResponseCharacteristics() - .setExpectedMinRetryCount(1) + .setExpectedMinRetryCount(1) // At least one retry due to first region failure .setShouldFinalResponseHaveSuccess(true) - .setExpectedRegionsContactedCount(2); + .setExpectedRegionsContactedCount(2); // Hedging to healthy region - // Expected after failover is established ExpectedResponseCharacteristics expectedAfterWindow = new ExpectedResponseCharacteristics() - .setExpectedMinRetryCount(0) + .setExpectedMinRetryCount(0) // Stable routing .setExpectedMaxRetryCount(0) .setShouldFinalResponseHaveSuccess(true) - // QUERY_ITEMS is mapped to CosmosAsyncContainer#queryItems whose query string requires a query plan which goes to the non-failed over region + // QUERY_ITEMS still requires query plan from original region -> 2 regions contacted .setExpectedRegionsContactedCount(queryFlavor.equals(QueryFlavor.QUERY_ITEMS) ? 
2 : 1); TransportClient transportClientMock = Mockito.mock(TransportClient.class); @@ -1631,6 +1674,7 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); try { + // Build client and container CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); cosmosAsyncClientValueHolder.v = asyncClient; @@ -1639,24 +1683,28 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( .getDatabase(this.sharedDatabase.getId()) .getContainer(this.sharedSinglePartitionContainer.getId()); + // Reflection plumbing for internal components RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); - Mockito.when(transportClientMock.getGlobalEndpointManager()).thenReturn(globalEndpointManager); + // Enable dynamic PPAF via delegating owner DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); - DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + DatabaseAccountManagerInternal overridingOwner = + new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + // Internal store clients StoreClient storeClient = ReflectionUtils.getStoreClient(rxDocumentClient); ReplicatedResourceClient replicatedResourceClient = ReflectionUtils.getReplicatedResourceClient(storeClient); ConsistencyReader consistencyReader = ReflectionUtils.getConsistencyReader(replicatedResourceClient); StoreReader storeReader = 
ReflectionUtils.getStoreReader(consistencyReader); ConsistencyWriter consistencyWriter = ReflectionUtils.getConsistencyWriter(replicatedResourceClient); - Utils.ValueHolder> partitionKeyRangesForContainer - = getPartitionKeyRangesForContainer(asyncContainer, rxDocumentClient).block(); + // Identify a PK range + first preferred region to fault + Utils.ValueHolder> partitionKeyRangesForContainer = + getPartitionKeyRangesForContainer(asyncContainer, rxDocumentClient).block(); assertThat(partitionKeyRangesForContainer).isNotNull(); assertThat(partitionKeyRangesForContainer.v).isNotNull(); assertThat(partitionKeyRangesForContainer.v.size()).isGreaterThanOrEqualTo(1); @@ -1665,55 +1713,59 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( assertThat(preferredRegions).isNotNull(); assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); String regionWithIssues = preferredRegions.get(0); - RegionalRoutingContext regionalRoutingContextWithIssues = new RegionalRoutingContext(new URI(readableRegionNameToEndpoint.get(regionWithIssues))); + RegionalRoutingContext regionalRoutingContextWithIssues = + new RegionalRoutingContext(new URI(readableRegionNameToEndpoint.get(regionWithIssues))); + // Wire mock transport client into reader + writer paths ReflectionUtils.setTransportClient(storeReader, transportClientMock); ReflectionUtils.setTransportClient(consistencyWriter, transportClientMock); // Success response when routed to healthy region - setupTransportClientToReturnSuccessResponse(transportClientMock, constructStoreResponse(operationType, successStatusCode)); + setupTransportClientToReturnSuccessResponse( + transportClientMock, + constructStoreResponse(operationType, successStatusCode)); if (faultKind != FaultKind.SERVER_GENERATED_GONE) { throw new SkipException("DIRECT path only supports SERVER_GENERATED_GONE for this test."); } + // Inject 410/21005 (server-generated GONE) for unhealthy region CosmosException cosmosException = createCosmosException( 
HttpConstants.StatusCodes.GONE, HttpConstants.SubStatusCodes.SERVER_GENERATED_410); - // Inject fault for first region and PK range setupTransportClientToThrowCosmosException( transportClientMock, partitionKeyRangeWithIssues, regionalRoutingContextWithIssues, cosmosException); + // Prepare operation invocation TestItem testItem = TestItem.createNewItem(); + Function> dataPlaneOperation = + resolveDataPlaneOperation(operationType); - // Choose operation and query flavor - Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); OperationInvocationParamsWrapper params = new OperationInvocationParamsWrapper(); params.asyncContainer = asyncContainer; params.createdTestItem = testItem; applyQueryFlavor(params, queryFlavor, testItem); + // Force initial refresh so DatabaseAccount is loaded with PPAF flag DatabaseAccount dbAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); - if (dbAccountSnapshot == null) { globalEndpointManager.refreshLocationAsync(null, true).block(); } else { globalEndpointManager.refreshLocationAsync(dbAccountSnapshot, true).block(); } - // Hedging window: perform consecutiveFaults attempts, all should succeed via hedging - for (int i = 0; i < consecutiveFaults; i++) { - ResponseWrapper response = dataPlaneOperation.apply(params); - this.validateExpectedResponseCharacteristics.accept(response, expectedDuringWindow); - } + // Execute hedging + stabilization phases + runHedgingPhasesForNonWrite( + consecutiveFaults, + dataPlaneOperation, + params, + expectedDuringWindow, + expectedAfterWindow); - // After window: direct to healthy region, expect single region & no retry - ResponseWrapper postWindow = dataPlaneOperation.apply(params); - this.validateExpectedResponseCharacteristics.accept(postWindow, expectedAfterWindow); } catch (Exception e) { Assertions.fail("The test ran into an exception {}", e); } finally { @@ -1721,30 +1773,26 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( } } + 
// ===================== GATEWAY MODE PATH ===================== if (connectionMode == ConnectionMode.GATEWAY) { - // Expected during hedging window ExpectedResponseCharacteristics expectedDuringWindow = new ExpectedResponseCharacteristics() - // response delay is injected in the first preferred region, so retries are not expected - .setExpectedMinRetryCount(0) + .setExpectedMinRetryCount(0) // Delay fault causes hedging without retries .setExpectedMaxRetryCount(0) .setShouldFinalResponseHaveSuccess(true) .setExpectedRegionsContactedCount(2); - // Expected after failover is established ExpectedResponseCharacteristics expectedAfterWindow = new ExpectedResponseCharacteristics() .setExpectedMinRetryCount(0) .setExpectedMaxRetryCount(0) .setShouldFinalResponseHaveSuccess(true) - // QUERY_ITEMS is mapped to CosmosAsyncContainer#queryItems whose query string requires a query plan which goes to the non-failed over region .setExpectedRegionsContactedCount(queryFlavor.equals(QueryFlavor.QUERY_ITEMS) ? 
2 : 1); - HttpClient mockedHttpClient = Mockito.mock(HttpClient.class); List preferredRegions = this.accountLevelLocationReadableLocationContext.serviceOrderedReadableRegions; - Map readableRegionNameToEndpoint = this.accountLevelLocationReadableLocationContext.regionNameToEndpoint; Utils.ValueHolder cosmosAsyncClientValueHolder = new Utils.ValueHolder<>(); try { + // Build client + container CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); CosmosAsyncClient asyncClient = cosmosClientBuilder.buildAsyncClient(); cosmosAsyncClientValueHolder.v = asyncClient; @@ -1752,25 +1800,29 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( CosmosAsyncContainer asyncContainer = asyncClient .getDatabase(this.sharedDatabase.getId()) .getContainer(this.sharedSinglePartitionContainer.getId()); + // Warm caches asyncContainer.getFeedRanges().block(); - RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); - RxStoreModel rxStoreModel = ReflectionUtils.getGatewayProxy(rxDocumentClient); + RxDocumentClientImpl rxDocumentClient = + (RxDocumentClientImpl) ReflectionUtils.getAsyncDocumentClient(asyncClient); - GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + GlobalEndpointManager globalEndpointManager = + ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); - DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + // Enable PPAF dynamically + DatabaseAccountManagerInternal originalOwner = + ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); - DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + DatabaseAccountManagerInternal overridingOwner = + new 
DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); assertThat(preferredRegions).isNotNull(); assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); String regionWithIssues = preferredRegions.get(0); - URI locationEndpointWithIssues = new URI(readableRegionNameToEndpoint.get(regionWithIssues) + "dbs/" + this.sharedDatabase.getId() + "/colls/" + this.sharedSinglePartitionContainer.getId() + "/docs"); + // Refresh DB account snapshot DatabaseAccount dbAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); - if (dbAccountSnapshot == null) { globalEndpointManager.refreshLocationAsync(null, true).block(); } else { @@ -1781,38 +1833,79 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( throw new SkipException("GATEWAY path only supports RESPONSE_DELAY for this test."); } - ReflectionUtils.setGatewayHttpClient(rxStoreModel, mockedHttpClient); - - // Success path for healthy region - setupHttpClientToReturnSuccessResponse(mockedHttpClient, operationType, dbAccountSnapshot, successStatusCode); - - // Simulate response delay/timeout for primary region only; we return an error Mono after a delay - CosmosException delayedTimeout = createCosmosException(HttpConstants.StatusCodes.REQUEST_TIMEOUT, HttpConstants.SubStatusCodes.GATEWAY_ENDPOINT_READ_TIMEOUT); - Mockito.when( - mockedHttpClient.send( - Mockito.argThat(argument -> { - URI uri = argument.uri(); - return uri.toString().contains(locationEndpointWithIssues.toString()); - }), Mockito.any(Duration.class))) - .thenReturn(Mono.delay(Duration.ofSeconds(10)).flatMap(aLong -> Mono.error(delayedTimeout))); - + // Inject RESPONSE_DELAY faults using FIR (read item + query + query plan) + FeedRange fullRange = FeedRange.forFullRange(); + + FaultInjectionServerErrorResult responseDelayError = FaultInjectionResultBuilders + 
.getResultBuilder(FaultInjectionServerErrorType.RESPONSE_DELAY) + .delay(Duration.ofSeconds(10)) // long enough to trigger hedging + .suppressServiceRequests(false) + .build(); + + FaultInjectionCondition conditionForReadItem = new FaultInjectionConditionBuilder() + .connectionType(FaultInjectionConnectionType.GATEWAY) + .endpoints(new FaultInjectionEndpointBuilder(fullRange).build()) + .operationType(FaultInjectionOperationType.READ_ITEM) + .region(regionWithIssues) + .build(); + + FaultInjectionCondition conditionForQueryPlan = new FaultInjectionConditionBuilder() + .connectionType(FaultInjectionConnectionType.GATEWAY) + .endpoints(new FaultInjectionEndpointBuilder(fullRange).build()) + .operationType(FaultInjectionOperationType.METADATA_REQUEST_QUERY_PLAN) + .region(regionWithIssues) + .build(); + + FaultInjectionCondition conditionForQuery = new FaultInjectionConditionBuilder() + .connectionType(FaultInjectionConnectionType.GATEWAY) + .endpoints(new FaultInjectionEndpointBuilder(fullRange).build()) + .operationType(FaultInjectionOperationType.QUERY_ITEM) + .region(regionWithIssues) + .build(); + + String ruleId = String.format("response-delay-%s", UUID.randomUUID()); + + FaultInjectionRule queryPlanResponseDelayFIRule = new FaultInjectionRuleBuilder(ruleId + "-qp") + .condition(conditionForQueryPlan) + .result(responseDelayError) + .build(); + + FaultInjectionRule queryResponseDelayFIRule = new FaultInjectionRuleBuilder(ruleId + "-q") + .condition(conditionForQuery) + .result(responseDelayError) + .build(); + + FaultInjectionRule readItemResponseDelayFIRule = new FaultInjectionRuleBuilder(ruleId + "-r") + .condition(conditionForReadItem) + .result(responseDelayError) + .build(); + + CosmosFaultInjectionHelper + .configureFaultInjectionRules( + asyncContainer, + Arrays.asList(queryPlanResponseDelayFIRule, queryResponseDelayFIRule, readItemResponseDelayFIRule)) + .block(); + + // Seed item for read/readMany scenarios TestItem testItem = TestItem.createNewItem(); 
+ asyncContainer.createItem(testItem).block(); - Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); + // Prepare params + operation + Function> dataPlaneOperation = + resolveDataPlaneOperation(operationType); OperationInvocationParamsWrapper params = new OperationInvocationParamsWrapper(); params.asyncContainer = asyncContainer; params.createdTestItem = testItem; applyQueryFlavor(params, queryFlavor, testItem); - // Hedging window - for (int i = 0; i < consecutiveFaults; i++) { - ResponseWrapper response = dataPlaneOperation.apply(params); - this.validateExpectedResponseCharacteristics.accept(response, expectedDuringWindow); - } + // Execute hedging + stabilization phases + runHedgingPhasesForNonWrite( + consecutiveFaults, + dataPlaneOperation, + params, + expectedDuringWindow, + expectedAfterWindow); - // After window - ResponseWrapper postWindow = dataPlaneOperation.apply(params); - this.validateExpectedResponseCharacteristics.accept(postWindow, expectedAfterWindow); } catch (Exception e) { Assertions.fail("The test ran into an exception {}", e); } finally { @@ -1821,6 +1914,26 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( } } + /** + * Helper: Executes the hedging window (multiple consecutive fault attempts) followed by a single post-window verification. 
+ */ + private void runHedgingPhasesForNonWrite( + int consecutiveFaults, + Function> dataPlaneOperation, + OperationInvocationParamsWrapper params, + ExpectedResponseCharacteristics expectedDuringWindow, + ExpectedResponseCharacteristics expectedAfterWindow) { + + // Hedging window iterations + for (int i = 0; i < consecutiveFaults; i++) { + ResponseWrapper response = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(response, expectedDuringWindow); + } + + // Stabilized post-window request + ResponseWrapper postWindow = dataPlaneOperation.apply(params); + this.validateExpectedResponseCharacteristics.accept(postWindow, expectedAfterWindow); + } private static class DelegatingDatabaseAccountManagerInternal implements DatabaseAccountManagerInternal { private final DatabaseAccountManagerInternal delegate; @@ -2094,7 +2207,7 @@ private Function> resolveDa CosmosItemResponse readItemResponse = asyncContainer.readItem( createdTestObject.getId(), - new PartitionKey(createdTestObject.getId()), + new PartitionKey(createdTestObject.getMypk()), itemRequestOptions, TestObject.class) .block(); @@ -2121,7 +2234,7 @@ private Function> resolveDa CosmosItemResponse upsertItemResponse = asyncContainer.upsertItem( createdTestObject, - new PartitionKey(createdTestObject.getId()), + new PartitionKey(createdTestObject.getMypk()), itemRequestOptions) .block(); @@ -2147,7 +2260,7 @@ private Function> resolveDa CosmosItemResponse createItemResponse = asyncContainer.createItem( createdTestObject, - new PartitionKey(createdTestObject.getId()), + new PartitionKey(createdTestObject.getMypk()), itemRequestOptions) .block(); @@ -2201,7 +2314,7 @@ private Function> resolveDa CosmosItemResponse patchItemResponse = asyncContainer.patchItem( createdTestObject.getId(), - new PartitionKey(createdTestObject.getId()), + new PartitionKey(createdTestObject.getMypk()), patchOperations, patchItemRequestOptions, TestItem.class) @@ -2520,15 +2633,16 @@ private void 
applyQueryFlavor(OperationInvocationParamsWrapper params, QueryFlav switch (flavor) { case READ_ALL: // Map to readAllItems on the container using the seed's partition key - String pk = seed != null ? seed.getId() : UUID.randomUUID().toString(); - params.readAllPartitionKey = new PartitionKey(pk); + String pkReadAll = seed != null ? seed.getMypk() : UUID.randomUUID().toString(); + params.readAllPartitionKey = new PartitionKey(pkReadAll); params.querySql = null; params.readManyIdentities = null; break; case READ_MANY: // Map to readMany with one or more identities using the seed String id = seed != null ? seed.getId() : UUID.randomUUID().toString(); - PartitionKey pkValue = new PartitionKey(id); + String pkReadMany = seed != null ? seed.getMypk() : UUID.randomUUID().toString(); + PartitionKey pkValue = new PartitionKey(pkReadMany); List identities = new ArrayList<>(); identities.add(new CosmosItemIdentity(pkValue, id)); params.readManyIdentities = identities; diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 5edd9594f98f..8b42e24ea500 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -1246,6 +1246,22 @@ private Flux> createQueryInternal( UUID activityId, final AtomicBoolean isQueryCancelledOnTimeout) { + // reevaluate e2e policy config on cosmosQueryRequestOptions + if (options != null) { + CosmosEndToEndOperationLatencyPolicyConfig endToEndPolicyConfigFromRequestOptions = + getEndToEndOperationLatencyPolicyConfig( + ImplementationBridgeHelpers + .CosmosQueryRequestOptionsHelper + .getCosmosQueryRequestOptionsAccessor() + .toRequestOptions(options), + resourceTypeEnum, + OperationType.Query); + + if (endToEndPolicyConfigFromRequestOptions != 
null) { + options.setCosmosEndToEndOperationLatencyPolicyConfig(endToEndPolicyConfigFromRequestOptions); + } + } + Flux> executionContext = DocumentQueryExecutionContextFactory .createDocumentQueryExecutionContextAsync(diagnosticsClientContext, queryClient, resourceTypeEnum, klass, sqlQuery, diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java index 6bd5ee911ee5..bd5699867504 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java @@ -18,7 +18,8 @@ public class UserAgentContainer { private final int maxSuffixLength; private final String baseUserAgent; private String suffix; - private String userAgent; + private volatile String userAgent; + private String baseUserAgentWithSuffix; public final static String AZSDK_USERAGENT_PREFIX = "azsdk-java-"; public final static String BASE_USER_AGENT_STRING = Utils.getUserAgent( @@ -51,6 +52,7 @@ public synchronized void setFeatureEnabledFlagsAsSuffix(Set Date: Mon, 8 Sep 2025 10:04:59 -0400 Subject: [PATCH 09/25] Fixing PerPartitionAutomaticFailoverE2ETests. 
--- ...PerPartitionAutomaticFailoverE2ETests.java | 192 +++++++++++++++--- .../DiagnosticsClientContext.java | 14 +- .../implementation/GlobalEndpointManager.java | 10 +- .../implementation/RxDocumentClientImpl.java | 5 +- .../implementation/RxGatewayStoreModel.java | 2 - .../implementation/UserAgentContainer.java | 62 ++++-- 6 files changed, 230 insertions(+), 55 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java index f4c892b57fd7..27ed5bf6104a 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java @@ -94,12 +94,123 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; import static org.assertj.core.api.Assertions.assertThat; +/** + * End-to-end test suite validating Per-Partition Automatic Failover (PPAF) behavior in the Azure Cosmos DB Java SDK. + * + *

+ * <p>This suite exercises and verifies:</p>
+ * <ul>
+ *   <li>Automatic failover and hedged routing at the granularity of a single physical partition (PK range).</li>
+ *   <li>Dynamic enablement and disablement of PPAF at runtime by reflecting and overriding
+ *       {@code GlobalEndpointManager}'s {@code DatabaseAccountManagerInternal} owner to toggle
+ *       {@code DatabaseAccount#isPerPartitionFailoverBehaviorEnabled}.</li>
+ *   <li>Write operation failover (Create, Replace, Upsert, Delete, Patch, Batch) under multiple failover‑eligible
+ *       status/sub-status combinations (410/21005, 503/21008, 403/3, 408/*, gateway read timeouts).</li>
+ *   <li>Non-write hedging behavior (point Read and Query variants) under region-scoped transient faults:
+ *     <ul>
+ *       <li>DIRECT mode: simulated server-generated 410 (sub-status 21005) scoped to a specific partition key range.</li>
+ *       <li>GATEWAY mode: injected RESPONSE_DELAY faults via fault injection rules (query plan + query + read item).</li>
+ *     </ul>
+ *   </li>
+ *   <li>Interaction with end-to-end latency policies (E2E timeout) as a gating mechanism for enabling failover logic.</li>
+ * </ul>
+ *
+ * <p>Connection Modes Covered:</p>
+ * <ul>
+ *   <li>DIRECT: Uses a mocked {@code TransportClient} to selectively throw {@code CosmosException} for a targeted
+ *       (region + PK range) while returning success for others.</li>
+ *   <li>GATEWAY: Uses a mocked {@code HttpClient} (or fault injection framework) to simulate service errors, network
+ *       timeouts (socket/read), regional delays, or success responses.</li>
+ * </ul>
+ *
+ * <p>Phased Validation Patterns:</p>
+ * <ul>
+ *   <li>Pre-failover / Hedging Window: Verifies retries or region hedging (multi-region contacts) before
+ *       PPCB-enforced failover, optionally repeated to satisfy E2E timeout activation thresholds.</li>
+ *   <li>Post-failover / Stabilized: Ensures subsequent operations route directly to a healthy region
+ *       (single-region contact, zero retries) unless query semantics (e.g., query plan retrieval) require multi-region access.</li>
+ *   <li>Dynamic Enablement Toggle: For selected 503 scenarios, validates behavior transitions
+ *       Disabled → Enabled → Disabled, confirming routing and diagnostics adapt immediately after
+ *       {@code refreshLocationAsync(true)}.</li>
+ * </ul>
+ *
+ * <p>Diagnostics Assertions:
+ * Each test inspects {@code CosmosDiagnostics} to assert:</p>
+ * <ul>
+ *   <li>Contacted region count (hedged vs stabilized).</li>
+ *   <li>Retry count bounds (min/max) aligned with scenario expectations.</li>
+ *   <li>Final HTTP status classification (success vs expected failure when failover gated).</li>
+ *   <li>Consistency across response types (item, batch, feed, or exception paths).</li>
+ * </ul>
+ *
+ * <p>Key Internal Mechanisms:</p>
+ * <ul>
+ *   <li>Reflection-based access to internal SDK components (e.g., {@code RxDocumentClientImpl},
+ *       {@code StoreReader}, {@code ConsistencyWriter}) to inject mocked transport layers.</li>
+ *   <li>Custom delegating {@code DatabaseAccountManagerInternal} wrapper that conditionally sets
+ *       the per-partition failover flag on retrieved {@code DatabaseAccount} snapshots.</li>
+ *   <li>Fault injection rules in GATEWAY mode to apply controlled latency (RESPONSE_DELAY) per region and operation type.</li>
+ *   <li>Reusable operation dispatch via a functional resolver mapping {@code OperationType} to execution lambdas
+ *       returning a uniform {@code ResponseWrapper} abstraction.</li>
+ * </ul>
+ *
+ * <p>Query Variants (QueryFlavor):</p>
+ * <ul>
+ *   <li>{@code NONE}: Point read (readItem).</li>
+ *   <li>{@code READ_ALL}: {@code readAllItems} over a single partition key.</li>
+ *   <li>{@code READ_MANY}: {@code readMany} with one or more item identities.</li>
+ *   <li>{@code QUERY_ITEMS}: Standard SQL query; may still contact original region for query plan acquisition even
+ *       after stabilization (thus dual-region diagnostics may persist).</li>
+ * </ul>
+ *
+ * <p>End-to-End Latency Policy Integration:
+ * Tests optionally apply a short-circuit latency policy to:</p>
+ * <ul>
+ *   <li>Simulate threshold-based activation (e.g., property {@code COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF}).</li>
+ *   <li>Differentiate pre-threshold (no failover) vs post-threshold (failover enabled) diagnostics for the same fault.</li>
+ * </ul>
+ *
+ * <p>Batch Operation Coverage:
+ * Batch scenarios ensure that failover and diagnostics behaviors remain consistent with single-item operations,
+ * including mock batch response materialization and hedging logic validation.</p>
+ * <p>Safety &amp; Cleanup:
+ * Each scenario ensures:</p>
+ * <ul>
+ *   <li>System properties used to gate PPAF or E2E behaviors are cleared in {@code finally} blocks.</li>
+ *   <li>Clients are safely disposed to avoid cross-test interference.</li>
+ * </ul>
+ *
+ * <p>Usage Notes:
+ * This suite relies on internal APIs and reflection hooks not intended for production use. It is crafted specifically
+ * for validation of resilience, routing, and diagnostics fidelity across complex multi-region and transient-fault
+ * conditions. Adjustments to internal SDK contracts may require corresponding test maintenance.</p>
+ * <p>Failure Interpretation:
+ * A test failure typically indicates one of:</p>
+ * <ul>
+ *   <li>Unexpected retry amplification or suppression.</li>
+ *   <li>Incorrect region routing (e.g., failover not triggered or not stabilized).</li>
+ *   <li>Diagnostics context regression (missing region names, status codes, or retry metrics).</li>
+ *   <li>Latency policy mis-integration (threshold not honored).</li>
+ * </ul>
+ *
+ * <p>Extensibility:
+ * Additional scenarios (e.g., new fault types, new operation categories, multi-partition batch coverage, or read feed streaming)
+ * can be added by:</p>
+ * <ol>
+ *   <li>Extending the appropriate {@code @DataProvider} with new parameter rows.</li>
+ *   <li>Enhancing {@code resolveDataPlaneOperation} for new operation abstractions.</li>
+ *   <li>Adding new fault injection builders or transport client predicates.</li>
+ * </ol>
+ *
+ * <p>All validations aim to ensure that PPAF delivers predictable, minimal-latency routing under regional fault pressure
+ * while preserving observability through {@code CosmosDiagnostics}.</p>
+ */ public class PerPartitionAutomaticFailoverE2ETests extends TestSuiteBase { private CosmosAsyncDatabase sharedDatabase; @@ -1110,17 +1221,33 @@ public Object[][] ppafTestConfigsWithWriteOps() { }; } - // testPpafWithWriteFailoverWithEligibleErrorStatusCodes does the following: - // for DIRECT connection mode, - // an availability failure (410, 503, 408) or write forbidden failure (403/3) is injected - // for a given partitionKeyRange and region through mocking - // the first operation execution for a given operation type is expected to see failures and then failover (403/3s & 503s & 408s (not e2e timeout hit) are retried and e2e time out hit (408:20008) just see the operation fail) - // the second operation execution should see the request go straight away to the failed over region - caveat is when e2e timeout is hit, only after x failures does a failover happen - // for GATEWAY connection mode, - // an availability failure (503, 408), write forbidden failure (403/3) and I/O failures are injected - // for a given region through mocking - // the first operation execution for a given operation type is expected to see failures and then failover (403/3s & 503s & 408s (not e2e timeout hit) are retried and e2e time out hit (408:20008) just see the operation fail) - // the second operation execution should see the request go straight away to the failed over region - caveat is when e2e timeout is hit, only after x failures does a failover happen + /** + * End-to-end validation of Per-Partition Automatic Failover (PPAF) for write operations + * (Create, Replace, Upsert, Delete, Patch, Batch) when a failover-eligible fault is injected + * for one partition key range in the first preferred region. + * + *

+ * <p>Phases:</p>
+ * <ul>
+ *   <li>Pre-failover: injected error surfaces; request retries and/or hedges (unless gated by E2E timeout threshold).</li>
+ *   <li>Post-failover: subsequent request routes directly to healthy region (single region, zero retries) unless
+ *       E2E timeout gating still accumulating threshold.</li>
+ * </ul>
+ *
+ * <p>Mechanics:</p>
+ * <ul>
+ *   <li>DIRECT: TransportClient mocked; targeted (region + PK range) throws configured CosmosException.</li>
+ *   <li>GATEWAY: HttpClient mocked; targeted region URI throws CosmosException or network exception (read/socket timeout) or delayed fault.</li>
+ *   <li>GlobalEndpointManager owner replaced with delegating manager to surface dynamic PPAF enablement flag.</li>
+ *   <li>E2E latency policy optionally applied; threshold (COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF) controls activation.</li>
+ * </ul>
+ *
+ * <p>Assertions:</p>
+ * <ul>
+ *   <li>Regions contacted count (before vs after failover).</li>
+ *   <li>Retry count bounds.</li>
+ *   <li>Success vs failure based on phase and configuration.</li>
+ * </ul>
+ */ @Test(groups = {"multi-region"}, dataProvider = "ppafTestConfigsWithWriteOps") public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( String testType, @@ -1155,8 +1282,6 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF", "2"); } - System.setProperty("COSMOS.IS_PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED", "true"); - CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); // todo: evaluate whether Batch operation needs op-level e2e timeout and availability strategy @@ -1209,6 +1334,17 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( regionalRoutingContextWithIssues, cosmosException); + // Swap GlobalEndpointManager.owner to a delegating wrapper that toggles PPAF flag on DatabaseAccount + GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + + AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + DatabaseAccount latestDatabaseAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); + globalEndpointManager.refreshLocationAsync(latestDatabaseAccountSnapshot, true).block(); + TestItem testItem = TestItem.createNewItem(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -1255,8 +1391,6 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF", "2"); } - System.setProperty("COSMOS.IS_PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED", "true"); - CosmosClientBuilder cosmosClientBuilder = getClientBuilder(); // todo: evaluate whether 
Batch operation needs op-level e2e timeout and availability strategy @@ -1281,6 +1415,16 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( GlobalEndpointManager globalEndpointManager = ReflectionUtils.getGlobalEndpointManager(rxDocumentClient); DatabaseAccount databaseAccount = globalEndpointManager.getLatestDatabaseAccount(); + // Swap GlobalEndpointManager.owner to a delegating wrapper that toggles PPAF flag on DatabaseAccount + DatabaseAccountManagerInternal originalOwner = ReflectionUtils.getGlobalEndpointManagerOwner(globalEndpointManager); + + AtomicReference ppafEnabledRef = new AtomicReference<>(Boolean.TRUE); + DatabaseAccountManagerInternal overridingOwner = new DelegatingDatabaseAccountManagerInternal(originalOwner, ppafEnabledRef); + ReflectionUtils.setGlobalEndpointManagerOwner(globalEndpointManager, overridingOwner); + + DatabaseAccount latestDatabaseAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); + globalEndpointManager.refreshLocationAsync(latestDatabaseAccountSnapshot, true).block(); + assertThat(preferredRegions).isNotNull(); assertThat(preferredRegions.size()).isGreaterThanOrEqualTo(1); @@ -1729,7 +1873,7 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( throw new SkipException("DIRECT path only supports SERVER_GENERATED_GONE for this test."); } - // Inject 410/1002 for unhealthy region + // Inject 410/21005 for unhealthy region CosmosException cosmosException = createCosmosException( HttpConstants.StatusCodes.GONE, HttpConstants.SubStatusCodes.SERVER_GENERATED_410); @@ -1949,9 +2093,7 @@ public Flux getDatabaseAccountFromEndpoint(URI endpoint) { return delegate.getDatabaseAccountFromEndpoint(endpoint) .map(dbAccount -> { Boolean enabled = ppafEnabledRef.get(); - if (enabled != null) { - dbAccount.setIsPerPartitionFailoverBehaviorEnabled(enabled); - } + dbAccount.setIsPerPartitionFailoverBehaviorEnabled(enabled); return dbAccount; }); } @@ -2494,11 +2636,11 @@ private static 
class OperationInvocationParamsWrapper { public CosmosItemRequestOptions patchItemRequestOptions; public FeedRange feedRangeToDrainForChangeFeed; public FeedRange feedRangeForQuery; - public String querySql; - // For QueryFlavor.READ_ALL - public PartitionKey readAllPartitionKey; - // For QueryFlavor.READ_MANY - public List readManyIdentities; + public String querySql; + // For QueryFlavor.READ_ALL + public PartitionKey readAllPartitionKey; + // For QueryFlavor.READ_MANY + public List readManyIdentities; } private static class ExpectedResponseCharacteristics { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java index 4a0409a594b0..56b28af1063f 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java @@ -64,7 +64,7 @@ public void serialize(DiagnosticsClientConfig clientConfig, JsonGenerator genera generator.writeStringField("machineId", ClientTelemetry.getMachineId(clientConfig)); generator.writeStringField("connectionMode", clientConfig.getConnectionMode().toString()); generator.writeNumberField("numberOfClients", clientConfig.getActiveClientsCount()); - generator.writeStringField("isPpafEnabled", Configs.isPerPartitionAutomaticFailoverEnabled()); + generator.writeStringField("isPpafEnabled", clientConfig.isPerPartitionAutomaticFailoverEnabledAsString); generator.writeStringField("isFalseProgSessionTokenMergeEnabled", Configs.isSessionTokenFalseProgressMergeEnabled() ? 
"true" : "false"); generator.writeStringField("excrgns", clientConfig.excludedRegionsRelatedConfig()); generator.writeObjectFieldStart("clientEndpoints"); @@ -132,6 +132,7 @@ class DiagnosticsClientConfig { private String sessionRetryOptionsAsString; private String regionScopedSessionContainerOptionsAsString; private String partitionLevelCircuitBreakerConfigAsString; + private String isPerPartitionAutomaticFailoverEnabledAsString; public DiagnosticsClientConfig withMachineId(String machineId) { this.machineId = machineId; @@ -254,6 +255,17 @@ public DiagnosticsClientConfig withPartitionLevelCircuitBreakerConfig(PartitionL return this; } + public DiagnosticsClientConfig withIsPerPartitionAutomaticFailoverEnabled(boolean isPpafEnabled) { + + if (isPpafEnabled) { + this.isPerPartitionAutomaticFailoverEnabledAsString = "true"; + } else { + this.isPerPartitionAutomaticFailoverEnabledAsString = "false"; + } + + return this; + } + public DiagnosticsClientConfig withRegionScopedSessionContainerOptions(RegionScopedSessionContainer regionScopedSessionContainer) { if (regionScopedSessionContainer == null) { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index dd66d69a7425..e7f6c7ed88bd 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -47,7 +47,7 @@ public class GlobalEndpointManager implements AutoCloseable { private volatile boolean isClosed; private volatile DatabaseAccount latestDatabaseAccount; private final AtomicBoolean hasThinClientReadLocations = new AtomicBoolean(false); - private final AtomicBoolean lastRecordedPerPartitionAutomaticFailoverEnabled = new AtomicBoolean(false); + private final AtomicBoolean 
lastRecordedPerPartitionAutomaticFailoverEnabledOnClient = new AtomicBoolean(Configs.isPerPartitionAutomaticFailoverEnabled().equalsIgnoreCase("true")); private final ReentrantReadWriteLock.WriteLock databaseAccountWriteLock; @@ -376,13 +376,13 @@ private Mono getDatabaseAccountAsync(URI serviceEndpoint) { Collection thinClientReadLocations = databaseAccount.getThinClientReadableLocations(); this.hasThinClientReadLocations.set(thinClientReadLocations != null && !thinClientReadLocations.isEmpty()); - Boolean currentPerPartitionAutomaticFailoverEnabled = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); + Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); - if (!Objects.equals(currentPerPartitionAutomaticFailoverEnabled, this.lastRecordedPerPartitionAutomaticFailoverEnabled.get())) { - this.lastRecordedPerPartitionAutomaticFailoverEnabled.set(Boolean.TRUE.equals(currentPerPartitionAutomaticFailoverEnabled)); + if (currentPerPartitionAutomaticFailoverEnabledFromService != null && !Objects.equals(currentPerPartitionAutomaticFailoverEnabledFromService, this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.get())) { + this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.set(currentPerPartitionAutomaticFailoverEnabledFromService); if (this.perPartitionAutomaticFailoverConfigModifier != null) { - logger.warn("Per partition automatic failover enabled: {}, applying modifier", currentPerPartitionAutomaticFailoverEnabled); + logger.warn("Per partition automatic failover enabled: {}, applying modifier", currentPerPartitionAutomaticFailoverEnabledFromService); this.perPartitionAutomaticFailoverConfigModifier.apply(databaseAccount); } } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 8b42e24ea500..34dc83b5e173 100644 --- 
a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -7229,7 +7229,6 @@ private Mono> wrapPointOperationWithAvailabilityStrat return Mono .firstWithValue(monoList) .flatMap(nonTransientResult -> { - logger.warn("L7216 - mergeContext - nonTransientResult: {}", nonTransientResult); diagnosticsFactory.merge(nonNullRequestOptions); if (nonTransientResult.isError()) { return Mono.error(nonTransientResult.exception); @@ -7802,6 +7801,7 @@ private synchronized void initializePerPartitionFailover(DatabaseAccount databas checkNotNull(this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker, "Argument 'globalPartitionEndpointManagerForPerPartitionCircuitBreaker' cannot be null."); this.diagnosticsClientConfig.withPartitionLevelCircuitBreakerConfig(this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker.getCircuitBreakerConfig()); + this.diagnosticsClientConfig.withIsPerPartitionAutomaticFailoverEnabled(this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover.isPerPartitionAutomaticFailoverEnabled()); } private void initializePerPartitionAutomaticFailover(DatabaseAccount databaseAccountSnapshot) { @@ -8008,7 +8008,6 @@ public CosmosDiagnostics getMostRecentlyCreatedDiagnostics() { } public void merge(RequestOptions requestOptions) { - logger.warn("L7995 - merge - ScopedDiagnosticsFactory - merge(RequestOptions requestOptions)"); CosmosDiagnosticsContext knownCtx = null; if (requestOptions != null) { @@ -8044,8 +8043,6 @@ public void merge(CosmosDiagnosticsContext knownCtx) { } for (CosmosDiagnostics diagnostics : this.createdDiagnostics) { - logger.warn("L8031 - merge - (in loop) ScopedDiagnosticsFactory - merging diagnostics: {}", diagnostics); - logger.warn("L8032 - merge - (in loop) ScopedDiagnosticsFactory - merging diagnostics {} - - is empty : {}", diagnostics, 
diagnosticsAccessor.isNotEmpty(diagnostics)); if (diagnostics.getDiagnosticsContext() == null && diagnosticsAccessor.isNotEmpty(diagnostics)) { if (this.shouldCaptureAllFeedDiagnostics && diagnosticsAccessor.getFeedResponseDiagnostics(diagnostics) != null) { diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java index 4df3e5b946b1..a4170e09a951 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java @@ -541,8 +541,6 @@ private Mono toDocumentServiceResponse(Mono userAgentFeatureFlags) { - if (userAgentFeatureFlags == null || userAgentFeatureFlags.isEmpty()) { - return; - } + public void setFeatureEnabledFlagsAsSuffix(Set userAgentFeatureFlags) { + writeLock.lock(); + try { + if (userAgentFeatureFlags == null || userAgentFeatureFlags.isEmpty()) { + return; + } - int value = 0; + int value = 0; - for (UserAgentFeatureFlags userAgentFeatureFlag : userAgentFeatureFlags) { - value += userAgentFeatureFlag.getValue(); - } + for (UserAgentFeatureFlags userAgentFeatureFlag : userAgentFeatureFlags) { + value += userAgentFeatureFlag.getValue(); + } - this.userAgent = this.baseUserAgentWithSuffix; - this.userAgent = this.userAgent + "|F" + Integer.toHexString(value).toUpperCase(Locale.ROOT); + this.userAgent = !Strings.isNullOrEmpty(this.baseUserAgentWithSuffix) ? 
this.baseUserAgentWithSuffix : this.baseUserAgent; + this.userAgent = this.userAgent + "|F" + Integer.toHexString(value).toUpperCase(Locale.ROOT); + } finally { + writeLock.unlock(); + } } public void setSuffix(String suffix) { - if (suffix.length() > maxSuffixLength) { - suffix = suffix.substring(0, maxSuffixLength); + writeLock.lock(); + try { + if (suffix == null) { + suffix = ""; + } + + if (suffix.length() > maxSuffixLength) { + suffix = suffix.substring(0, maxSuffixLength); + } + + this.suffix = suffix; + this.userAgent = stripNonAsciiCharacters(baseUserAgent.concat(" ").concat(this.suffix)); + this.baseUserAgentWithSuffix = this.userAgent; + } finally { + writeLock.unlock(); } - - this.suffix = suffix; - this.userAgent = stripNonAsciiCharacters(baseUserAgent.concat(" ").concat(this.suffix)); - this.baseUserAgentWithSuffix = this.userAgent; } public String getUserAgent() { - return this.userAgent; + readLock.lock(); + try { + return this.userAgent; + } finally { + readLock.unlock(); + } } private static String stripNonAsciiCharacters(String input) { From b00a217010f21f4c49bd3e19783aff2441b016e7 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 8 Sep 2025 10:19:42 -0400 Subject: [PATCH 10/25] Fixing PerPartitionAutomaticFailoverE2ETests. 
--- ...PerPartitionAutomaticFailoverE2ETests.java | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java index 27ed5bf6104a..c7f09c818050 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionAutomaticFailoverE2ETests.java @@ -38,7 +38,6 @@ import com.azure.cosmos.implementation.http.HttpResponse; import com.azure.cosmos.implementation.routing.PartitionKeyInternalHelper; import com.azure.cosmos.implementation.routing.RegionalRoutingContext; -import com.azure.cosmos.implementation.throughputControl.TestItem; import com.azure.cosmos.models.CosmosBatch; import com.azure.cosmos.models.CosmosBatchResponse; import com.azure.cosmos.models.CosmosChangeFeedRequestOptions; @@ -1345,7 +1344,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( DatabaseAccount latestDatabaseAccountSnapshot = globalEndpointManager.getLatestDatabaseAccount(); globalEndpointManager.refreshLocationAsync(latestDatabaseAccountSnapshot, true).block(); - TestItem testItem = TestItem.createNewItem(); + TestObject testItem = TestObject.create(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -1447,7 +1446,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodes( shouldThrowReadTimeoutExceptionWhenNetworkError, shouldUseE2ETimeout); - TestItem testItem = TestItem.createNewItem(); + TestObject testItem = TestObject.create(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -1600,7 +1599,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic regionalRoutingContextWithIssues, cosmosException); - TestItem testItem 
= TestItem.createNewItem(); + TestObject testItem = TestObject.create(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -1699,7 +1698,7 @@ public void testPpafWithWriteFailoverWithEligibleErrorStatusCodesWithPpafDynamic shouldThrowReadTimeoutExceptionWhenNetworkError, shouldUseE2ETimeout); - TestItem testItem = TestItem.createNewItem(); + TestObject testItem = TestObject.create(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -1885,7 +1884,7 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( cosmosException); // Prepare operation invocation - TestItem testItem = TestItem.createNewItem(); + TestObject testItem = TestObject.create(); Function> dataPlaneOperation = resolveDataPlaneOperation(operationType); @@ -2031,7 +2030,7 @@ public void testFailoverBehaviorForNonWriteOperationsWithPpafDynamicEnablement( .block(); // Seed item for read/readMany scenarios - TestItem testItem = TestItem.createNewItem(); + TestObject testItem = TestObject.create(); asyncContainer.createItem(testItem).block(); // Prepare params + operation @@ -2342,7 +2341,7 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem createdTestObject = paramsWrapper.createdTestItem; + TestObject createdTestObject = paramsWrapper.createdTestItem; CosmosItemRequestOptions itemRequestOptions = paramsWrapper.itemRequestOptions; try { @@ -2369,12 +2368,12 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem createdTestObject = paramsWrapper.createdTestItem; + TestObject createdTestObject = paramsWrapper.createdTestItem; CosmosItemRequestOptions itemRequestOptions = paramsWrapper.itemRequestOptions; try { - CosmosItemResponse upsertItemResponse = asyncContainer.upsertItem( + CosmosItemResponse upsertItemResponse = asyncContainer.upsertItem( createdTestObject, 
new PartitionKey(createdTestObject.getMypk()), itemRequestOptions) @@ -2395,12 +2394,12 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem createdTestObject = TestItem.createNewItem(); + TestObject createdTestObject = TestObject.create(); CosmosItemRequestOptions itemRequestOptions = paramsWrapper.itemRequestOptions; try { - CosmosItemResponse createItemResponse = asyncContainer.createItem( + CosmosItemResponse createItemResponse = asyncContainer.createItem( createdTestObject, new PartitionKey(createdTestObject.getMypk()), itemRequestOptions) @@ -2421,7 +2420,7 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem createdTestObject = paramsWrapper.createdTestItem; + TestObject createdTestObject = paramsWrapper.createdTestItem; CosmosItemRequestOptions itemRequestOptions = paramsWrapper.itemRequestOptions; try { @@ -2447,19 +2446,19 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem createdTestObject = paramsWrapper.createdTestItem; + TestObject createdTestObject = paramsWrapper.createdTestItem; CosmosPatchItemRequestOptions patchItemRequestOptions = (CosmosPatchItemRequestOptions) paramsWrapper.patchItemRequestOptions; CosmosPatchOperations patchOperations = CosmosPatchOperations.create().add("/number", 555); try { - CosmosItemResponse patchItemResponse = asyncContainer.patchItem( + CosmosItemResponse patchItemResponse = asyncContainer.patchItem( createdTestObject.getId(), new PartitionKey(createdTestObject.getMypk()), patchOperations, patchItemRequestOptions, - TestItem.class) + TestObject.class) .block(); return new ResponseWrapper<>(patchItemResponse); @@ -2519,12 +2518,12 @@ private Function> resolveDa return (paramsWrapper) -> { CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; - TestItem 
createdTestObject = paramsWrapper.createdTestItem; + TestObject createdTestObject = paramsWrapper.createdTestItem; CosmosItemRequestOptions itemRequestOptions = paramsWrapper.itemRequestOptions; try { - CosmosItemResponse deleteItemResponse = asyncContainer.replaceItem( + CosmosItemResponse deleteItemResponse = asyncContainer.replaceItem( createdTestObject, createdTestObject.getId(), new PartitionKey(createdTestObject.getId()), @@ -2545,7 +2544,7 @@ private Function> resolveDa case Batch: return (paramsWrapper) -> { - TestItem testObject = TestItem.createNewItem(); + TestObject testObject = TestObject.create(); CosmosBatch batch = CosmosBatch.createCosmosBatch(new PartitionKey(testObject.getId())); CosmosAsyncContainer asyncContainer = paramsWrapper.asyncContainer; @@ -2570,9 +2569,9 @@ private Function> resolveDa try { - FeedResponse feedResponseFromChangeFeed = asyncContainer.queryChangeFeed( + FeedResponse feedResponseFromChangeFeed = asyncContainer.queryChangeFeed( CosmosChangeFeedRequestOptions.createForProcessingFromBeginning(paramsWrapper.feedRangeToDrainForChangeFeed == null ? 
FeedRange.forFullRange() : paramsWrapper.feedRangeToDrainForChangeFeed), - TestItem.class) + TestObject.class) .byPage() .blockLast(); @@ -2630,7 +2629,7 @@ private static class ResponseWrapper { private static class OperationInvocationParamsWrapper { public CosmosAsyncContainer asyncContainer; - public TestItem createdTestItem; + public TestObject createdTestItem; public CosmosItemRequestOptions itemRequestOptions; public CosmosQueryRequestOptions queryRequestOptions; public CosmosItemRequestOptions patchItemRequestOptions; @@ -2761,7 +2760,7 @@ private enum QueryFlavor { QUERY_ITEMS // Arbitrary filter } - private void applyQueryFlavor(OperationInvocationParamsWrapper params, QueryFlavor flavor, TestItem seed) { + private void applyQueryFlavor(OperationInvocationParamsWrapper params, QueryFlavor flavor, TestObject seed) { if (flavor == QueryFlavor.NONE) { // Do not set CosmosQueryRequestOptions explicitly params.querySql = null; From ee4556245f14e0ac62731d09464b1b71a3c7d610 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 8 Sep 2025 13:13:12 -0400 Subject: [PATCH 11/25] Updated CHANGELOG.md --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index 9a57f6451106..b45b1cc063de 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,6 +3,7 @@ ### 4.75.0-beta.1 (Unreleased) #### Features Added +* Enabled `CosmosClient` to support per-partition automatic failover dynamically without the need to restart the application. - See [PR 46477](https://github.com/Azure/azure-sdk-for-java/pull/46477) #### Breaking Changes From 39da8a2b829e91169df6d9dd4fff824a54718d00 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 11 Sep 2025 17:27:43 -0400 Subject: [PATCH 12/25] Attempt to fix diagnostics. 
--- .../com/azure/cosmos/implementation/RxGatewayStoreModel.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java index 764f961fd93b..8c863731386e 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java @@ -547,6 +547,8 @@ private Mono toDocumentServiceResponse(Mono toDocumentServiceResponse(Mono Date: Sun, 14 Sep 2025 14:09:21 -0400 Subject: [PATCH 13/25] Have PPCB rely on user-enforced sys prop when PPAF is disabled on account. --- .../implementation/RxDocumentClientImpl.java | 29 ++++++++---------- .../implementation/RxGatewayStoreModel.java | 30 +++++++++++++++++-- ...tManagerForPerPartitionCircuitBreaker.java | 10 +++---- .../PartitionLevelCircuitBreakerConfig.java | 16 ++++++++++ 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 63b4c5982267..0a9aaa7761bc 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -751,6 +751,7 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func this.globalEndpointManager.setPerPartitionAutomaticFailoverConfigModifier(this.perPartitionFailoverConfigModifier); this.globalEndpointManager.init(); + this.initializePerPartitionCircuitBreaker(); DatabaseAccount databaseAccountSnapshot = this.initializeGatewayConfigurationReader(); 
this.resetSessionContainerIfNeeded(databaseAccountSnapshot); @@ -7374,8 +7375,7 @@ private CosmosEndToEndOperationLatencyPolicyConfig evaluatePpafEnforcedE2eLatenc if (Configs.isReadAvailabilityStrategyEnabledWithPpaf()) { - logger.warn("Availability strategy for reads, queries, read all and read many" + - " is enabled when PerPartitionAutomaticFailover is enabled."); + logger.warn("As Per-Partition Automatic Failover (PPAF) is enabled a default End-to-End Operation Latency Policy will be applied for read, query, readAll and readyMany operation types."); if (connectionPolicy.getConnectionMode() == ConnectionMode.DIRECT) { Duration networkRequestTimeout = connectionPolicy.getTcpNetworkRequestTimeout(); @@ -7833,24 +7833,19 @@ private void initializePerPartitionAutomaticFailover(DatabaseAccount databaseAcc } private void initializePerPartitionCircuitBreaker() { - if (this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover.isPerPartitionAutomaticFailoverEnabled()) { - PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig = Configs.getPartitionLevelCircuitBreakerConfig(); + PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig; - if (partitionLevelCircuitBreakerConfig != null && !partitionLevelCircuitBreakerConfig.isPartitionLevelCircuitBreakerEnabled()) { - logger.warn("Per-Partition Circuit Breaker is enabled by default when Per-Partition Automatic Failover is enabled."); - System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": true}"); - } + if (this.globalPartitionEndpointManagerForPerPartitionAutomaticFailover.isPerPartitionAutomaticFailoverEnabled()) { + // Override custom config to enabled if PPAF is enabled + logger.warn("Per-Partition Circuit Breaker is enabled because Per-Partition Automatic Failover is enabled."); + partitionLevelCircuitBreakerConfig = PartitionLevelCircuitBreakerConfig.fromExplicitArgs(Boolean.TRUE); } else { - 
PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig = Configs.getPartitionLevelCircuitBreakerConfig(); - - if (partitionLevelCircuitBreakerConfig != null && partitionLevelCircuitBreakerConfig.isPartitionLevelCircuitBreakerEnabled()) { - logger.warn("Per-Partition Circuit Breaker is disabled by default when Per-Partition Automatic Failover is disabled."); - System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": false}"); - } + logger.warn("As Per-Partition Automatic Failover is disabled, Per-Partition Circuit Breaker will be enabled or disabled based on client configuration."); + partitionLevelCircuitBreakerConfig = Configs.getPartitionLevelCircuitBreakerConfig(); } - this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker.resetCircuitBreakerConfig(); + this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker.resetCircuitBreakerConfig(partitionLevelCircuitBreakerConfig); this.globalPartitionEndpointManagerForPerPartitionCircuitBreaker.init(); } @@ -7861,9 +7856,9 @@ private void enableAvailabilityStrategyForReads() { ); if (this.ppafEnforcedE2ELatencyPolicyConfigForReads != null) { - logger.warn("Per-Partition Automatic Failover enforced E2E Latency Policy for reads is enabled."); + logger.warn("Per-Partition Automatic Failover (PPAF) enforced E2E Latency Policy for reads is enabled."); } else { - logger.warn("Per-Partition Automatic Failover enforced E2E Latency Policy for reads is disabled."); + logger.warn("Per-Partition Automatic Failover (PPAF) enforced E2E Latency Policy for reads is disabled."); } } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java index 8c863731386e..48dce105561b 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java +++ 
b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxGatewayStoreModel.java @@ -558,8 +558,34 @@ private Mono toDocumentServiceResponse(Mono globalAddressResolverSnapshot; private final ConcurrentHashMap regionalRoutingContextToRegion; private final AtomicBoolean isClosed = new AtomicBoolean(false); + private final AtomicBoolean isPartitionRecoveryTaskRunning = new AtomicBoolean(false); private final Scheduler partitionRecoveryScheduler = Schedulers.newSingle( "partition-availability-staleness-check", true); @@ -65,8 +66,8 @@ public GlobalPartitionEndpointManagerForPerPartitionCircuitBreaker(GlobalEndpoin } public void init() { - if (this.consecutiveExceptionBasedCircuitBreaker.isPartitionLevelCircuitBreakerEnabled()) { - this.updateStaleLocationInfo().subscribeOn(this.partitionRecoveryScheduler).subscribe(); + if (this.consecutiveExceptionBasedCircuitBreaker.isPartitionLevelCircuitBreakerEnabled() && !this.isPartitionRecoveryTaskRunning.get()) { + this.updateStaleLocationInfo().subscribeOn(this.partitionRecoveryScheduler).doOnSubscribe(ignore -> this.isPartitionRecoveryTaskRunning.set(true)).subscribe(); } } @@ -554,10 +555,7 @@ public PartitionLevelCircuitBreakerConfig getCircuitBreakerConfig() { return this.consecutiveExceptionBasedCircuitBreaker.getPartitionLevelCircuitBreakerConfig(); } - public synchronized void resetCircuitBreakerConfig() { - PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig - = Configs.getPartitionLevelCircuitBreakerConfig(); - + public synchronized void resetCircuitBreakerConfig(PartitionLevelCircuitBreakerConfig partitionLevelCircuitBreakerConfig) { this.consecutiveExceptionBasedCircuitBreaker = new ConsecutiveExceptionBasedCircuitBreaker(partitionLevelCircuitBreakerConfig); diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/PartitionLevelCircuitBreakerConfig.java 
b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/PartitionLevelCircuitBreakerConfig.java index 672fa5ef830d..011337201301 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/PartitionLevelCircuitBreakerConfig.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/perPartitionCircuitBreaker/PartitionLevelCircuitBreakerConfig.java @@ -76,4 +76,20 @@ public static PartitionLevelCircuitBreakerConfig fromJsonString(String jsonStrin throw new RuntimeException("Unable to convert from Json String", e); } } + + public static PartitionLevelCircuitBreakerConfig fromExplicitArgs( + Boolean isPartitionLevelCircuitBreakerEnabled) { + + PartitionLevelCircuitBreakerConfig config = new PartitionLevelCircuitBreakerConfig(); + + if (isPartitionLevelCircuitBreakerEnabled != null) { + config.isPartitionLevelCircuitBreakerEnabled = isPartitionLevelCircuitBreakerEnabled; + } + + config.circuitBreakerType = DEFAULT.getCircuitBreakerType(); + config.consecutiveExceptionCountToleratedForReads = DEFAULT.getConsecutiveExceptionCountToleratedForReads(); + config.consecutiveExceptionCountToleratedForWrites = DEFAULT.getConsecutiveExceptionCountToleratedForWrites(); + + return config; + } } From c49e853df77f40ff9a9334b381b19554481b3931 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Sun, 14 Sep 2025 14:14:02 -0400 Subject: [PATCH 14/25] Move return early logic before lock acquisition. 
--- .../azure/cosmos/implementation/UserAgentContainer.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java index 76d0cc395923..ddffd20ae2e6 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/UserAgentContainer.java @@ -49,11 +49,13 @@ public String getSuffix() { } public void setFeatureEnabledFlagsAsSuffix(Set userAgentFeatureFlags) { + + if (userAgentFeatureFlags == null || userAgentFeatureFlags.isEmpty()) { + return; + } + writeLock.lock(); try { - if (userAgentFeatureFlags == null || userAgentFeatureFlags.isEmpty()) { - return; - } int value = 0; From 88f6fb7dfbbb5b418a6d28dd363572f68b79c007 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Sun, 14 Sep 2025 14:24:23 -0400 Subject: [PATCH 15/25] Modify PPAF config change detection logic. 
--- .../cosmos/implementation/GlobalEndpointManager.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index c1b4777aa14f..6233987130b5 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -368,10 +368,10 @@ private Mono getDatabaseAccountAsync(URI serviceEndpoint) { Collection thinClientReadLocations = databaseAccount.getThinClientReadableLocations(); this.hasThinClientReadLocations.set(thinClientReadLocations != null && !thinClientReadLocations.isEmpty()); - Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); - if (currentPerPartitionAutomaticFailoverEnabledFromService != null && !Objects.equals(currentPerPartitionAutomaticFailoverEnabledFromService, this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.get())) { - this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.set(currentPerPartitionAutomaticFailoverEnabledFromService); + if (this.hasPerPartitionAutomaticFailoverConfigChanged(databaseAccount)) { + Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); + this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.set(Boolean.TRUE.equals(currentPerPartitionAutomaticFailoverEnabledFromService)); if (this.perPartitionAutomaticFailoverConfigModifier != null) { logger.warn("Per partition automatic failover enabled: {}, applying modifier", currentPerPartitionAutomaticFailoverEnabledFromService); @@ -429,4 +429,9 @@ private List getEffectivePreferredRegions() { public void setPerPartitionAutomaticFailoverConfigModifier(Function 
perPartitionAutomaticFailoverConfigModifier) { this.perPartitionAutomaticFailoverConfigModifier = perPartitionAutomaticFailoverConfigModifier; } + + private boolean hasPerPartitionAutomaticFailoverConfigChanged(DatabaseAccount databaseAccount) { + Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); + return currentPerPartitionAutomaticFailoverEnabledFromService != null && !Objects.equals(currentPerPartitionAutomaticFailoverEnabledFromService, this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.get()); + } } From ad65018f83100317051bc63e31d24835414ac039 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Sun, 14 Sep 2025 18:34:44 -0400 Subject: [PATCH 16/25] Fixing a cross-region availability strategy test where suppressServiceRequests needs to be set to true. --- .../cosmos/PerPartitionCircuitBreakerE2ETests.java | 2 +- .../cosmos/implementation/RxDocumentClientImpl.java | 10 ++++++++-- .../cosmos/implementation/RxGatewayStoreModel.java | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionCircuitBreakerE2ETests.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionCircuitBreakerE2ETests.java index 7cbae5749237..2fa9a5e2693e 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionCircuitBreakerE2ETests.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/PerPartitionCircuitBreakerE2ETests.java @@ -4731,7 +4731,7 @@ private static List buildGwResponseDelayInjectionRulesNotSco FaultInjectionServerErrorResult faultInjectionServerErrorResult = FaultInjectionResultBuilders .getResultBuilder(FaultInjectionServerErrorType.RESPONSE_DELAY) .delay(paramsWrapper.getResponseDelay()) - .suppressServiceRequests(false) + .suppressServiceRequests(true) .build(); List faultInjectionRules = new ArrayList<>(); diff --git
a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 0a9aaa7761bc..7463677c6a59 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -19,6 +19,7 @@ import com.azure.cosmos.CosmosItemSerializer; import com.azure.cosmos.CosmosOperationPolicy; import com.azure.cosmos.DirectConnectionConfig; +import com.azure.cosmos.Http2ConnectionConfig; import com.azure.cosmos.ReadConsistencyStrategy; import com.azure.cosmos.SessionRetryOptions; import com.azure.cosmos.ThresholdBasedAvailabilityStrategy; @@ -1430,8 +1431,13 @@ private void addUserAgentSuffix(UserAgentContainer userAgentContainer, Set Date: Sun, 14 Sep 2025 19:37:43 -0400 Subject: [PATCH 17/25] Fixing user agent suffix. --- .../azure/cosmos/implementation/RxDocumentClientImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 7463677c6a59..6127dd97b3c7 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -1431,7 +1431,11 @@ private void addUserAgentSuffix(UserAgentContainer userAgentContainer, Set Date: Wed, 17 Sep 2025 23:16:15 -0400 Subject: [PATCH 18/25] Adjusting tests to accommodate 408-20008 in cases where the main request doesn't satisfy the response in hedging flows. 
--- ...tionWithAvailabilityStrategyTestsBase.java | 66 ++++++++++++------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTestsBase.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTestsBase.java index d25ba51cecaf..682fc7b46773 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTestsBase.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/FaultInjectionWithAvailabilityStrategyTestsBase.java @@ -2359,6 +2359,15 @@ public Object[][] testConfigs_queryAfterCreation() { } }; + BiConsumer validateCtxUtmostRegions = + (ctx, expectedNumberOfRegionsContacted) -> { + assertThat(ctx).isNotNull(); + if (ctx != null) { + assertThat(ctx.getContactedRegionNames().size()).isGreaterThanOrEqualTo(1); + assertThat(ctx.getContactedRegionNames().size()).isLessThanOrEqualTo(expectedNumberOfRegionsContacted); + } + }; + Consumer validateCtxQueryPlan = (ctx) -> { assertThat(ctx).isNotNull(); @@ -2429,6 +2438,9 @@ public Object[][] testConfigs_queryAfterCreation() { Consumer validateCtxTwoRegions = (ctx) -> validateCtxRegions.accept(ctx, TWO_REGIONS); + Consumer validateCtxUtmostTwoRegions = + (ctx) -> validateCtxUtmostRegions.accept(ctx, TWO_REGIONS); + Consumer validateCtxFirstRegionFailureSecondRegionSuccessfulSingleFeedResponse = (ctx) -> { CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); assertThat(diagnostics.length).isEqualTo(3); @@ -3270,7 +3282,7 @@ public Object[][] testConfigs_queryAfterCreation() { CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); // Diagnostics of query attempt in first region not even available yet - assertThat(diagnostics.length).isEqualTo(2); + assertThat(diagnostics.length).isGreaterThanOrEqualTo(2); // query plan on first 
region assertThat(diagnostics[0].getContactedRegionNames().size()).isEqualTo(1); @@ -3279,13 +3291,13 @@ public Object[][] testConfigs_queryAfterCreation() { (ctx) -> { assertThat(ctx.getDiagnostics()).isNotNull(); CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); - assertThat(diagnostics[1].getContactedRegionNames().size()).isEqualTo(1); - assertThat(diagnostics[1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); - assertThat(diagnostics[1].getFeedResponseDiagnostics()).isNotNull(); - assertThat(diagnostics[1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); - assertThat(diagnostics[1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); ClientSideRequestStatistics[] clientStats = - diagnostics[1] + diagnostics[diagnostics.length - 1] .getFeedResponseDiagnostics() .getClientSideRequestStatistics() .toArray(new ClientSideRequestStatistics[0]); @@ -3301,17 +3313,17 @@ public Object[][] testConfigs_queryAfterCreation() { } ), ArrayUtils.toArray( - validateCtxSingleRegion, + validateCtxUtmostTwoRegions, (ctx) -> { assertThat(ctx.getDiagnostics()).isNotNull(); CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); - assertThat(diagnostics[0].getContactedRegionNames().size()).isEqualTo(1); - assertThat(diagnostics[0].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); 
- assertThat(diagnostics[0].getFeedResponseDiagnostics()).isNotNull(); - assertThat(diagnostics[0].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); - assertThat(diagnostics[0].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); ClientSideRequestStatistics[] clientStats = - diagnostics[0] + diagnostics[diagnostics.length - 1] .getFeedResponseDiagnostics() .getClientSideRequestStatistics() .toArray(new ClientSideRequestStatistics[0]); @@ -4067,10 +4079,14 @@ public Object[][] testConfigs_readAllAfterCreation() { for (int i = start; i < diagnostics.length; i++) { CosmosDiagnostics currentDiagnostics = diagnostics[i]; - assertThat(currentDiagnostics.getFeedResponseDiagnostics()).isNotNull(); - assertThat(currentDiagnostics.getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); - assertThat(currentDiagnostics.getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); - assertThat(currentDiagnostics.getFeedResponseDiagnostics().getClientSideRequestStatistics().size()).isGreaterThanOrEqualTo(1); + + if (i != diagnostics.length - 1) { + assertThat(currentDiagnostics.getFeedResponseDiagnostics()).isNull(); + } else { + assertThat(currentDiagnostics.getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(currentDiagnostics.getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + 
assertThat(currentDiagnostics.getFeedResponseDiagnostics().getClientSideRequestStatistics().size()).isGreaterThanOrEqualTo(1); + } } } }; @@ -4579,14 +4595,14 @@ public Object[][] testConfigs_readAllAfterCreation() { CosmosDiagnostics[] diagnostics = ctx.getDiagnostics().toArray(new CosmosDiagnostics[0]); // Diagnostics of query attempt in first region not even available yet - assertThat(diagnostics.length).isEqualTo(2); - assertThat(diagnostics[1].getContactedRegionNames().size()).isEqualTo(1); - assertThat(diagnostics[1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); - assertThat(diagnostics[1].getFeedResponseDiagnostics()).isNotNull(); - assertThat(diagnostics[1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); - assertThat(diagnostics[1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); + assertThat(diagnostics.length).isGreaterThanOrEqualTo(2); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().size()).isEqualTo(1); + assertThat(diagnostics[diagnostics.length - 1].getContactedRegionNames().iterator().next()).isEqualTo(SECOND_REGION_NAME); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getQueryMetricsMap()).isNotNull(); + assertThat(diagnostics[diagnostics.length - 1].getFeedResponseDiagnostics().getClientSideRequestStatistics()).isNotNull(); ClientSideRequestStatistics[] clientStats = - diagnostics[1] + diagnostics[diagnostics.length - 1] .getFeedResponseDiagnostics() .getClientSideRequestStatistics() .toArray(new ClientSideRequestStatistics[0]); From 79dc9f26305beb57c0fc666adf910a7a545c58a2 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 18 Sep 2025 10:01:18 -0400 Subject: [PATCH 19/25] Adjust tests to validate user agents suffixed differently due to HTTP/2 opt-in. 
--- .../azure/cosmos/CosmosDiagnosticsTest.java | 18 +++++++++++------- .../com/azure/cosmos/UserAgentSuffixTest.java | 17 ++++++++++++++--- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/CosmosDiagnosticsTest.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/CosmosDiagnosticsTest.java index c5b7129ea727..5eb5b59867db 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/CosmosDiagnosticsTest.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/CosmosDiagnosticsTest.java @@ -154,7 +154,7 @@ public void beforeClass() { UserAgentContainer userAgentContainer = new UserAgentContainer(); userAgentContainer.setSuffix(USER_AGENT_SUFFIX_GATEWAY_CLIENT); - this.gatewayClientUserAgent = userAgentContainer.getUserAgent(); + this.gatewayClientUserAgent = generateHttp2OptedInUserAgentIfRequired(userAgentContainer.getUserAgent()); directClient = new CosmosClientBuilder() .endpoint(TestConfigurations.HOST) @@ -164,7 +164,7 @@ public void beforeClass() { .directMode() .buildClient(); userAgentContainer.setSuffix(USER_AGENT_SUFFIX_DIRECT_CLIENT); - this.directClientUserAgent = userAgentContainer.getUserAgent(); + this.directClientUserAgent = generateHttp2OptedInUserAgentIfRequired(userAgentContainer.getUserAgent()); cosmosAsyncContainer = getSharedMultiPartitionCosmosContainer(this.gatewayClient.asyncClient()); cosmosAsyncDatabase = directClient.asyncClient().getDatabase(cosmosAsyncContainer.getDatabase().getId()); @@ -271,7 +271,6 @@ public void queryChangeFeedAllVersionsAndDeletes() { FeedResponse response = results.next(); String diagnostics = response.getCosmosDiagnostics().toString(); assertThat(diagnostics).contains("\"connectionMode\":\"GATEWAY\""); - assertThat(diagnostics).contains("\"userAgent\":\"" + this.gatewayClientUserAgent + "\""); assertThat(diagnostics).contains("gatewayStatisticsList"); 
assertThat(diagnostics).contains("\"operationType\":\"ReadFeed\""); assertThat(diagnostics).contains("\"userAgent\":\"" + this.gatewayClientUserAgent + "\""); @@ -298,7 +297,7 @@ public void queryChangeFeedIncrementalDirectMode() throws Exception { FeedResponse response = results.next(); String diagnostics = response.getCosmosDiagnostics().toString(); assertThat(diagnostics).contains("\"connectionMode\":\"DIRECT\""); - assertThat(diagnostics).contains("\"userAgent\":\"" + this.directClientUserAgent + "\""); + assertThat(diagnostics).contains("\"userAgent\":\"" + generateHttp2OptedInUserAgentIfRequired(this.directClientUserAgent) + "\""); assertThat(diagnostics).contains("\"requestOperationType\":\"ReadFeed\""); } } @@ -324,7 +323,6 @@ public void queryChangeFeedIncrementalGatewayMode() throws Exception { assertThat(diagnostics).contains("\"userAgent\":\"" + this.gatewayClientUserAgent + "\""); assertThat(diagnostics).contains("gatewayStatisticsList"); assertThat(diagnostics).contains("\"operationType\":\"ReadFeed\""); - assertThat(diagnostics).contains("\"userAgent\":\"" + this.gatewayClientUserAgent + "\""); } } @@ -349,7 +347,6 @@ public void gatewayDiagnostics() throws Exception { String diagnostics = createResponse.getDiagnostics().toString(); logger.info("DIAGNOSTICS: {}", diagnostics); assertThat(diagnostics).contains("\"connectionMode\":\"GATEWAY\""); - assertThat(diagnostics).contains("\"userAgent\":\"" + this.gatewayClientUserAgent + "\""); assertThat(diagnostics).contains("gatewayStatisticsList"); assertThat(diagnostics).contains("\"operationType\":\"Create\""); assertThat(diagnostics).contains("\"metaDataName\":\"CONTAINER_LOOK_UP\""); @@ -392,7 +389,6 @@ public void gatewayDiagnosticsOnException() throws Exception { assertThat(diagnostics).contains("gatewayStatisticsList"); assertThat(diagnostics).contains("\"statusCode\":404"); assertThat(diagnostics).contains("\"operationType\":\"Read\""); - assertThat(diagnostics).contains("\"userAgent\":\"" + 
this.gatewayClientUserAgent + "\""); assertThat(diagnostics).contains("\"exceptionMessage\":\"Entity with the specified id does not exist in the system."); assertThat(diagnostics).contains("\"exceptionResponseHeaders\""); assertThat(diagnostics).doesNotContain("\"exceptionResponseHeaders\": \"{}\""); @@ -1971,6 +1967,14 @@ private void validateChannelAcquisitionContext(CosmosDiagnostics diagnostics, bo } } + private String generateHttp2OptedInUserAgentIfRequired(String userAgent) { + if (Configs.isHttp2Enabled()) { + userAgent = userAgent + "|F10"; + } + + return userAgent; + } + private CosmosDiagnostics performDocumentOperation( CosmosAsyncContainer cosmosAsyncContainer, OperationType operationType, diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java index 9b74cba3ca05..45f02a3641b1 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java @@ -6,6 +6,7 @@ package com.azure.cosmos; +import com.azure.cosmos.implementation.Configs; import com.azure.cosmos.models.CosmosContainerResponse; import com.azure.cosmos.rx.TestSuiteBase; import org.testng.annotations.AfterClass; @@ -54,6 +55,7 @@ public void userAgentSuffixWithoutSpecialCharacter() { assertThat(response.getProperties().getId()).isEqualTo(this.containerName); assertThat(response.getDiagnostics()).isNotNull(); assertThat(response.getDiagnostics().getUserAgent()).endsWith("TestUserAgent"); + validateUserAgentSuffix(response.getDiagnostics().getUserAgent(), "TestUserAgent"); } @Test(groups = { "fast", "emulator" }, timeOut = TIMEOUT) @@ -70,7 +72,7 @@ public void userAgentSuffixWithSpecialCharacter() { assertThat(response.getProperties()).isNotNull(); assertThat(response.getProperties().getId()).isEqualTo(this.containerName); 
assertThat(response.getDiagnostics()).isNotNull(); - assertThat(response.getDiagnostics().getUserAgent()).endsWith("TestUserAgent's"); + validateUserAgentSuffix(response.getDiagnostics().getUserAgent(), "TestUserAgent's"); } @Test(groups = { "fast", "emulator" }, timeOut = TIMEOUT) @@ -87,7 +89,7 @@ public void userAgentSuffixWithUnicodeCharacter() { assertThat(response.getProperties()).isNotNull(); assertThat(response.getProperties().getId()).isEqualTo(this.containerName); assertThat(response.getDiagnostics()).isNotNull(); - assertThat(response.getDiagnostics().getUserAgent()).endsWith("UnicodeChar_InUserAgent"); + validateUserAgentSuffix(response.getDiagnostics().getUserAgent(), "UnicodeChar_InUserAgent"); } @Test(groups = { "fast", "emulator" }, timeOut = TIMEOUT) @@ -104,6 +106,15 @@ public void userAgentSuffixWithWhitespaceAndAsciiSpecialChars() { assertThat(response.getProperties()).isNotNull(); assertThat(response.getProperties().getId()).isEqualTo(this.containerName); assertThat(response.getDiagnostics()).isNotNull(); - assertThat(response.getDiagnostics().getUserAgent()).endsWith("UserAgent with space$%_^()*&"); + validateUserAgentSuffix(response.getDiagnostics().getUserAgent(), "UserAgent with space$%_^()*&"); + } + + private void validateUserAgentSuffix(String actualUserAgent, String expectedUserAgentSuffix) { + + if (Configs.isHttp2Enabled()) { + expectedUserAgentSuffix = expectedUserAgentSuffix + "|F10"; + } + + assertThat(actualUserAgent).endsWith(expectedUserAgentSuffix); } } From 03ab8d7721a1b1af483dc1f661ee81e23bb845ae Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 22 Sep 2025 10:07:51 -0400 Subject: [PATCH 20/25] Fixing live tests. 
--- .../src/test/java/com/azure/cosmos/UserAgentSuffixTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java index 45f02a3641b1..d91b97327f80 100644 --- a/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java +++ b/sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/UserAgentSuffixTest.java @@ -54,7 +54,6 @@ public void userAgentSuffixWithoutSpecialCharacter() { assertThat(response.getProperties()).isNotNull(); assertThat(response.getProperties().getId()).isEqualTo(this.containerName); assertThat(response.getDiagnostics()).isNotNull(); - assertThat(response.getDiagnostics().getUserAgent()).endsWith("TestUserAgent"); validateUserAgentSuffix(response.getDiagnostics().getUserAgent(), "TestUserAgent"); } From 6d646a2b39b95fab70fe335d3fc4b6837409b281 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 22 Sep 2025 10:13:37 -0400 Subject: [PATCH 21/25] Fixing `isPpafEnabled` flag in `CosmosDiagnostics`. 
--- .../azure/cosmos/implementation/DiagnosticsClientContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java index 56b28af1063f..05360d332d51 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java @@ -132,7 +132,7 @@ class DiagnosticsClientConfig { private String sessionRetryOptionsAsString; private String regionScopedSessionContainerOptionsAsString; private String partitionLevelCircuitBreakerConfigAsString; - private String isPerPartitionAutomaticFailoverEnabledAsString; + private String isPerPartitionAutomaticFailoverEnabledAsString = "false"; public DiagnosticsClientConfig withMachineId(String machineId) { this.machineId = machineId; From 18300abed68102827fd3c69e9a358ade870dc151 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 22 Sep 2025 10:18:51 -0400 Subject: [PATCH 22/25] Updated CHANGELOG.md --- sdk/cosmos/azure-cosmos/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos/CHANGELOG.md b/sdk/cosmos/azure-cosmos/CHANGELOG.md index ea56ba5dec60..b622dbc7b114 100644 --- a/sdk/cosmos/azure-cosmos/CHANGELOG.md +++ b/sdk/cosmos/azure-cosmos/CHANGELOG.md @@ -3,7 +3,7 @@ ### 4.75.0-beta.1 (Unreleased) #### Features Added -* Enabled `CosmosClient` to support per-partition automatic failover dynamically without the need to restart the application. - See [PR 46477](https://github.com/Azure/azure-sdk-for-java/pull/46477) +* Enabled `Cosmos(Async)Client` to support per-partition automatic failover dynamically without the need to restart the application. 
- See [PR 46477](https://github.com/Azure/azure-sdk-for-java/pull/46477) #### Breaking Changes From b066cf14f713d4c1b138af47027ac5726da53eda Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 22 Sep 2025 17:49:02 -0400 Subject: [PATCH 23/25] Attempt to fix memory leak. --- .../com/azure/cosmos/implementation/GlobalEndpointManager.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index 6233987130b5..97868dfb2d98 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -190,6 +190,7 @@ public boolean canUseMultipleWriteLocations(RxDocumentServiceRequest request) { public void close() { this.isClosed = true; + this.perPartitionAutomaticFailoverConfigModifier = null; this.scheduler.dispose(); logger.debug("GlobalEndpointManager closed."); } From 29cbe775c46e0e96b8e1e9df012a4f66d2817e97 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Mon, 22 Sep 2025 23:43:35 -0400 Subject: [PATCH 24/25] Attempt to fix memory leak. 
--- .../com/azure/cosmos/implementation/RxDocumentClientImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index 35e92094e0ec..e1de11625f6b 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -6453,6 +6453,7 @@ public void close() { this.throughputControlStore.close(); } + this.perPartitionFailoverConfigModifier = null; logger.info("Shutting down completed."); } else { logger.warn("Already shutdown!"); From f10e3d162a66480fbd083787bb342f5350300b55 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Tue, 30 Sep 2025 21:05:48 -0400 Subject: [PATCH 25/25] Addressing review comments. --- .../DiagnosticsClientContext.java | 10 ++----- .../implementation/GlobalEndpointManager.java | 30 ++++++++++--------- .../implementation/RxDocumentClientImpl.java | 22 +++++++++----- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java index 05360d332d51..d540bf33a082 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/DiagnosticsClientContext.java @@ -255,14 +255,8 @@ public DiagnosticsClientConfig withPartitionLevelCircuitBreakerConfig(PartitionL return this; } - public DiagnosticsClientConfig withIsPerPartitionAutomaticFailoverEnabled(boolean isPpafEnabled) { - - if (isPpafEnabled) { - this.isPerPartitionAutomaticFailoverEnabledAsString = "true"; - } else { - 
this.isPerPartitionAutomaticFailoverEnabledAsString = "false"; - } - + public DiagnosticsClientConfig withIsPerPartitionAutomaticFailoverEnabled(Boolean isPpafEnabled) { + this.isPerPartitionAutomaticFailoverEnabledAsString = (isPpafEnabled != null && isPpafEnabled) ? "true" : "false"; return this; } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java index 97868dfb2d98..dd464f48a15e 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/GlobalEndpointManager.java @@ -24,6 +24,7 @@ import java.util.Objects; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.Consumer; import java.util.function.Function; /** @@ -55,7 +56,7 @@ public class GlobalEndpointManager implements AutoCloseable { private volatile Throwable latestDatabaseRefreshError; - private volatile Function perPartitionAutomaticFailoverConfigModifier; + private volatile Consumer perPartitionAutomaticFailoverConfigModifier; public void setLatestDatabaseRefreshError(Throwable latestDatabaseRefreshError) { this.latestDatabaseRefreshError = latestDatabaseRefreshError; @@ -370,13 +371,19 @@ private Mono getDatabaseAccountAsync(URI serviceEndpoint) { databaseAccount.getThinClientReadableLocations(); this.hasThinClientReadLocations.set(thinClientReadLocations != null && !thinClientReadLocations.isEmpty()); - if (this.hasPerPartitionAutomaticFailoverConfigChanged(databaseAccount)) { - Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); - this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.set(Boolean.TRUE.equals(currentPerPartitionAutomaticFailoverEnabledFromService)); - 
- if (this.perPartitionAutomaticFailoverConfigModifier != null) { - logger.warn("Per partition automatic failover enabled: {}, applying modifier", currentPerPartitionAutomaticFailoverEnabledFromService); - this.perPartitionAutomaticFailoverConfigModifier.apply(databaseAccount); + Boolean currentPerPartitionAutomaticFailoverEnabledFromService = + databaseAccount.isPerPartitionFailoverBehaviorEnabled(); + + if (currentPerPartitionAutomaticFailoverEnabledFromService != null) { + boolean newVal = currentPerPartitionAutomaticFailoverEnabledFromService; + // Attempt to flip only if the value actually changes. + if (this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient + .compareAndSet(!newVal, newVal)) { + if (this.perPartitionAutomaticFailoverConfigModifier != null) { + logger.info("ATTN: Per partition automatic failover enabled: {}, applying modifier", + currentPerPartitionAutomaticFailoverEnabledFromService); + this.perPartitionAutomaticFailoverConfigModifier.accept(databaseAccount); + } } } @@ -427,12 +434,7 @@ private List getEffectivePreferredRegions() { } } - public void setPerPartitionAutomaticFailoverConfigModifier(Function perPartitionAutomaticFailoverConfigModifier) { + public void setPerPartitionAutomaticFailoverConfigModifier(Consumer perPartitionAutomaticFailoverConfigModifier) { this.perPartitionAutomaticFailoverConfigModifier = perPartitionAutomaticFailoverConfigModifier; } - - private boolean hasPerPartitionAutomaticFailoverConfigChanged(DatabaseAccount databaseAccount) { - Boolean currentPerPartitionAutomaticFailoverEnabledFromService = databaseAccount.isPerPartitionFailoverBehaviorEnabled(); - return currentPerPartitionAutomaticFailoverEnabledFromService != null && !Objects.equals(currentPerPartitionAutomaticFailoverEnabledFromService, this.lastRecordedPerPartitionAutomaticFailoverEnabledOnClient.get()); - } } diff --git a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java 
b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java index e1de11625f6b..cbcf3a4943bd 100644 --- a/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java +++ b/sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/RxDocumentClientImpl.java @@ -279,7 +279,7 @@ public class RxDocumentClientImpl implements AsyncDocumentClient, IAuthorization private List operationPolicies; private final AtomicReference cachedCosmosAsyncClientSnapshot; private CosmosEndToEndOperationLatencyPolicyConfig ppafEnforcedE2ELatencyPolicyConfigForReads; - private Function perPartitionFailoverConfigModifier; + private Consumer perPartitionFailoverConfigModifier; public RxDocumentClientImpl(URI serviceEndpoint, String masterKeyOrResourceToken, @@ -798,7 +798,6 @@ public void init(CosmosClientMetadataCachesSnapshot metadataCachesSnapshot, Func = (databaseAccount -> { this.initializePerPartitionFailover(databaseAccount); this.addUserAgentSuffix(this.userAgentContainer, EnumSet.allOf(UserAgentFeatureFlags.class)); - return null; }); this.globalEndpointManager.setPerPartitionAutomaticFailoverConfigModifier(this.perPartitionFailoverConfigModifier); @@ -1490,7 +1489,14 @@ private void addUserAgentSuffix(UserAgentContainer userAgentContainer, Set