Skip to content

Commit 2b2e96f

Browse files
authored
chore(ci): Improve reliability of retries in TracingE2ET (#2018)
* Expand timewindow for X-RAY trace summary search during E2E tests. * Add retries for trace subsegments as well. * Add log.debug for subsegement population. * Add retries for custom subsegment population. * Add longer retries for TracingE2ET and capability to pass custom retry config to RetryUtils. * Add 1 minute padding around MetricsFetcher. * Revert 1 minute time padding for MetricsFetcher. * Add retry loop around datapoint fetching for first metrics datapoints.
1 parent ee995f7 commit 2b2e96f

File tree

4 files changed

+103
-19
lines changed

4 files changed

+103
-19
lines changed

powertools-e2e-tests/pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,16 @@
150150
<version>2.4</version>
151151
<scope>test</scope>
152152
</dependency>
153+
<dependency>
154+
<groupId>com.fasterxml.jackson.core</groupId>
155+
<artifactId>jackson-databind</artifactId>
156+
<scope>test</scope>
157+
</dependency>
158+
<dependency>
159+
<groupId>com.fasterxml.jackson.datatype</groupId>
160+
<artifactId>jackson-datatype-jsr310</artifactId>
161+
<scope>test</scope>
162+
</dependency>
153163
<dependency>
154164
<groupId>org.aspectj</groupId>
155165
<artifactId>aspectjrt</artifactId>

powertools-e2e-tests/src/test/java/software/amazon/lambda/powertools/MetricsE2ET.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
import org.junit.jupiter.api.Test;
3535
import org.junit.jupiter.api.Timeout;
3636

37+
import software.amazon.lambda.powertools.testutils.DataNotReadyException;
3738
import software.amazon.lambda.powertools.testutils.Infrastructure;
39+
import software.amazon.lambda.powertools.testutils.RetryUtils;
3840
import software.amazon.lambda.powertools.testutils.lambda.InvocationResult;
3941
import software.amazon.lambda.powertools.testutils.metrics.MetricsFetcher;
4042

@@ -89,16 +91,20 @@ void test_recordMetrics() {
8991
{ "FunctionName", functionName },
9092
{ "Service", SERVICE } }).collect(Collectors.toMap(data -> data[0], data -> data[1])));
9193
assertThat(coldStart.get(0)).isEqualTo(1);
92-
List<Double> orderMetrics = metricsFetcher.fetchMetrics(invocationResult.getStart(), invocationResult.getEnd(),
93-
60, NAMESPACE,
94-
"orders", Collections.singletonMap("Environment", "test"));
94+
List<Double> orderMetrics = RetryUtils.withRetry(() -> {
95+
List<Double> metrics = metricsFetcher.fetchMetrics(invocationResult.getStart(), invocationResult.getEnd(),
96+
60, NAMESPACE, "orders", Collections.singletonMap("Environment", "test"));
97+
if (metrics.get(0) != 2.0) {
98+
throw new DataNotReadyException("Expected 2.0 orders but got " + metrics.get(0));
99+
}
100+
return metrics;
101+
}, "orderMetricsRetry", DataNotReadyException.class).get();
95102
assertThat(orderMetrics.get(0)).isEqualTo(2);
96103
List<Double> productMetrics = metricsFetcher.fetchMetrics(invocationResult.getStart(),
97104
invocationResult.getEnd(), 60, NAMESPACE,
98105
"products", Collections.singletonMap("Environment", "test"));
99106

100107
// When searching across a 1 minute time period with a period of 60 we find both metrics and the sum is 12
101-
102108
assertThat(productMetrics.get(0)).isEqualTo(12);
103109

104110
orderMetrics = metricsFetcher.fetchMetrics(invocationResult.getStart(), invocationResult.getEnd(), 60,

powertools-e2e-tests/src/test/java/software/amazon/lambda/powertools/testutils/RetryUtils.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,21 @@ private RetryUtils() {
4747
*/
4848
@SafeVarargs
4949
public static Retry createRetry(String name, Class<? extends Throwable>... retryOnThrowables) {
50-
RetryConfig config = RetryConfig.from(DEFAULT_RETRY_CONFIG)
50+
return createRetry(name, DEFAULT_RETRY_CONFIG, retryOnThrowables);
51+
}
52+
53+
/**
54+
* Creates a retry instance with custom configuration for the specified throwable types.
55+
*
56+
* @param name the name for the retry instance
57+
* @param customConfig the custom retry configuration
58+
* @param retryOnThrowables the throwable classes to retry on
59+
* @return configured Retry instance
60+
*/
61+
@SafeVarargs
62+
public static Retry createRetry(String name, RetryConfig customConfig,
63+
Class<? extends Throwable>... retryOnThrowables) {
64+
RetryConfig config = RetryConfig.from(customConfig)
5165
.retryExceptions(retryOnThrowables)
5266
.build();
5367

@@ -72,4 +86,20 @@ public static <T> Supplier<T> withRetry(Supplier<T> supplier, String name,
7286
Retry retry = createRetry(name, retryOnThrowables);
7387
return Retry.decorateSupplier(retry, supplier);
7488
}
89+
90+
/**
91+
* Decorates a supplier with custom retry logic for the specified throwable types.
92+
*
93+
* @param supplier the supplier to decorate
94+
* @param name the name for the retry instance
95+
* @param customConfig the custom retry configuration
96+
* @param retryOnThrowables the throwable classes to retry on
97+
* @return decorated supplier with retry logic
98+
*/
99+
@SafeVarargs
100+
public static <T> Supplier<T> withRetry(Supplier<T> supplier, String name, RetryConfig customConfig,
101+
Class<? extends Throwable>... retryOnThrowables) {
102+
Retry retry = createRetry(name, customConfig, retryOnThrowables);
103+
return Retry.decorateSupplier(retry, supplier);
104+
}
75105
}

powertools-e2e-tests/src/test/java/software/amazon/lambda/powertools/testutils/tracing/TraceFetcher.java

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
package software.amazon.lambda.powertools.testutils.tracing;
1616

17+
import java.time.Duration;
1718
import java.time.Instant;
1819
import java.time.temporal.ChronoUnit;
1920
import java.util.Arrays;
@@ -27,8 +28,11 @@
2728

2829
import com.fasterxml.jackson.core.JsonProcessingException;
2930
import com.fasterxml.jackson.databind.DeserializationFeature;
31+
import com.fasterxml.jackson.databind.MapperFeature;
3032
import com.fasterxml.jackson.databind.ObjectMapper;
33+
import com.fasterxml.jackson.databind.json.JsonMapper;
3134

35+
import io.github.resilience4j.retry.RetryConfig;
3236
import software.amazon.awssdk.http.SdkHttpClient;
3337
import software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient;
3438
import software.amazon.awssdk.regions.Region;
@@ -46,9 +50,10 @@
4650
* Class in charge of retrieving the actual traces of a Lambda execution on X-Ray
4751
*/
4852
public class TraceFetcher {
49-
50-
private static final ObjectMapper MAPPER = new ObjectMapper()
51-
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
53+
private static final ObjectMapper MAPPER = JsonMapper.builder()
54+
.disable(MapperFeature.REQUIRE_HANDLERS_FOR_JAVA8_TIMES)
55+
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
56+
.build();
5257
private static final Logger LOG = LoggerFactory.getLogger(TraceFetcher.class);
5358
private static final SdkHttpClient httpClient = UrlConnectionHttpClient.builder().build();
5459
private static final Region region = Region.of(System.getProperty("AWS_DEFAULT_REGION", "eu-west-1"));
@@ -90,7 +95,12 @@ public Trace fetchTrace() {
9095
return getTrace(traceIds);
9196
};
9297

93-
return RetryUtils.withRetry(supplier, "trace-fetcher", TraceNotFoundException.class).get();
98+
RetryConfig customConfig = RetryConfig.custom()
99+
.maxAttempts(120) // 120 attempts over 10 minutes
100+
.waitDuration(Duration.ofSeconds(5)) // 5 seconds between attempts
101+
.build();
102+
103+
return RetryUtils.withRetry(supplier, "trace-fetcher", customConfig, TraceNotFoundException.class).get();
94104
}
95105

96106
/**
@@ -104,26 +114,41 @@ private Trace getTrace(List<String> traceIds) {
104114
.traceIds(traceIds)
105115
.build());
106116
if (!tracesResponse.hasTraces()) {
107-
throw new TraceNotFoundException("No trace found");
117+
throw new TraceNotFoundException(String.format("No trace found for traceIds %s", traceIds));
108118
}
119+
109120
Trace traceRes = new Trace();
110121
tracesResponse.traces().forEach(trace -> {
111122
if (trace.hasSegments()) {
112123
trace.segments().forEach(segment -> {
113124
try {
114125
SegmentDocument document = MAPPER.readValue(segment.document(), SegmentDocument.class);
115126
if ("AWS::Lambda::Function".equals(document.getOrigin()) && document.hasSubsegments()) {
116-
getNestedSubSegments(document.getSubsegments(), traceRes,
117-
Collections.emptyList());
127+
LOG.debug("Populating subsegments for document {}", MAPPER.writeValueAsString(document));
128+
getNestedSubSegments(document.getSubsegments(), traceRes, Collections.emptyList());
129+
// If only the default (excluded) subsegments were populated we need to keep retrying for
130+
// our custom subsegments. They might appear later.
131+
if (traceRes.getSubsegments().isEmpty()) {
132+
throw new TraceNotFoundException(
133+
"Found AWS::Lambda::Function SegmentDocument with no non-excluded subsegments.");
134+
}
135+
} else if ("AWS::Lambda::Function".equals(document.getOrigin())) {
136+
LOG.debug(
137+
"Found AWS::Lambda::Function SegmentDocument with no subsegments. Retrying {}",
138+
MAPPER.writeValueAsString(document));
139+
throw new TraceNotFoundException(
140+
"Found AWS::Lambda::Function SegmentDocument with no subsegments.");
118141
}
119-
120142
} catch (JsonProcessingException e) {
121143
LOG.error("Failed to parse segment document: " + e.getMessage());
122144
throw new RuntimeException(e);
123145
}
124146
});
147+
} else {
148+
throw new TraceNotFoundException(String.format("No segments found in trace %s", trace.id()));
125149
}
126150
});
151+
127152
return traceRes;
128153
}
129154

@@ -149,21 +174,30 @@ private void getNestedSubSegments(List<SubSegment> subsegments, Trace traceRes,
149174
* @return a list of trace ids
150175
*/
151176
private List<String> getTraceIds() {
177+
LOG.debug("Searching for traces from {} to {} with filter: {}", start, end, filterExpression);
152178
GetTraceSummariesResponse traceSummaries = xray.getTraceSummaries(GetTraceSummariesRequest.builder()
153179
.startTime(start)
154180
.endTime(end)
155-
.timeRangeType(TimeRangeType.EVENT)
181+
.timeRangeType(TimeRangeType.TRACE_ID)
156182
.sampling(false)
157183
.filterExpression(filterExpression)
158184
.build());
185+
186+
LOG.debug("Found {} trace summaries",
187+
traceSummaries.hasTraceSummaries() ? traceSummaries.traceSummaries().size() : 0);
188+
159189
if (!traceSummaries.hasTraceSummaries()) {
160-
throw new TraceNotFoundException("No trace id found");
190+
throw new TraceNotFoundException(String.format("No trace id found for filter '%s' between %s and %s",
191+
filterExpression, start, end));
161192
}
162193
List<String> traceIds = traceSummaries.traceSummaries().stream().map(TraceSummary::id)
163194
.collect(Collectors.toList());
164195
if (traceIds.isEmpty()) {
165-
throw new TraceNotFoundException("No trace id found");
196+
throw new TraceNotFoundException(
197+
String.format("Empty trace summary found for filter '%s' between %s and %s",
198+
filterExpression, start, end));
166199
}
200+
LOG.debug("Found trace IDs: {}", traceIds);
167201
return traceIds;
168202
}
169203

@@ -183,9 +217,13 @@ public TraceFetcher build() {
183217
if (end == null) {
184218
end = start.plus(1, ChronoUnit.MINUTES);
185219
}
186-
LOG.debug("Looking for traces from {} to {} with filter {} and excluded segments {}", start, end,
187-
filterExpression, excludedSegments);
188-
return new TraceFetcher(start, end, filterExpression, excludedSegments);
220+
// Expand search window by 1 minute on each side to account for timing imprecisions
221+
Instant expandedStart = start.minus(1, ChronoUnit.MINUTES);
222+
Instant expandedEnd = end.plus(1, ChronoUnit.MINUTES);
223+
LOG.debug(
224+
"Looking for traces from {} to {} (expanded from {} to {}) with filter {} and excluded segments {}",
225+
expandedStart, expandedEnd, start, end, filterExpression, excludedSegments);
226+
return new TraceFetcher(expandedStart, expandedEnd, filterExpression, excludedSegments);
189227
}
190228

191229
public Builder start(Instant start) {

0 commit comments

Comments
 (0)