
Commit ca36dd8

use freshness as max tiering duration and mark forced finished table as pending again
1 parent 38222c2 commit ca36dd8

File tree

12 files changed: +260 -102 lines

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/tiering/LakeTieringJobBuilder.java

Lines changed: 7 additions & 6 deletions
@@ -17,12 +17,6 @@
 
 package org.apache.fluss.flink.tiering;
 
-import org.apache.flink.api.common.eventtime.WatermarkStrategy;
-import org.apache.flink.configuration.PipelineOptions;
-import org.apache.flink.core.execution.JobClient;
-import org.apache.flink.streaming.api.datastream.DataStreamSource;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink;
 import org.apache.fluss.config.Configuration;
 import org.apache.fluss.flink.tiering.committer.CommittableMessageTypeInfo;
 import org.apache.fluss.flink.tiering.committer.TieringCommitOperatorFactory;
@@ -33,6 +27,13 @@
 import org.apache.fluss.lake.lakestorage.LakeStoragePluginSetUp;
 import org.apache.fluss.lake.writer.LakeTieringFactory;
 
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.configuration.PipelineOptions;
+import org.apache.flink.core.execution.JobClient;
+import org.apache.flink.streaming.api.datastream.DataStreamSource;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink;
+
 import static org.apache.fluss.flink.tiering.source.TieringSource.TIERING_SOURCE_TRANSFORMATION_UID;
 import static org.apache.fluss.flink.tiering.source.TieringSourceOptions.POLL_TIERING_TABLE_INTERVAL;
 import static org.apache.fluss.utils.Preconditions.checkNotNull;

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/tiering/source/TieringSource.java

Lines changed: 12 additions & 11 deletions
@@ -17,17 +17,6 @@
 
 package org.apache.fluss.flink.tiering.source;
 
-import org.apache.flink.api.connector.source.Boundedness;
-import org.apache.flink.api.connector.source.Source;
-import org.apache.flink.api.connector.source.SourceReader;
-import org.apache.flink.api.connector.source.SourceReaderContext;
-import org.apache.flink.api.connector.source.SplitEnumerator;
-import org.apache.flink.api.connector.source.SplitEnumeratorContext;
-import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
-import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue;
-import org.apache.flink.core.io.SimpleVersionedSerializer;
-import org.apache.flink.runtime.jobgraph.OperatorID;
-import org.apache.flink.streaming.api.graph.StreamGraphHasherV2;
 import org.apache.fluss.client.Connection;
 import org.apache.fluss.client.ConnectionFactory;
 import org.apache.fluss.config.Configuration;
@@ -41,6 +30,18 @@
 import org.apache.fluss.shaded.guava32.com.google.common.hash.Hasher;
 import org.apache.fluss.shaded.guava32.com.google.common.hash.Hashing;
 
+import org.apache.flink.api.connector.source.Boundedness;
+import org.apache.flink.api.connector.source.Source;
+import org.apache.flink.api.connector.source.SourceReader;
+import org.apache.flink.api.connector.source.SourceReaderContext;
+import org.apache.flink.api.connector.source.SplitEnumerator;
+import org.apache.flink.api.connector.source.SplitEnumeratorContext;
+import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
+import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue;
+import org.apache.flink.core.io.SimpleVersionedSerializer;
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.streaming.api.graph.StreamGraphHasherV2;
+
 import java.nio.charset.StandardCharsets;
 
 import static org.apache.fluss.flink.tiering.source.TieringSourceOptions.POLL_TIERING_TABLE_INTERVAL;

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/tiering/source/TieringSourceFetcherManager.java

Lines changed: 3 additions & 2 deletions
@@ -18,14 +18,15 @@
 
 package org.apache.fluss.flink.tiering.source;
 
+import org.apache.fluss.flink.adapter.SingleThreadFetcherManagerAdapter;
+import org.apache.fluss.flink.tiering.source.split.TieringSplit;
+
 import org.apache.flink.configuration.Configuration;
 import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
 import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcher;
 import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcherTask;
 import org.apache.flink.connector.base.source.reader.splitreader.SplitReader;
 import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue;
-import org.apache.fluss.flink.adapter.SingleThreadFetcherManagerAdapter;
-import org.apache.fluss.flink.tiering.source.split.TieringSplit;
 
 import java.util.Collection;
 import java.util.function.Consumer;

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/tiering/source/TieringSplitReader.java

Lines changed: 6 additions & 4 deletions
@@ -17,10 +17,6 @@
 
 package org.apache.fluss.flink.tiering.source;
 
-import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
-import org.apache.flink.connector.base.source.reader.splitreader.SplitReader;
-import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition;
-import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange;
 import org.apache.fluss.annotation.VisibleForTesting;
 import org.apache.fluss.client.Connection;
 import org.apache.fluss.client.table.Table;
@@ -38,10 +34,16 @@
 import org.apache.fluss.metadata.TableInfo;
 import org.apache.fluss.metadata.TablePath;
 import org.apache.fluss.utils.CloseableIterator;
+
+import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
+import org.apache.flink.connector.base.source.reader.splitreader.SplitReader;
+import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition;
+import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
+
 import java.io.IOException;
 import java.time.Duration;
 import java.util.ArrayDeque;

fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/tiering/source/enumerator/TieringSourceEnumerator.java

Lines changed: 78 additions & 29 deletions
@@ -17,13 +17,6 @@
 
 package org.apache.fluss.flink.tiering.source.enumerator;
 
-import org.apache.flink.api.connector.source.ReaderInfo;
-import org.apache.flink.api.connector.source.SourceEvent;
-import org.apache.flink.api.connector.source.SplitEnumerator;
-import org.apache.flink.api.connector.source.SplitEnumeratorContext;
-import org.apache.flink.api.java.tuple.Tuple3;
-import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup;
-import org.apache.flink.util.FlinkRuntimeException;
 import org.apache.fluss.annotation.VisibleForTesting;
 import org.apache.fluss.client.Connection;
 import org.apache.fluss.client.ConnectionFactory;
@@ -48,10 +41,19 @@
 import org.apache.fluss.rpc.messages.PbLakeTieringTableInfo;
 import org.apache.fluss.rpc.metrics.ClientMetricGroup;
 import org.apache.fluss.utils.MapUtils;
+
+import org.apache.flink.api.connector.source.ReaderInfo;
+import org.apache.flink.api.connector.source.SourceEvent;
+import org.apache.flink.api.connector.source.SplitEnumerator;
+import org.apache.flink.api.connector.source.SplitEnumeratorContext;
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup;
+import org.apache.flink.util.FlinkRuntimeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
+
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -102,7 +104,7 @@ public class TieringSourceEnumerator
 
     private final Map<Long, Long> tieringTableEpochs;
     private final Map<Long, Long> failedTableEpochs;
-    private final Map<Long, Long> finishedTableEpochs;
+    private final Map<Long, TieringFinishState> finishedTables;
     private final Set<Long> tieringReachMaxDurationsTables;
 
     // lazily instantiated
@@ -131,7 +133,7 @@ public TieringSourceEnumerator(
         this.pendingSplits = Collections.synchronizedList(new ArrayList<>());
         this.readersAwaitingSplit = Collections.synchronizedSet(new TreeSet<>());
         this.tieringTableEpochs = MapUtils.newConcurrentHashMap();
-        this.finishedTableEpochs = MapUtils.newConcurrentHashMap();
+        this.finishedTables = MapUtils.newConcurrentHashMap();
        this.failedTableEpochs = MapUtils.newConcurrentHashMap();
         this.tieringReachMaxDurationsTables = Collections.synchronizedSet(new TreeSet<>());
     }
@@ -179,8 +181,24 @@ public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname
         }
         LOG.info("TieringSourceReader {} requests split.", subtaskId);
         readersAwaitingSplit.add(subtaskId);
-        this.context.callAsync(
-                this::requestTieringTableSplitsViaHeartBeat, this::generateAndAssignSplits);
+
+        // If pending splits exist, assign them directly to the requesting reader
+        if (!pendingSplits.isEmpty()) {
+            assignSplits();
+        } else {
+            // Note: Ideally, only one table should be tiering at a time.
+            // Here we block to request a tiering table synchronously to avoid multiple threads
+            // requesting tiering tables concurrently, which would cause the enumerator to contain
+            // multiple tiering tables simultaneously. This is not optimal for tiering performance.
+            Tuple3<Long, Long, TablePath> tieringTable = null;
+            Throwable throwable = null;
+            try {
+                tieringTable = this.requestTieringTableSplitsViaHeartBeat();
+            } catch (Throwable t) {
+                throwable = t;
+            }
+            this.generateAndAssignSplits(tieringTable, throwable);
+        }
     }
 
     @Override
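The inline note above captures the design choice: with callAsync, several idle readers could each trigger a heartbeat and leave the enumerator tiering multiple tables at once, so the request is now issued synchronously on the coordinator thread. A toy illustration of the race being avoided, with a plain thread pool standing in for Flink's async callback machinery (all names here are invented for the sketch, not Fluss API):

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// With async dispatch, each pending split request runs its own heartbeat and
// the enumerator can end up with two tiering tables; serializing the request
// on the coordinator thread keeps it to one table at a time.
public class SplitRequestRaceSketch {
    public static void main(String[] args) throws InterruptedException {
        List<String> tieringTables = new CopyOnWriteArrayList<>();
        ExecutorService asyncPool = Executors.newFixedThreadPool(2);
        for (int reader = 0; reader < 2; reader++) {
            int requestNo = reader;
            // callAsync-style path: both requests race and both fetch a table
            asyncPool.submit(() -> tieringTables.add("table-" + requestNo));
        }
        asyncPool.shutdown();
        asyncPool.awaitTermination(1, TimeUnit.SECONDS);
        // prints two tables tiering concurrently, the situation the
        // synchronous request in handleSplitRequest avoids
        System.out.println("tiering tables: " + tieringTables);
    }
}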
@@ -252,7 +270,9 @@ public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) {
                         "The finished table {} is not in tiering table, won't report it to Fluss to mark as finished.",
                         finishedTableId);
             } else {
-                finishedTableEpochs.put(finishedTableId, tieringEpoch);
+                boolean isForceComplete = tieringReachMaxDurationsTables.remove(finishedTableId);
+                finishedTables.put(
+                        finishedTableId, TieringFinishState.from(tieringEpoch, isForceComplete));
             }
         }
 
@@ -274,7 +294,7 @@ public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) {
             }
         }
 
-        if (!finishedTableEpochs.isEmpty() || !failedTableEpochs.isEmpty()) {
+        if (!finishedTables.isEmpty() || !failedTableEpochs.isEmpty()) {
             // call one round of heartbeat to notify table has been finished or failed
             this.context.callAsync(
                     this::requestTieringTableSplitsViaHeartBeat, this::generateAndAssignSplits);
@@ -288,6 +308,7 @@ private void handleSourceReaderFailOver() {
         // we need to make all as failed
         failedTableEpochs.putAll(new HashMap<>(tieringTableEpochs));
         tieringTableEpochs.clear();
+        tieringReachMaxDurationsTables.clear();
         // also clean all pending splits since we mark all as failed
         pendingSplits.clear();
         if (!failedTableEpochs.isEmpty()) {
@@ -298,22 +319,18 @@
     }
 
     @VisibleForTesting
-    protected void handleTableTieringReachMaxDuration(long tableId, long tieringEpoch) {
+    protected void handleTableTieringReachMaxDuration(
+            TablePath tablePath, long tableId, long tieringEpoch) {
         Long currentEpoch = tieringTableEpochs.get(tableId);
         if (currentEpoch != null && currentEpoch.equals(tieringEpoch)) {
-            LOG.info("Table {} reached max duration. Force completing.", tableId);
+            LOG.info("Table {}-{} reached max duration. Force completing.", tablePath, tableId);
             tieringReachMaxDurationsTables.add(tableId);
 
             for (TieringSplit tieringSplit : pendingSplits) {
                 if (tieringSplit.getTableBucket().getTableId() == tableId) {
                     // mark this tiering split to skip the current round since the tiering for
                     // this table has timed out, so the tiering source reader can skip them directly
                     tieringSplit.skipCurrentRound();
-                } else {
-                    // we can break directly, if found any one split's table id is not equal to the
-                    // timeout
-                    // table, the following split must be not equal to the table id
-                    break;
                 }
             }
 
@@ -362,13 +379,13 @@ private void assignSplits() {
         if (closed) {
             return null;
         }
-        Map<Long, Long> currentFinishedTableEpochs = new HashMap<>(this.finishedTableEpochs);
+        Map<Long, TieringFinishState> currentFinishedTables = new HashMap<>(this.finishedTables);
         Map<Long, Long> currentFailedTableEpochs = new HashMap<>(this.failedTableEpochs);
         LakeTieringHeartbeatRequest tieringHeartbeatRequest =
                 tieringTableHeartBeat(
                         basicHeartBeat(),
                         this.tieringTableEpochs,
-                        currentFinishedTableEpochs,
+                        currentFinishedTables,
                         currentFailedTableEpochs,
                         this.flussCoordinatorEpoch);
 
@@ -397,9 +414,9 @@
             waitHeartbeatResponse(coordinatorGateway.lakeTieringHeartbeat(tieringHeartbeatRequest));
         }
 
-        // if come to here, we can remove currentFinishedTableEpochs/failedTableEpochs to avoid send
+        // if come to here, we can remove currentFinishedTables/failedTableEpochs to avoid send
         // in next round
-        currentFinishedTableEpochs.forEach(finishedTableEpochs::remove);
+        currentFinishedTables.forEach(finishedTables::remove);
         currentFailedTableEpochs.forEach(failedTableEpochs::remove);
         return lakeTieringInfo;
     }
@@ -428,7 +445,7 @@ private void generateTieringSplits(Tuple3<Long, Long, TablePath> tieringTable)
             LOG.info(
                     "Generate Tiering splits for table {} is empty, no need to tier data.",
                     tieringTable.f2.getTableName());
-            finishedTableEpochs.put(tieringTable.f0, tieringTable.f1);
+            finishedTables.put(tieringTable.f0, TieringFinishState.from(tieringTable.f1));
         } else {
             tieringTableEpochs.put(tieringTable.f0, tieringTable.f1);
             pendingSplits.addAll(tieringSplits);
@@ -438,7 +455,9 @@
             context.runInCoordinatorThread(
                     () ->
                             handleTableTieringReachMaxDuration(
-                                    tieringTable.f0, tieringTable.f1)),
+                                    tablePath,
+                                    tieringTable.f0,
+                                    tieringTable.f1)),
 
             // for simplicity, we use the freshness as
             tableInfo.getTableConfig().getDataLakeFreshness().toMillis(),
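This hunk is where the commit title lands: the table's data-lake freshness interval doubles as the max tiering duration for a round. A minimal stand-alone sketch of the same timer pattern, using a ScheduledExecutorService in place of Flink's SplitEnumeratorContext callback registration (all names below are local to the sketch):

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Schedule a force-complete callback one freshness interval after tiering
// for a table starts, mirroring the registration above.
public class MaxTieringDurationSketch {
    public static void main(String[] args) throws InterruptedException {
        ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();
        long tableId = 42L;
        long tieringEpoch = 7L;
        // stand-in for tableInfo.getTableConfig().getDataLakeFreshness().toMillis()
        long freshnessMillis = 100L;

        timer.schedule(
                () ->
                        System.out.printf(
                                "table %d (epoch %d) reached max tiering duration, force completing%n",
                                tableId, tieringEpoch),
                freshnessMillis,
                TimeUnit.MILLISECONDS);

        // delayed tasks still run after shutdown by default; wait for them
        timer.shutdown();
        timer.awaitTermination(1, TimeUnit.SECONDS);
    }
}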
@@ -537,16 +556,28 @@ static LakeTieringHeartbeatRequest heartBeatWithRequestNewTieringTable(
     static LakeTieringHeartbeatRequest tieringTableHeartBeat(
             LakeTieringHeartbeatRequest heartbeatRequest,
             Map<Long, Long> tieringTableEpochs,
-            Map<Long, Long> finishedTableEpochs,
+            Map<Long, TieringFinishState> finishedTables,
             Map<Long, Long> failedTableEpochs,
             int coordinatorEpoch) {
         if (!tieringTableEpochs.isEmpty()) {
             heartbeatRequest.addAllTieringTables(
                     toPbHeartbeatReqForTable(tieringTableEpochs, coordinatorEpoch));
         }
-        if (!finishedTableEpochs.isEmpty()) {
+        if (!finishedTables.isEmpty()) {
+            Map<Long, Long> finishTieringEpochs = new HashMap<>();
+            Set<Long> forceFinishedTables = new HashSet<>();
+            finishedTables.forEach(
+                    (tableId, tieringFinishState) -> {
+                        finishTieringEpochs.put(tableId, tieringFinishState.tieringEpoch);
+                        if (tieringFinishState.isForceToFinish) {
+                            forceFinishedTables.add(tableId);
+                        }
+                    });
             heartbeatRequest.addAllFinishedTables(
-                    toPbHeartbeatReqForTable(finishedTableEpochs, coordinatorEpoch));
+                    toPbHeartbeatReqForTable(finishTieringEpochs, coordinatorEpoch));
+            for (long forceFinishedTableId : forceFinishedTables) {
+                heartbeatRequest.addForceFinishedTable(forceFinishedTableId);
+            }
         }
         // add failed tiering table to heart beat request
         return failedTableHeartBeat(heartbeatRequest, failedTableEpochs, coordinatorEpoch);
@@ -590,4 +621,22 @@
             }
         }
     }
+
+    private static class TieringFinishState {
+        long tieringEpoch;
+        boolean isForceToFinish;
+
+        public static TieringFinishState from(long tieringEpoch) {
+            return new TieringFinishState(tieringEpoch, false);
+        }
+
+        public static TieringFinishState from(long tieringEpoch, boolean isForceToFinish) {
+            return new TieringFinishState(tieringEpoch, isForceToFinish);
+        }
+
+        private TieringFinishState(long tieringEpoch, boolean isForceToFinish) {
+            this.tieringEpoch = tieringEpoch;
+            this.isForceToFinish = isForceToFinish;
+        }
+    }
 }
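TieringFinishState is what lets the coordinator tell a genuinely finished table from one that was force-completed on timeout: the force-finished subset travels in a separate heartbeat field so the coordinator can mark those tables as pending again, per the commit title. A self-contained sketch of that flattening step (the class and variable names below are invented for illustration; only addForceFinishedTable above is the real request field):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Flatten the finished-table map into the two collections the heartbeat
// carries: an epoch per finished table, plus the subset that was forced to
// finish (these are the tables the coordinator re-marks as pending).
public class FinishedTablesSketch {
    static final class FinishState {
        final long tieringEpoch;
        final boolean forcedByMaxDuration;

        FinishState(long tieringEpoch, boolean forcedByMaxDuration) {
            this.tieringEpoch = tieringEpoch;
            this.forcedByMaxDuration = forcedByMaxDuration;
        }
    }

    public static void main(String[] args) {
        Map<Long, FinishState> finishedTables = new HashMap<>();
        finishedTables.put(1L, new FinishState(10L, false)); // finished normally
        finishedTables.put(2L, new FinishState(11L, true));  // hit max duration

        Map<Long, Long> finishedEpochs = new HashMap<>();
        Set<Long> forceFinished = new HashSet<>();
        finishedTables.forEach(
                (tableId, state) -> {
                    finishedEpochs.put(tableId, state.tieringEpoch);
                    if (state.forcedByMaxDuration) {
                        forceFinished.add(tableId);
                    }
                });

        System.out.println("finished epochs: " + finishedEpochs); // {1=10, 2=11}
        System.out.println("pending again:   " + forceFinished);  // [2]
    }
}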

fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/tiering/FlinkTieringTestBase.java

Lines changed: 0 additions & 7 deletions
@@ -170,13 +170,6 @@ protected void writeRows(TablePath tablePath, List<InternalRow> rows, boolean ap
         }
     }
 
-    protected void waitUntilSnapshot(long tableId, int bucketNum, long snapshotId) {
-        for (int i = 0; i < bucketNum; i++) {
-            TableBucket tableBucket = new TableBucket(tableId, i);
-            FLUSS_CLUSTER_EXTENSION.waitUntilSnapshotFinished(tableBucket, snapshotId);
-        }
-    }
-
     public List<InternalRow> getValuesRecords(TablePath tablePath) {
         return TestingValuesLake.getResults(tablePath.toString());
     }
