Skip to content

Commit 43c2ee1

Browse files
authored
[AMORO-3794] Wrap task operation with process lock (#3798)
* wrap task operation with process lock * Fix some checkstyle errors * Add a lock timeout while polling task
1 parent 0d7a006 commit 43c2ee1

File tree

5 files changed

+148
-129
lines changed

5 files changed

+148
-129
lines changed

amoro-ams/src/main/java/org/apache/amoro/server/DefaultOptimizingService.java

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import org.apache.amoro.exception.IllegalTaskStateException;
3333
import org.apache.amoro.exception.ObjectNotExistsException;
3434
import org.apache.amoro.exception.PluginRetryAuthException;
35-
import org.apache.amoro.exception.TaskNotFoundException;
3635
import org.apache.amoro.resource.ResourceGroup;
3736
import org.apache.amoro.server.catalog.CatalogManager;
3837
import org.apache.amoro.server.optimizing.OptimizingProcess;
@@ -54,7 +53,6 @@
5453
import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions;
5554
import org.apache.amoro.shade.guava32.com.google.common.collect.Sets;
5655
import org.apache.amoro.shade.guava32.com.google.common.util.concurrent.ThreadFactoryBuilder;
57-
import org.apache.amoro.shade.thrift.org.apache.thrift.TException;
5856
import org.apache.commons.lang3.StringUtils;
5957
import org.jetbrains.annotations.NotNull;
6058
import org.slf4j.Logger;
@@ -214,33 +212,21 @@ private OptimizerInstance getAuthenticatedOptimizer(String authToken) {
214212
@Override
215213
public OptimizingTask pollTask(String authToken, int threadId) {
216214
LOG.debug("Optimizer {} (threadId {}) try polling task", authToken, threadId);
215+
OptimizerThread optimizerThread = getAuthenticatedOptimizer(authToken).getThread(threadId);
217216
OptimizingQueue queue = getQueueByToken(authToken);
218-
return Optional.ofNullable(queue.pollTask(pollingTimeout, breakQuotaLimit))
219-
.map(task -> extractOptimizingTask(task, authToken, threadId, queue))
220-
.orElse(null);
221-
}
222-
223-
private OptimizingTask extractOptimizingTask(
224-
TaskRuntime<?> task, String authToken, int threadId, OptimizingQueue queue) {
225-
try {
226-
OptimizerThread optimizerThread = getAuthenticatedOptimizer(authToken).getThread(threadId);
227-
task.schedule(optimizerThread);
217+
TaskRuntime<?> task = queue.pollTask(optimizerThread, pollingTimeout, breakQuotaLimit);
218+
if (task != null) {
228219
LOG.info("OptimizerThread {} polled task {}", optimizerThread, task.getTaskId());
229220
return task.extractProtocolTask();
230-
} catch (Throwable throwable) {
231-
LOG.error("Schedule task {} failed, put it to retry queue", task.getTaskId(), throwable);
232-
queue.retryTask(task);
233-
return null;
234221
}
222+
return null;
235223
}
236224

237225
@Override
238226
public void ackTask(String authToken, int threadId, OptimizingTaskId taskId) {
239227
LOG.info("Ack task {} by optimizer {} (threadId {})", taskId, authToken, threadId);
240228
OptimizingQueue queue = getQueueByToken(authToken);
241-
Optional.ofNullable(queue.getTask(taskId))
242-
.orElseThrow(() -> new TaskNotFoundException(taskId))
243-
.ack(getAuthenticatedOptimizer(authToken).getThread(threadId));
229+
queue.ackTask(taskId, getAuthenticatedOptimizer(authToken).getThread(threadId));
244230
}
245231

246232
@Override
@@ -254,9 +240,7 @@ public void completeTask(String authToken, OptimizingTaskResult taskResult) {
254240
OptimizingQueue queue = getQueueByToken(authToken);
255241
OptimizerThread thread =
256242
getAuthenticatedOptimizer(authToken).getThread(taskResult.getThreadId());
257-
Optional.ofNullable(queue.getTask(taskResult.getTaskId()))
258-
.orElseThrow(() -> new TaskNotFoundException(taskResult.getTaskId()))
259-
.complete(thread, taskResult);
243+
queue.completeTask(thread, taskResult);
260244
}
261245

262246
@Override
@@ -284,7 +268,7 @@ public String authenticate(OptimizerRegisterInfo registerInfo) {
284268
}
285269

286270
@Override
287-
public boolean cancelProcess(long processId) throws TException {
271+
public boolean cancelProcess(long processId) {
288272
TableProcessMeta processMeta =
289273
getAs(TableProcessMapper.class, m -> m.getProcessMeta(processId));
290274
if (processMeta == null) {

amoro-ams/src/main/java/org/apache/amoro/server/optimizing/OptimizingQueue.java

Lines changed: 87 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
import org.apache.amoro.ServerTableIdentifier;
2424
import org.apache.amoro.api.BlockableOperation;
2525
import org.apache.amoro.api.OptimizingTaskId;
26+
import org.apache.amoro.api.OptimizingTaskResult;
2627
import org.apache.amoro.exception.OptimizingClosedException;
2728
import org.apache.amoro.exception.PersistenceException;
29+
import org.apache.amoro.exception.TaskNotFoundException;
2830
import org.apache.amoro.optimizing.MetricsSummary;
2931
import org.apache.amoro.optimizing.OptimizingType;
3032
import org.apache.amoro.optimizing.RewriteFilesInput;
@@ -44,6 +46,7 @@
4446
import org.apache.amoro.server.persistence.mapper.TableProcessMapper;
4547
import org.apache.amoro.server.process.TableProcessMeta;
4648
import org.apache.amoro.server.resource.OptimizerInstance;
49+
import org.apache.amoro.server.resource.OptimizerThread;
4750
import org.apache.amoro.server.resource.QuotaProvider;
4851
import org.apache.amoro.server.table.DefaultTableRuntime;
4952
import org.apache.amoro.server.table.blocker.TableBlocker;
@@ -90,7 +93,6 @@ public class OptimizingQueue extends PersistentBase {
9093

9194
private final QuotaProvider quotaProvider;
9295
private final Queue<TableOptimizingProcess> tableQueue = new LinkedTransferQueue<>();
93-
private final Queue<TaskRuntime<?>> retryTaskQueue = new LinkedTransferQueue<>();
9496
private final SchedulingPolicy scheduler;
9597
private final CatalogManager catalogManager;
9698
private final Executor planExecutor;
@@ -200,24 +202,23 @@ public void releaseTable(DefaultTableRuntime tableRuntime) {
200202

201203
private void clearProcess(OptimizingProcess optimizingProcess) {
202204
tableQueue.removeIf(process -> process.getProcessId() == optimizingProcess.getProcessId());
203-
retryTaskQueue.removeIf(
204-
taskRuntime -> taskRuntime.getTaskId().getProcessId() == optimizingProcess.getProcessId());
205205
}
206206

207-
public TaskRuntime<?> pollTask(long maxWaitTime, boolean breakQuotaLimit) {
207+
public TaskRuntime<?> pollTask(
208+
OptimizerThread thread, long maxWaitTime, boolean breakQuotaLimit) {
208209
long deadline = calculateDeadline(maxWaitTime);
209-
TaskRuntime<?> task = fetchTask();
210+
TaskRuntime<?> task = fetchScheduledTask(thread, true);
210211
while (task == null && waitTask(deadline)) {
211-
task = fetchTask();
212+
task = fetchScheduledTask(thread, true);
212213
}
213214
if (task == null && breakQuotaLimit && planningTables.isEmpty()) {
214-
task = fetchScheduledTask(false);
215+
task = fetchScheduledTask(thread, false);
215216
}
216217
return task;
217218
}
218219

219-
public TaskRuntime<?> pollTask(long maxWaitTime) {
220-
return pollTask(maxWaitTime, false);
220+
public TaskRuntime<?> pollTask(OptimizerThread thread, long maxWaitTime) {
221+
return pollTask(thread, maxWaitTime, true);
221222
}
222223

223224
private long calculateDeadline(long maxWaitTime) {
@@ -240,14 +241,9 @@ private boolean waitTask(long waitDeadline) {
240241
}
241242
}
242243

243-
private TaskRuntime<?> fetchTask() {
244-
TaskRuntime<?> task = retryTaskQueue.poll();
245-
return task != null ? task : fetchScheduledTask(true);
246-
}
247-
248-
private TaskRuntime<?> fetchScheduledTask(boolean needQuotaChecking) {
244+
private TaskRuntime<?> fetchScheduledTask(OptimizerThread thread, boolean needQuotaChecking) {
249245
return tableQueue.stream()
250-
.map(process -> process.poll(needQuotaChecking))
246+
.map(process -> process.poll(thread, needQuotaChecking))
251247
.filter(Objects::nonNull)
252248
.findFirst()
253249
.orElse(null);
@@ -352,12 +348,12 @@ private TableOptimizingProcess planInternal(DefaultTableRuntime tableRuntime) {
352348
}
353349
}
354350

355-
public TaskRuntime<?> getTask(OptimizingTaskId taskId) {
356-
return tableQueue.stream()
357-
.filter(p -> p.getProcessId() == taskId.getProcessId())
358-
.findFirst()
359-
.map(p -> p.getTaskMap().get(taskId))
360-
.orElse(null);
351+
public void ackTask(OptimizingTaskId taskId, OptimizerThread thread) {
352+
findProcess(taskId).ackTask(taskId, thread);
353+
}
354+
355+
public void completeTask(OptimizerThread thread, OptimizingTaskResult result) {
356+
findProcess(result.getTaskId()).completeTask(thread, result);
361357
}
362358

363359
public List<TaskRuntime<?>> collectTasks() {
@@ -374,8 +370,7 @@ public List<TaskRuntime<?>> collectTasks(Predicate<TaskRuntime<?>> predicate) {
374370
}
375371

376372
public void retryTask(TaskRuntime<?> taskRuntime) {
377-
taskRuntime.reset();
378-
retryTaskQueue.offer(taskRuntime);
373+
findProcess(taskRuntime.getTaskId()).resetTask((TaskRuntime<RewriteStageTask>) taskRuntime);
379374
}
380375

381376
public ResourceGroup getOptimizerGroup() {
@@ -402,6 +397,13 @@ public void dispose() {
402397
this.metrics.unregister();
403398
}
404399

400+
private TableOptimizingProcess findProcess(OptimizingTaskId taskId) {
401+
return tableQueue.stream()
402+
.filter(p -> p.getProcessId() == taskId.getProcessId())
403+
.findFirst()
404+
.orElseThrow(() -> new TaskNotFoundException(taskId));
405+
}
406+
405407
private double getAvailableCore() {
406408
// the available core should be at least 1
407409
return Math.max(quotaProvider.getTotalQuota(optimizerGroup.getName()), 1);
@@ -437,26 +439,32 @@ private class TableOptimizingProcess implements OptimizingProcess {
437439
private Map<String, Long> toSequence = Maps.newHashMap();
438440
private boolean hasCommitted = false;
439441

440-
public TaskRuntime<?> poll(boolean needQuotaChecking) {
441-
if (lock.tryLock()) {
442-
try {
443-
TaskRuntime<?> task = null;
444-
if (status != ProcessStatus.KILLED && status != ProcessStatus.FAILED) {
445-
int actualQuota = getActualQuota();
446-
int quotaLimit = getQuotaLimit();
447-
if (!needQuotaChecking || actualQuota < quotaLimit) {
448-
task = taskQueue.poll();
442+
public TaskRuntime<?> poll(OptimizerThread thread, boolean needQuotaChecking) {
443+
try {
444+
// Wait 10ms here for some light operation like poll/ack
445+
if (lock.tryLock(10, TimeUnit.MILLISECONDS)) {
446+
try {
447+
TaskRuntime<?> task = null;
448+
if (status != ProcessStatus.KILLED && status != ProcessStatus.FAILED) {
449+
int actualQuota = getActualQuota();
450+
int quotaLimit = getQuotaLimit();
451+
if (!needQuotaChecking || actualQuota < quotaLimit) {
452+
task = taskQueue.poll();
453+
}
449454
}
455+
if (task != null) {
456+
optimizingTasksMap
457+
.computeIfAbsent(tableRuntime.getTableIdentifier(), k -> new AtomicInteger(0))
458+
.incrementAndGet();
459+
task.schedule(thread);
460+
}
461+
return task;
462+
} finally {
463+
lock.unlock();
450464
}
451-
if (task != null) {
452-
optimizingTasksMap
453-
.computeIfAbsent(tableRuntime.getTableIdentifier(), k -> new AtomicInteger(0))
454-
.incrementAndGet();
455-
}
456-
return task;
457-
} finally {
458-
lock.unlock();
459465
}
466+
} catch (InterruptedException e) {
467+
// ignore it.
460468
}
461469
return null;
462470
}
@@ -543,6 +551,34 @@ public void close(boolean needCommit) {
543551
}
544552
}
545553

554+
private void ackTask(OptimizingTaskId taskId, OptimizerThread thread) {
555+
TaskRuntime<?> taskRuntime = getTaskRuntime(taskId);
556+
lock.lock();
557+
try {
558+
taskRuntime.ack(thread);
559+
} finally {
560+
lock.unlock();
561+
}
562+
}
563+
564+
private void completeTask(OptimizerThread thread, OptimizingTaskResult result) {
565+
TaskRuntime<?> taskRuntime = getTaskRuntime(result.getTaskId());
566+
lock.lock();
567+
try {
568+
taskRuntime.complete(thread, result);
569+
} finally {
570+
lock.unlock();
571+
}
572+
}
573+
574+
private TaskRuntime<?> getTaskRuntime(OptimizingTaskId taskId) {
575+
TaskRuntime<?> taskRuntime = getTaskMap().get(taskId);
576+
if (taskRuntime == null) {
577+
throw new TaskNotFoundException(taskId);
578+
}
579+
return taskRuntime;
580+
}
581+
546582
private void acceptResult(TaskRuntime<?> taskRuntime) {
547583
lock.lock();
548584
try {
@@ -612,6 +648,16 @@ private void acceptResult(TaskRuntime<?> taskRuntime) {
612648
}
613649
}
614650

651+
private void resetTask(TaskRuntime<RewriteStageTask> taskRuntime) {
652+
lock.lock();
653+
try {
654+
taskRuntime.reset();
655+
taskQueue.add(taskRuntime);
656+
} finally {
657+
lock.unlock();
658+
}
659+
}
660+
615661
@Override
616662
public boolean isClosed() {
617663
return status == ProcessStatus.KILLED;

amoro-ams/src/main/java/org/apache/amoro/server/optimizing/TaskRuntime.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public SimpleFuture getCompletedFuture() {
8181
return future;
8282
}
8383

84-
public void complete(OptimizerThread thread, OptimizingTaskResult result) {
84+
void complete(OptimizerThread thread, OptimizingTaskResult result) {
8585
invokeConsistency(
8686
() -> {
8787
validThread(thread);
@@ -121,7 +121,7 @@ void reset() {
121121
});
122122
}
123123

124-
public void schedule(OptimizerThread thread) {
124+
void schedule(OptimizerThread thread) {
125125
invokeConsistency(
126126
() -> {
127127
statusMachine.accept(Status.SCHEDULED);
@@ -132,7 +132,7 @@ public void schedule(OptimizerThread thread) {
132132
});
133133
}
134134

135-
public void ack(OptimizerThread thread) {
135+
void ack(OptimizerThread thread) {
136136
invokeConsistency(
137137
() -> {
138138
validThread(thread);

amoro-ams/src/test/java/org/apache/amoro/server/dashboard/utils/TestOptimizingUtil.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,12 @@ public void testCalculateQuotaOccupy() {
9393
DefaultTableRuntime tableRuntime = initTableWithFiles();
9494
OptimizingQueue queue = buildOptimizingGroupService(tableRuntime);
9595
Assert.assertEquals(0, queue.collectTasks().size());
96-
TaskRuntime task = queue.pollTask(MAX_POLLING_TIME);
97-
task.schedule(optimizerThread);
98-
task.ack(optimizerThread);
96+
TaskRuntime<?> task = queue.pollTask(optimizerThread, MAX_POLLING_TIME);
97+
queue.ackTask(task.getTaskId(), optimizerThread);
9998
Assert.assertEquals(
10099
1, queue.collectTasks(t -> t.getStatus() == TaskRuntime.Status.ACKED).size());
101100
Assert.assertNotNull(task);
102-
task.complete(
101+
queue.completeTask(
103102
optimizerThread,
104103
buildOptimizingTaskResult(task.getTaskId(), optimizerThread.getThreadId()));
105104
Assert.assertEquals(TaskRuntime.Status.SUCCESS, task.getStatus());

0 commit comments

Comments
 (0)