@@ -168,13 +168,15 @@ private RemotingCommand process(boolean receiveNewJob,
168168
169169 // 判断是否接受新任务
170170 if (receiveNewJob ) {
171- // 查看有没有其他可以执行的任务
172- JobPushRequest jobPushRequest = getNewJob (taskTrackerNodeGroup , taskTrackerIdentity );
173- // 返回 新的任务
174- return RemotingCommand .createResponseCommand (RemotingProtos
175- .ResponseCode .SUCCESS .code (), jobPushRequest );
171+ try {
172+ // 查看有没有其他可以执行的任务
173+ JobPushRequest jobPushRequest = getNewJob (taskTrackerNodeGroup , taskTrackerIdentity );
174+ // 返回 新的任务
175+ return RemotingCommand .createResponseCommand (RemotingProtos
176+ .ResponseCode .SUCCESS .code (), jobPushRequest );
177+ } catch (Exception ignored ) {
178+ }
176179 }
177-
178180 // 返回给 任务执行端
179181 return RemotingCommand .createResponseCommand (RemotingProtos
180182 .ResponseCode .SUCCESS .code ());
@@ -276,7 +278,9 @@ private JobPushRequest getNewJob(String taskTrackerNodeGroup, String taskTracker
276278 try {
277279 application .getExecutingJobQueue ().add (jobPo );
278280 } catch (DuplicateJobException e ) {
279- throw e ;
281+ LOGGER .warn (e .getMessage (), e );
282+ application .getExecutableJobQueue ().resume (jobPo );
283+ return null ;
280284 }
281285 application .getExecutableJobQueue ().remove (jobPo .getTaskTrackerNodeGroup (), jobPo .getJobId ());
282286
@@ -304,9 +308,6 @@ private void finishProcess(List<TaskTrackerJobResult> results) {
304308 for (TaskTrackerJobResult result : results ) {
305309
306310 JobWrapper jobWrapper = result .getJobWrapper ();
307- // 从正在执行的队列中移除 TODO 如果在这个时候down机了,数据丢失了
308- application .getExecutingJobQueue ().remove (jobWrapper .getJobId ());
309-
310311 if (jobWrapper .getJob ().isSchedule ()) {
311312
312313 JobPo cronJobPo = application .getCronJobQueue ().finish (jobWrapper .getJobId ());
@@ -330,6 +331,9 @@ private void finishProcess(List<TaskTrackerJobResult> results) {
330331 LOGGER .error (e .getMessage (), e );
331332 }
332333 }
334+ // 从正在执行的队列中移除
335+ application .getExecutingJobQueue ().remove (jobWrapper .getJobId ());
336+
333337 }
334338 }
335339
@@ -347,9 +351,6 @@ private void retryProcess(List<TaskTrackerJobResult> results) {
347351 JobPo jobPo = application .getExecutingJobQueue ().get (jobWrapper .getJobId ());
348352 if (jobPo != null ) {
349353
350- // 从正在执行的队列中移除 TODO 如果在这个时候down机了,数据丢失了
351- application .getExecutingJobQueue ().remove (jobPo .getJobId ());
352-
353354 // 重试次数+1
354355 jobPo .setRetryTimes ((jobPo .getRetryTimes () == null ? 0 : jobPo .getRetryTimes ()) + 1 );
355356 Long nextRetryTriggerTime = DateUtils .addMinute (new Date (), jobPo .getRetryTimes ()).getTime ();
@@ -389,6 +390,10 @@ private void retryProcess(List<TaskTrackerJobResult> results) {
389390 LOGGER .error (e .getMessage (), e );
390391 }
391392 }
393+
394+ // 从正在执行的队列中移除
395+ application .getExecutingJobQueue ().remove (jobPo .getJobId ());
396+
392397 }
393398 }
394399 }
0 commit comments