|
14 | 14 | # limitations under the License. |
15 | 15 |
|
16 | 16 | from __future__ import absolute_import |
| 17 | +from oslo_config import cfg |
17 | 18 |
|
18 | 19 | from orquesta import statuses |
19 | | - |
| 20 | +from tooz.coordination import GroupNotCreated |
| 21 | +from st2common.services import coordination |
| 22 | +from eventlet.semaphore import Semaphore |
| 23 | +from eventlet import spawn_after |
20 | 24 | from st2common.constants import action as ac_const |
21 | 25 | from st2common import log as logging |
22 | 26 | from st2common.metrics import base as metrics |
23 | 27 | from st2common.models.db import execution as ex_db_models |
24 | 28 | from st2common.models.db import workflow as wf_db_models |
25 | 29 | from st2common.persistence import liveaction as lv_db_access |
26 | 30 | from st2common.persistence import workflow as wf_db_access |
| 31 | +from st2common.persistence import execution as ex_db_access |
| 32 | +from st2common.services import action as ac_svc |
27 | 33 | from st2common.services import policies as pc_svc |
28 | 34 | from st2common.services import workflows as wf_svc |
29 | 35 | from st2common.transport import consumers |
30 | 36 | from st2common.transport import queues |
31 | 37 | from st2common.transport import utils as txpt_utils |
32 | | - |
| 38 | +from st2common.util import concurrency |
| 39 | +from st2common.util import action_db as action_utils |
33 | 40 |
|
34 | 41 | LOG = logging.getLogger(__name__) |
35 | 42 |
|
|
40 | 47 | queues.WORKFLOW_ACTION_EXECUTION_UPDATE_QUEUE, |
41 | 48 | ] |
42 | 49 |
|
# Name of the service-registry group that workflow engine instances join.
WORKFLOW_ENGINE = "workflow_engine"
# Distributed lock name (and pause/resume "requester" tag) used to serialize
# the engine's graceful shutdown and startup-resume sequences across instances.
WORKFLOW_ENGINE_START_STOP_SEQ = "workflow_engine_start_stop_seq"

43 | 53 |
|
44 | 54 | class WorkflowExecutionHandler(consumers.VariableMessageHandler): |
    def __init__(self, connection, queues):
        """Initialize the handler plus the graceful-shutdown bookkeeping state."""
        super(WorkflowExecutionHandler, self).__init__(connection, queues)
        # Count of messages currently being processed; shutdown() polls this
        # to drain in-flight work before pausing running workflows.
        self._active_messages = 0
        # Guards updates to self._active_messages across eventlet green threads.
        self._semaphore = Semaphore()
        # This is required to ensure workflows stuck in pausing state after
        # shutdown transition to paused state after engine startup.
        self._delay = 30
47 | 61 |
|
48 | 62 | def handle_workflow_execution_with_instrumentation(wf_ex_db): |
49 | 63 | with metrics.CounterWithTimer(key="orquesta.workflow.executions"): |
@@ -78,13 +92,69 @@ def process(self, message): |
78 | 92 | raise ValueError(msg) |
79 | 93 |
|
80 | 94 | try: |
| 95 | + with self._semaphore: |
| 96 | + self._active_messages += 1 |
81 | 97 | handler_function(message) |
82 | 98 | except Exception as e: |
83 | 99 | # If the exception is caused by DB connection error, then the following |
84 | 100 | # error handling routine will fail as well because it will try to update |
85 | 101 | # the database and fail the workflow execution gracefully. In this case, |
86 | 102 | # the garbage collector will find and cancel these workflow executions. |
87 | 103 | self.fail_workflow_execution(message, e) |
| 104 | + finally: |
| 105 | + with self._semaphore: |
| 106 | + self._active_messages -= 1 |
| 107 | + |
    def start(self, wait):
        """Start consuming messages and schedule the post-startup resume pass.

        The resume pass runs ``self._delay`` seconds after startup so that
        workflows paused by a previous graceful shutdown are resumed.
        """
        # Scheduled before super().start() because start(wait=True) may block
        # in the consumer loop and never reach a later statement.
        spawn_after(self._delay, self._resume_workflows_paused_during_shutdown)
        super(WorkflowExecutionHandler, self).start(wait=wait)
| 111 | + |
    def shutdown(self):
        """Gracefully shut down the handler.

        Drains in-flight messages (bounded by the configured timeout), then —
        if this appears to be the last workflow engine in the service
        registry — pauses all running orquesta workflows so they can be
        resumed by the next engine that starts up.
        """
        super(WorkflowExecutionHandler, self).shutdown()

        # Poll until in-flight messages finish or the exit timeout elapses.
        exit_timeout = cfg.CONF.workflow_engine.exit_still_active_check
        sleep_delay = cfg.CONF.workflow_engine.still_active_check_interval
        timeout = 0  # accumulates elapsed seconds, despite the name

        while timeout < exit_timeout and self._active_messages > 0:
            concurrency.sleep(sleep_delay)
            timeout += sleep_delay

        coordinator = coordination.get_coordinator()
        member_ids = []
        # Serialize with other engines' start/stop sequences so pause and
        # resume passes cannot interleave.
        with coordinator.get_lock(WORKFLOW_ENGINE_START_STOP_SEQ):
            try:
                group_id = coordination.get_group_id(WORKFLOW_ENGINE)
                member_ids = list(coordinator.get_members(group_id).get())
            except GroupNotCreated:
                # No engine has registered the group yet; treat as no members.
                pass

            # Check if there are other WFEs in service registry.
            # NOTE(review): assumes this instance has already left the group
            # (presumably inside super().shutdown()) — confirm; otherwise
            # member_ids would always include self and this branch never runs.
            if cfg.CONF.coordination.service_registry and not member_ids:
                ac_ex_dbs = self._get_running_workflows()
                for ac_ex_db in ac_ex_dbs:
                    lv_ac = action_utils.get_liveaction_by_id(ac_ex_db.liveaction["id"])
                    # Tag the pause with the start/stop marker so the next
                    # engine's startup can find and resume exactly these.
                    ac_svc.request_pause(lv_ac, WORKFLOW_ENGINE_START_STOP_SEQ)
| 137 | + |
| 138 | + def _get_running_workflows(self): |
| 139 | + query_filters = { |
| 140 | + "runner__name": "orquesta", |
| 141 | + "status": ac_const.LIVEACTION_STATUS_RUNNING, |
| 142 | + } |
| 143 | + return ex_db_access.ActionExecution.query(**query_filters) |
| 144 | + |
| 145 | + def _get_workflows_paused_during_shutdown(self): |
| 146 | + query_filters = { |
| 147 | + "status": ac_const.LIVEACTION_STATUS_PAUSED, |
| 148 | + "context__paused_by": WORKFLOW_ENGINE_START_STOP_SEQ, |
| 149 | + } |
| 150 | + return lv_db_access.LiveAction.query(**query_filters) |
| 151 | + |
| 152 | + def _resume_workflows_paused_during_shutdown(self): |
| 153 | + coordinator = coordination.get_coordinator() |
| 154 | + with coordinator.get_lock(WORKFLOW_ENGINE_START_STOP_SEQ): |
| 155 | + lv_ac_dbs = self._get_workflows_paused_during_shutdown() |
| 156 | + for lv_ac_db in lv_ac_dbs: |
| 157 | + ac_svc.request_resume(lv_ac_db, WORKFLOW_ENGINE_START_STOP_SEQ) |
88 | 158 |
|
89 | 159 | def fail_workflow_execution(self, message, exception): |
90 | 160 | # Prepare attributes based on message type. |
|
0 commit comments