Skip to content

Commit 5c1107a

Browse files
author
Khushboo
committed
Workflow engine graceful shutdown
1 parent 721aa69 commit 5c1107a

File tree

3 files changed

+124
-2
lines changed

3 files changed

+124
-2
lines changed

st2actions/st2actions/workflows/workflows.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,29 @@
1414
# limitations under the License.
1515

1616
from __future__ import absolute_import
17+
from oslo_config import cfg
1718

1819
from orquesta import statuses
19-
20+
from tooz.coordination import GroupNotCreated
21+
from st2common.services import coordination
22+
from eventlet.semaphore import Semaphore
23+
from eventlet import spawn_after
2024
from st2common.constants import action as ac_const
2125
from st2common import log as logging
2226
from st2common.metrics import base as metrics
2327
from st2common.models.db import execution as ex_db_models
2428
from st2common.models.db import workflow as wf_db_models
2529
from st2common.persistence import liveaction as lv_db_access
2630
from st2common.persistence import workflow as wf_db_access
31+
from st2common.persistence import execution as ex_db_access
32+
from st2common.services import action as ac_svc
2733
from st2common.services import policies as pc_svc
2834
from st2common.services import workflows as wf_svc
2935
from st2common.transport import consumers
3036
from st2common.transport import queues
3137
from st2common.transport import utils as txpt_utils
32-
38+
from st2common.util import concurrency
39+
from st2common.util import action_db as action_utils
3340

3441
LOG = logging.getLogger(__name__)
3542

@@ -40,10 +47,15 @@
4047
queues.WORKFLOW_ACTION_EXECUTION_UPDATE_QUEUE,
4148
]
4249

50+
WORKFLOW_ENGINE = "workflow_engine"
51+
SHUTDOWN_ROUTINE = "shutdown_routine"
52+
4353

4454
class WorkflowExecutionHandler(consumers.VariableMessageHandler):
4555
def __init__(self, connection, queues):
4656
super(WorkflowExecutionHandler, self).__init__(connection, queues)
57+
self._active_messages = 0
58+
self._semaphore = Semaphore()
4759

4860
def handle_workflow_execution_with_instrumentation(wf_ex_db):
4961
with metrics.CounterWithTimer(key="orquesta.workflow.executions"):
@@ -62,6 +74,10 @@ def handle_action_execution_with_instrumentation(ac_ex_db):
6274
ex_db_models.ActionExecutionDB: handle_action_execution_with_instrumentation,
6375
}
6476

77+
# The delay gives workflows left in the pausing state by a shutdown time to transition to the paused state after engine startup, before the resume routine runs.
78+
self._delay = 30
79+
spawn_after(self._delay, self._resume_workflows_paused_during_shutdown)
80+
6581
def get_queue_consumer(self, connection, queues):
6682
# We want to use a special ActionsQueueConsumer which uses 2 dispatcher pools
6783
return consumers.VariableMessageQueueConsumer(
@@ -78,13 +94,61 @@ def process(self, message):
7894
raise ValueError(msg)
7995

8096
try:
97+
with self._semaphore:
98+
self._active_messages += 1
8199
handler_function(message)
82100
except Exception as e:
83101
# If the exception is caused by DB connection error, then the following
84102
# error handling routine will fail as well because it will try to update
85103
# the database and fail the workflow execution gracefully. In this case,
86104
# the garbage collector will find and cancel these workflow executions.
87105
self.fail_workflow_execution(message, e)
106+
finally:
107+
with self._semaphore:
108+
self._active_messages -= 1
109+
110+
def shutdown(self):
    """Shut down the handler, drain in-flight messages, then pause workflows.

    After the parent handler is shut down, this waits until no message is
    still being processed by this handler. Then, while holding the
    ``SHUTDOWN_ROUTINE`` coordination lock, it looks up the members of the
    workflow engine group. If the service registry is enabled and no
    members are listed, every execution returned by
    ``_get_running_workflows`` is asked to pause on behalf of
    ``SHUTDOWN_ROUTINE``.

    A failure to pause one workflow is logged and does not prevent the
    remaining workflows from being paused.
    """
    super(WorkflowExecutionHandler, self).shutdown()

    # Wait for messages that are currently being processed to finish.
    # NOTE(review): this wait has no upper bound; a handler that never
    # returns will block shutdown indefinitely. Consider a configurable
    # timeout.
    while self._active_messages > 0:
        concurrency.sleep(2)

    coordinator = coordination.get_coordinator()
    member_ids = []

    with coordinator.get_lock(SHUTDOWN_ROUTINE):
        try:
            member_ids = list(
                coordinator.get_members(WORKFLOW_ENGINE.encode("utf-8")).get()
            )
        except GroupNotCreated:
            # The group does not exist, so there are no members to report.
            pass

        # Only pause workflows when the service registry is enabled and no
        # workflow engine members are listed in it.
        if cfg.CONF.coordination.service_registry and not member_ids:
            for ac_ex_db in self._get_running_workflows():
                # Isolate failures so that one bad execution does not leave
                # the remaining running workflows un-paused.
                try:
                    lv_ac = action_utils.get_liveaction_by_id(
                        ac_ex_db.liveaction["id"]
                    )
                    ac_svc.request_pause(lv_ac, SHUTDOWN_ROUTINE)
                except Exception:
                    LOG.exception(
                        'Unable to request pause for action execution "%s" '
                        "during workflow engine shutdown.",
                        str(ac_ex_db.id),
                    )
131+
132+
def _get_running_workflows(self):
    """Query action executions run by the orquesta runner that are running."""
    return ex_db_access.ActionExecution.query(
        runner__name="orquesta",
        status=ac_const.LIVEACTION_STATUS_RUNNING,
    )
138+
139+
def _get_workflows_paused_during_shutdown(self):
    """Query paused live actions whose context marks ``SHUTDOWN_ROUTINE`` as the pauser."""
    return lv_db_access.LiveAction.query(
        status=ac_const.LIVEACTION_STATUS_PAUSED,
        context__paused_by=SHUTDOWN_ROUTINE,
    )
145+
146+
def _resume_workflows_paused_during_shutdown(self):
    """Request resume for every workflow that was paused by the shutdown routine.

    The work is done while holding the ``SHUTDOWN_ROUTINE`` coordination
    lock. A failure to resume one workflow is logged and does not prevent
    the remaining workflows from being resumed.
    """
    coordinator = coordination.get_coordinator()

    with coordinator.get_lock(SHUTDOWN_ROUTINE):
        for lv_ac_db in self._get_workflows_paused_during_shutdown():
            # Isolate failures so that one bad live action does not leave
            # the remaining workflows paused.
            try:
                ac_svc.request_resume(lv_ac_db, SHUTDOWN_ROUTINE)
            except Exception:
                LOG.exception(
                    'Unable to request resume for live action "%s" that was '
                    "paused during workflow engine shutdown.",
                    str(lv_ac_db.id),
                )
88152

89153
def fail_workflow_execution(self, message, exception):
90154
# Prepare attributes based on message type.

st2actions/tests/unit/test_workflow_engine.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,60 @@ def test_process_error_handling_has_error(self, mock_get_lock):
271271
# Assert workflow execution is cleaned up and canceled.
272272
lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
273273
self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_CANCELED)
274+
275+
def test_workflow_engine_shutdown(self):
    """Verify that engine shutdown pauses a running workflow and startup resumes it.

    The test enables the service registry, starts a workflow, runs the
    engine shutdown in a green thread, and checks that the workflow goes
    from pausing to paused once its in-flight task is processed. It then
    waits for the engine's delayed resume routine and checks that the
    workflow is no longer paused.
    """
    cfg.CONF.set_override(
        name="service_registry", override=True, group="coordination"
    )
    # Restore the option afterwards so the override does not leak into
    # other tests.
    self.addCleanup(
        cfg.CONF.clear_override, "service_registry", group="coordination"
    )

    wf_meta = self.get_wf_fixture_meta_data(TEST_PACK_PATH, "sequential.yaml")
    lv_ac_db = lv_db_models.LiveActionDB(action=wf_meta["name"])
    lv_ac_db, ac_ex_db = action_service.request(lv_ac_db)

    # Assert action execution is running.
    lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
    self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_RUNNING)
    wf_ex_db = wf_db_access.WorkflowExecution.query(
        action_execution=str(ac_ex_db.id)
    )[0]
    self.assertEqual(wf_ex_db.status, action_constants.LIVEACTION_STATUS_RUNNING)
    workflow_engine = workflows.get_engine()

    # Manually add running workflow.
    # NOTE(review): the handler code visible in this change tracks in-flight
    # work through ``_active_messages`` and never reads
    # ``_handling_workflows``; confirm whether this assignment is needed.
    workflow_engine._handling_workflows = [str(ac_ex_db.id)]
    eventlet.spawn(workflow_engine.shutdown)

    # Sleep for a few seconds to ensure execution transitions to pausing.
    eventlet.sleep(5)

    lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
    self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_PAUSING)

    # Process task1.
    query_filters = {"workflow_execution": str(wf_ex_db.id), "task_id": "task1"}
    t1_ex_db = wf_db_access.TaskExecution.query(**query_filters)[0]
    t1_ac_ex_db = ex_db_access.ActionExecution.query(
        task_execution=str(t1_ex_db.id)
    )[0]

    workflows.get_engine().process(t1_ac_ex_db)
    t1_ac_ex_db = ex_db_access.ActionExecution.query(
        task_execution=str(t1_ex_db.id)
    )[0]
    self.assertEqual(
        t1_ac_ex_db.status, action_constants.LIVEACTION_STATUS_SUCCEEDED
    )

    lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
    self.assertEqual(lv_ac_db.status, action_constants.LIVEACTION_STATUS_PAUSED)

    # Wait out the engine's delayed resume routine before checking status.
    workflow_engine = workflows.get_engine()
    eventlet.sleep(workflow_engine._delay)
    lv_ac_db = lv_db_access.LiveAction.get_by_id(str(lv_ac_db.id))
    # assertIn reports the actual status on failure, unlike assertTrue.
    self.assertIn(
        lv_ac_db.status,
        [
            action_constants.LIVEACTION_STATUS_RESUMING,
            action_constants.LIVEACTION_STATUS_RUNNING,
            action_constants.LIVEACTION_STATUS_SUCCEEDED,
        ],
    )

st2common/st2common/transport/consumers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(self, connection, queues, handler):
4343
self._handler = handler
4444

4545
def shutdown(self):
    """Set the stop flag on this consumer, then shut down its dispatcher."""
    # NOTE(review): presumably ``should_stop`` is the flag polled by the
    # consuming loop of the base consumer class; confirm against the base
    # class, which is not visible here.
    self.should_stop = True
    self._dispatcher.shutdown()
4748

4849
def get_consumers(self, Consumer, channel):

0 commit comments

Comments
 (0)