fix: correct state race condition (#270)

sicoyle · web-flow · commit 0afa795e67d6 · 2025-11-11T16:25:38.000-05:00
* fix: only persist data once

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* style: rm debug log

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* style: appease linter

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: also prevent duplicate tool msgs upon constructing chat history

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: actually address race condition on parallel tool result state saves and use mem as single source of truth if available

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: always load state, add mem msg atomic and always save msgs to mem

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* style: lint fix

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: appease flake*

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* style: last fix for flake8

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: updates for tests

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* style: make lint happy

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: bring changes to correct branch ugh

Signed-off-by: Samantha Coyle <sam@diagrid.io>

* fix: i hate the linter

Signed-off-by: Samantha Coyle <sam@diagrid.io>

---------

Signed-off-by: Samantha Coyle <sam@diagrid.io>
diff --git a/dapr_agents/agents/base.py b/dapr_agents/agents/base.py
@@ -512,7 +512,7 @@ def _get_last_user_message(
     # ------------------------------------------------------------------
     # State-aware message helpers (use AgentComponents' state model)
     # ------------------------------------------------------------------
-    def _construct_messages_with_instance_history(
+    def _reconstruct_conversation_history(
         self, instance_id: str
     ) -> List[Dict[str, Any]]:
         """
@@ -546,10 +546,11 @@ def _construct_messages_with_instance_history(
         except Exception:  # noqa: BLE001
             logger.debug("Unable to load persistent memory.", exc_info=True)
 
-        history: List[Dict[str, Any]] = []
-        history.extend(persistent_memory)
-        history.extend(instance_messages)
-        return history
+        # Persistent conversation history in the memory config is the single source of truth for conversation history
+        if persistent_memory:
+            return persistent_memory
+        # Note: this is just to make tests happy for now; in reality, for the durable agent, this is not used for app resumption of state
+        return instance_messages
 
     def _sync_system_messages_with_state(
         self,
@@ -585,23 +586,22 @@ def _process_user_message(
 
         container = self._get_entry_container()
         entry = container.get(instance_id) if container else None
-        if entry is None or not hasattr(entry, "messages"):
-            return
-
-        # Use configured coercer / message model
-        message_model = (
-            self._message_coercer(user_message_copy)  # type: ignore[attr-defined]
-            if getattr(self, "_message_coercer", None)
-            else self._message_dict_to_message_model(user_message_copy)
-        )
-        entry.messages.append(message_model)  # type: ignore[attr-defined]
-        if hasattr(entry, "last_message"):
-            entry.last_message = message_model  # type: ignore[attr-defined]
+        if entry is not None and hasattr(entry, "messages"):
+            # Use configured coercer / message model
+            message_model = (
+                self._message_coercer(user_message_copy)  # type: ignore[attr-defined]
+                if getattr(self, "_message_coercer", None)
+                else self._message_dict_to_message_model(user_message_copy)
+            )
+            entry.messages.append(message_model)  # type: ignore[attr-defined]
+            if hasattr(entry, "last_message"):
+                entry.last_message = message_model  # type: ignore[attr-defined]
 
-        session_id = getattr(getattr(self, "memory", None), "session_id", None)
-        if session_id is not None and hasattr(entry, "session_id"):
-            entry.session_id = str(session_id)  # type: ignore[attr-defined]
+            session_id = getattr(getattr(self, "memory", None), "session_id", None)
+            if session_id is not None and hasattr(entry, "session_id"):
+                entry.session_id = str(session_id)  # type: ignore[attr-defined]
 
+        # Always add to memory (required for chat history for agent durability upon restarts)
         self.memory.add_message(
             UserMessage(content=user_message_copy.get("content", ""))
         )
@@ -621,24 +621,25 @@ def _save_assistant_message(
 
         container = self._get_entry_container()
         entry = container.get(instance_id) if container else None
-        if entry is None or not hasattr(entry, "messages"):
-            return
-
-        message_id = assistant_message.get("id")
-        if message_id and any(
-            getattr(msg, "id", None) == message_id for msg in getattr(entry, "messages")
-        ):
-            return
-
-        message_model = (
-            self._message_coercer(assistant_message)  # type: ignore[attr-defined]
-            if getattr(self, "_message_coercer", None)
-            else self._message_dict_to_message_model(assistant_message)
-        )
-        entry.messages.append(message_model)  # type: ignore[attr-defined]
-        if hasattr(entry, "last_message"):
-            entry.last_message = message_model  # type: ignore[attr-defined]
+        if entry is not None and hasattr(entry, "messages"):
+            message_id = assistant_message.get("id")
+            if message_id and any(
+                getattr(msg, "id", None) == message_id
+                for msg in getattr(entry, "messages")
+            ):
+                # Duplicate in state - skip state update but still add to memory
+                pass
+            else:
+                message_model = (
+                    self._message_coercer(assistant_message)  # type: ignore[attr-defined]
+                    if getattr(self, "_message_coercer", None)
+                    else self._message_dict_to_message_model(assistant_message)
+                )
+                entry.messages.append(message_model)  # type: ignore[attr-defined]
+                if hasattr(entry, "last_message"):
+                    entry.last_message = message_model  # type: ignore[attr-defined]
 
+        # Always add to memory (required for chat history)
         self.memory.add_message(AssistantMessage(**assistant_message))
         self.save_state()
 
diff --git a/dapr_agents/agents/durable.py b/dapr_agents/agents/durable.py
@@ -179,7 +179,8 @@ def agent_workflow(self, ctx: wf.DaprWorkflowContext, message: dict):
         source = metadata.get("source") or "direct"
 
         # Ensure we have the latest durable state for this turn.
-        self.load_state()
+        if self.state_store:
+            self.load_state()
 
         # Bootstrap instance entry (flexible to non-`instances` models).
         self.ensure_instance_exists(
@@ -369,6 +370,10 @@ def record_initial_entry(
                 - start_time: ISO8601 datetime string.
                 - trace_context: Optional tracing context.
         """
+        # Load latest state to ensure we have current data before modifying
+        if self.state_store:
+            self.load_state()
+
         instance_id = payload.get("instance_id")
         trace_context = payload.get("trace_context")
         input_value = payload.get("input_value", "Triggered without input.")
@@ -418,10 +423,14 @@ def call_llm(
         Raises:
             AgentError: If the LLM call fails or yields no message.
         """
+        # Load latest state to ensure we have current data
+        if self.state_store:
+            self.load_state()
+
         instance_id = payload.get("instance_id")
         task = payload.get("task")
 
-        chat_history = self._construct_messages_with_instance_history(instance_id)
+        chat_history = self._reconstruct_conversation_history(instance_id)
         messages = self.prompting_helper.build_initial_messages(
             user_input=task,
             chat_history=chat_history,
@@ -481,6 +490,10 @@ def run_tool(
         Raises:
             AgentError: If tool arguments contain invalid JSON.
         """
+        # Load latest state to ensure we have current data before modifying
+        if self.state_store:
+            self.load_state()
+
         tool_call = payload.get("tool_call", {})
         instance_id = payload.get("instance_id")
         fn_name = tool_call["function"]["name"]
@@ -548,8 +561,27 @@ async def _execute_tool() -> Any:
                 if hasattr(entry, "last_message"):
                     entry.last_message = tool_message_model
 
-        # Always persist to memory + in-process tool history
-        self.memory.add_message(tool_message)
+        tool_call_id = agent_message["tool_call_id"]
+        # Check if tool message already exists in memory
+        existing_memory_messages = self.memory.get_messages()
+        tool_exists_in_memory = False
+        for mem_msg in existing_memory_messages:
+            msg_dict = (
+                mem_msg.model_dump()
+                if hasattr(mem_msg, "model_dump")
+                else (mem_msg if isinstance(mem_msg, dict) else {})
+            )
+            if (
+                msg_dict.get("role") == "tool"
+                and msg_dict.get("tool_call_id") == tool_call_id
+            ):
+                tool_exists_in_memory = True
+                break
+
+        # Only add to persistent memory if not already present
+        if not tool_exists_in_memory:
+            self.memory.add_message(tool_message)
+
         self.tool_history.append(history_entry)
 
         # Print the tool result for visibility
@@ -647,6 +679,10 @@ def finalize_workflow(
             payload: Dict with 'instance_id', 'final_output', 'end_time',
                      and optional 'triggering_workflow_instance_id'.
         """
+        # Load latest state to ensure we have current data before modifying
+        if self.state_store:
+            self.load_state()
+
         instance_id = payload.get("instance_id")
         final_output = payload.get("final_output", "")
         end_time = payload.get("end_time", "")
diff --git a/dapr_agents/agents/standalone.py b/dapr_agents/agents/standalone.py
@@ -175,7 +175,7 @@ async def _run_agent(
         active_instance = instance_id or self._generate_instance_id()
 
         # Build initial messages with persistent + per-instance history
-        chat_history = self._construct_messages_with_instance_history(active_instance)
+        chat_history = self._reconstruct_conversation_history(active_instance)
         messages = self.prompting_helper.build_initial_messages(
             user_input=input_data,
             chat_history=chat_history,
@@ -236,7 +236,7 @@ def construct_messages(
         """
         self.load_state()
         active_instance = instance_id or self._generate_instance_id()
-        chat_history = self._construct_messages_with_instance_history(active_instance)
+        chat_history = self._reconstruct_conversation_history(active_instance)
         return self.prompting_helper.build_initial_messages(
             user_input=input_data,
             chat_history=chat_history,
diff --git a/dapr_agents/memory/daprstatestore.py b/dapr_agents/memory/daprstatestore.py
@@ -85,23 +85,56 @@ def add_message(self, message: Union[Dict[str, Any], BaseMessage]) -> None:
             message (Union[Dict[str, Any], BaseMessage]): The message to add to the memory.
         """
         message = self._convert_to_dict(message)
-        message_id = str(uuid.uuid4())
-        message_key = self._get_message_key(message_id)
         message.update(
             {
                 "createdAt": datetime.now().isoformat() + "Z",
             }
         )
-        existing = self.get_messages()
-        existing.append(message)
-        logger.debug(
-            f"Adding message {message} with key {message_key} to session {self.session_id}"
-        )
-        self.dapr_store.save_state(
-            self.session_id,
-            json.dumps(existing),
-            state_metadata={"contentType": "application/json"},
-        )
+
+        # Retry loop for optimistic concurrency control
+        # TODO: make this nicer in future, but for durability this must all be atomic
+        max_attempts = 10
+        for attempt in range(1, max_attempts + 1):
+            try:
+                response = self.dapr_store.get_state(
+                    self.session_id,
+                    state_metadata={"contentType": "application/json"},
+                )
+
+                if response and response.data:
+                    existing = json.loads(response.data)
+                    etag = response.etag
+                else:
+                    existing = []
+                    etag = None
+
+                existing.append(message)
+                # Save with etag - will fail if someone else modified it
+                self.dapr_store.save_state(
+                    self.session_id,
+                    json.dumps(existing),
+                    state_metadata={"contentType": "application/json"},
+                    etag=etag,
+                )
+
+                # Success - exit retry loop
+                return
+
+            except Exception as exc:
+                if attempt == max_attempts:
+                    logger.exception(
+                        f"Failed to add message to session {self.session_id} after {max_attempts} attempts: {exc}"
+                    )
+                    raise
+                else:
+                    logger.warning(
+                        f"Conflict adding message to session {self.session_id} (attempt {attempt}/{max_attempts}): {exc}, retrying..."
+                    )
+                    # Brief linear backoff (capped at 0.5s before jitter) with up to 25% jitter
+                    import time
+                    import random
+
+                    time.sleep(min(0.1 * attempt, 0.5) * (1 + random.uniform(0, 0.25)))
 
     def add_messages(self, messages: List[Union[Dict[str, Any], BaseMessage]]) -> None:
         """
diff --git a/tests/agents/durableagent/test_durable_agent.py b/tests/agents/durableagent/test_durable_agent.py
@@ -431,7 +431,9 @@ async def test_finish_workflow_activity(self, basic_durable_agent):
         # Mock the activity context and save_state
         mock_ctx = Mock()
 
-        with patch.object(basic_durable_agent, "save_state"):
+        with patch.object(basic_durable_agent, "save_state"), patch.object(
+            basic_durable_agent, "load_state"
+        ):
             basic_durable_agent.finalize_workflow(
                 mock_ctx,
                 {
@@ -487,7 +489,9 @@ def test_run_tool(self, basic_durable_agent, mock_tool):
             # Mock the activity context and save_state
             mock_ctx = Mock()
 
-            with patch.object(basic_durable_agent, "save_state"):
+            with patch.object(basic_durable_agent, "save_state"), patch.object(
+                basic_durable_agent, "load_state"
+            ):
                 result = basic_durable_agent.run_tool(
                     mock_ctx,
                     {
@@ -741,7 +745,9 @@ def test_create_tool_message_objects(self, basic_durable_agent):
 
             mock_ctx = Mock()
 
-            with patch.object(basic_durable_agent, "save_state"):
+            with patch.object(basic_durable_agent, "save_state"), patch.object(
+                basic_durable_agent, "load_state"
+            ):
                 result = basic_durable_agent.run_tool(
                     mock_ctx,
                     {
@@ -805,7 +811,9 @@ def test_tool_func(x):
         )
 
         # Mock save_state to prevent actual persistence
-        with patch.object(basic_durable_agent, "save_state"):
+        with patch.object(basic_durable_agent, "save_state"), patch.object(
+            basic_durable_agent, "load_state"
+        ):
             mock_ctx = Mock()
 
             # Call run_tool activity which appends messages and tool_history
@@ -892,8 +900,8 @@ def test_tool_func(x: str) -> str:
         assert basic_durable_agent.tool_history[0].tool_call_id == "call_123"
         assert basic_durable_agent.tool_history[0].tool_name == "TestToolFunc"
 
-    def test_construct_messages_with_instance_history(self, basic_durable_agent):
-        """Test _construct_messages_with_instance_history helper method."""
+    def test_reconstruct_conversation_history(self, basic_durable_agent):
+        """Test _reconstruct_conversation_history helper method."""
         from datetime import datetime, timezone
 
         instance_id = "test-instance-123"
@@ -918,9 +926,7 @@ def test_construct_messages_with_instance_history(self, basic_durable_agent):
             start_time=datetime.now(timezone.utc),
         )
 
-        messages = basic_durable_agent._construct_messages_with_instance_history(
-            instance_id
-        )
+        messages = basic_durable_agent._reconstruct_conversation_history(instance_id)
 
         # Should include messages from instance history (system messages excluded from instance timeline)
         # Plus any messages from memory
diff --git a/tests/agents/durableagent/test_mcp_streamable_http.py b/tests/agents/durableagent/test_mcp_streamable_http.py