Account for encrypted reasoning for auto compaction (openai#7113)

aibrahim-oai · web-flow · commit b519267d0545 · 2025-11-22T03:06:45.000Z
- The total token usage returned by the API doesn't account for the
reasoning items that precede the assistant message
- Account for those items when deciding whether to auto-compact
- Add encrypted reasoning content to the common test utils
- Add a test to make sure it works as expected
diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs
@@ -661,6 +661,11 @@ impl Session {
         format!("auto-compact-{id}")
     }
 
+    async fn get_total_token_usage(&self) -> i64 { // session-level accessor used by the auto-compact check in run_task
+        let state = self.state.lock().await; // guard is held only until this function returns
+        state.get_total_token_usage() // delegates to the locked state's own accessor
+    }
+
     async fn record_initial_history(&self, conversation_history: InitialHistory) {
         let turn_context = self.new_turn(SessionSettingsUpdate::default()).await;
         match conversation_history {
@@ -1958,20 +1963,13 @@ pub(crate) async fn run_task(
         .await
         {
             Ok(turn_output) => {
-                let TurnRunResult {
-                    processed_items,
-                    total_token_usage,
-                } = turn_output;
+                let processed_items = turn_output;
                 let limit = turn_context
                     .client
                     .get_auto_compact_token_limit()
                     .unwrap_or(i64::MAX);
-                let total_usage_tokens = total_token_usage
-                    .as_ref()
-                    .map(TokenUsage::tokens_in_context_window);
-                let token_limit_reached = total_usage_tokens
-                    .map(|tokens| tokens >= limit)
-                    .unwrap_or(false);
+                let total_usage_tokens = sess.get_total_token_usage().await;
+                let token_limit_reached = total_usage_tokens >= limit;
                 let (responses, items_to_record_in_conversation_history) =
                     process_items(processed_items, &sess, &turn_context).await;
 
@@ -2028,7 +2026,7 @@ async fn run_turn(
     turn_diff_tracker: SharedTurnDiffTracker,
     input: Vec<ResponseItem>,
     cancellation_token: CancellationToken,
-) -> CodexResult<TurnRunResult> {
+) -> CodexResult<Vec<ProcessedResponseItem>> {
     let mcp_tools = sess
         .services
         .mcp_connection_manager
@@ -2159,12 +2157,6 @@ pub struct ProcessedResponseItem {
     pub response: Option<ResponseInputItem>,
 }
 
-#[derive(Debug)]
-struct TurnRunResult {
-    processed_items: Vec<ProcessedResponseItem>,
-    total_token_usage: Option<TokenUsage>,
-}
-
 #[allow(clippy::too_many_arguments)]
 async fn try_run_turn(
     router: Arc<ToolRouter>,
@@ -2173,7 +2165,7 @@ async fn try_run_turn(
     turn_diff_tracker: SharedTurnDiffTracker,
     prompt: &Prompt,
     cancellation_token: CancellationToken,
-) -> CodexResult<TurnRunResult> {
+) -> CodexResult<Vec<ProcessedResponseItem>> {
     let rollout_item = RolloutItem::TurnContext(TurnContextItem {
         cwd: turn_context.cwd.clone(),
         approval_policy: turn_context.approval_policy,
@@ -2335,12 +2327,7 @@ async fn try_run_turn(
                     sess.send_event(&turn_context, msg).await;
                 }
 
-                let result = TurnRunResult {
-                    processed_items,
-                    total_token_usage: token_usage.clone(),
-                };
-
-                return Ok(result);
+                return Ok(processed_items);
             }
             ResponseEvent::OutputTextDelta(delta) => {
                 // In review child threads, suppress assistant text deltas; the
diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs
@@ -2,6 +2,7 @@ use crate::codex::TurnContext;
 use crate::context_manager::normalize;
 use crate::truncate::TruncationPolicy;
 use crate::truncate::approx_token_count;
+use crate::truncate::approx_tokens_from_byte_count;
 use crate::truncate::truncate_function_output_items_with_policy;
 use crate::truncate::truncate_text;
 use codex_protocol::models::FunctionCallOutputPayload;
@@ -119,6 +120,54 @@ impl ContextManager {
         );
     }
 
+    fn get_non_last_reasoning_items_tokens(&self) -> usize {
+        // Token estimate for encrypted reasoning items that precede the last user message.
+        let Some(last_user_index) = self
+            .items
+            .iter()
+            .rposition(|item| matches!(item, ResponseItem::Message { role, .. } if role == "user"))
+        else {
+            return 0usize; // no user message in the history: nothing is counted
+        };
+
+        let total_reasoning_bytes = self
+            .items
+            .iter()
+            .take(last_user_index) // only items strictly before the last user message
+            .filter_map(|item| {
+                if let ResponseItem::Reasoning {
+                    encrypted_content: Some(content),
+                    ..
+                } = item
+                {
+                    Some(content.len()) // byte length of the encrypted string
+                } else {
+                    None // non-reasoning items and reasoning without encrypted content are ignored
+                }
+            })
+            .map(Self::estimate_reasoning_length)
+            .fold(0usize, usize::saturating_add); // saturating sum of the per-item byte estimates
+
+        let token_estimate = approx_tokens_from_byte_count(total_reasoning_bytes);
+        token_estimate as usize // helper returns u64
+    }
+
+    fn estimate_reasoning_length(encoded_len: usize) -> usize { // maps an encoded length to an estimated byte count
+        encoded_len
+            .saturating_mul(3)
+            .checked_div(4) // with the line above: 3/4 of the encoded length (presumably base64 -> raw bytes; confirm)
+            .unwrap_or(0) // the divisor is the constant 4, so this fallback is never taken
+            .saturating_sub(650) // presumably a fixed per-item overhead in bytes (TODO confirm); clamps at 0
+    }
+
+    pub(crate) fn get_total_token_usage(&self) -> i64 { // recorded total plus an estimate for earlier encrypted reasoning
+        self.token_info
+            .as_ref()
+            .map(|info| info.last_token_usage.total_tokens) // total_tokens of the last recorded usage
+            .unwrap_or(0) // no token info recorded: start from 0
+            .saturating_add(self.get_non_last_reasoning_items_tokens() as i64) // add the reasoning estimate without overflowing
+    }
+
     /// This function enforces a couple of invariants on the in-memory history:
     /// 1. every call (function/custom) has a corresponding output entry
     /// 2. every output has a corresponding call entry
diff --git a/codex-rs/core/src/context_manager/history_tests.rs b/codex-rs/core/src/context_manager/history_tests.rs
@@ -56,6 +56,17 @@ fn reasoning_msg(text: &str) -> ResponseItem {
     }
 }
 
+fn reasoning_with_encrypted_content(len: usize) -> ResponseItem { // test helper: reasoning item with a `len`-byte encrypted payload
+    ResponseItem::Reasoning {
+        id: String::new(),
+        summary: vec![ReasoningItemReasoningSummary::SummaryText {
+            text: "summary".to_string(),
+        }],
+        content: None,
+        encrypted_content: Some("a".repeat(len)), // ASCII filler, so the string's len() equals `len`
+    }
+}
+
 fn truncate_exec_output(content: &str) -> String {
     truncate::truncate_text(content, TruncationPolicy::Tokens(EXEC_FORMAT_MAX_TOKENS))
 }
@@ -112,6 +123,28 @@ fn filters_non_api_messages() {
     );
 }
 
+#[test]
+fn non_last_reasoning_tokens_return_zero_when_no_user_messages() {
+    let history = create_history_with_items(vec![reasoning_with_encrypted_content(800)]); // history holds no user message
+
+    assert_eq!(history.get_non_last_reasoning_items_tokens(), 0); // without a user message nothing is counted
+}
+
+#[test]
+fn non_last_reasoning_tokens_ignore_entries_after_last_user() {
+    let history = create_history_with_items(vec![
+        reasoning_with_encrypted_content(900),
+        user_msg("first"),
+        reasoning_with_encrypted_content(1_000),
+        user_msg("second"),
+        reasoning_with_encrypted_content(2_000),
+    ]);
+    // first: (900 * 0.75 - 650) / 4 = 6.25 tokens
+    // second: (1000 * 0.75 - 650) / 4 = 25 tokens
+    // first + second = 31.25, rounded up to 32; the 2_000 item follows the last user message and is ignored
+    assert_eq!(history.get_non_last_reasoning_items_tokens(), 32);
+}
+
 #[test]
 fn get_history_for_prompt_drops_ghost_commits() {
     let items = vec![ResponseItem::GhostSnapshot {
diff --git a/codex-rs/core/src/state/session.rs b/codex-rs/core/src/state/session.rs
@@ -74,4 +74,8 @@ impl SessionState {
     pub(crate) fn set_token_usage_full(&mut self, context_window: i64) {
         self.history.set_token_usage_full(context_window);
     }
+
+    pub(crate) fn get_total_token_usage(&self) -> i64 { // state-level accessor for the total token usage
+        self.history.get_total_token_usage() // delegates to the history's token accounting
+    }
 }
diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs
@@ -296,7 +296,7 @@ fn approx_bytes_for_tokens(tokens: usize) -> usize {
     tokens.saturating_mul(APPROX_BYTES_PER_TOKEN)
 }
 
-fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
+pub(crate) fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
     let bytes_u64 = bytes as u64;
     bytes_u64.saturating_add((APPROX_BYTES_PER_TOKEN as u64).saturating_sub(1))
         / (APPROX_BYTES_PER_TOKEN as u64)
diff --git a/codex-rs/core/tests/common/Cargo.toml b/codex-rs/core/tests/common/Cargo.toml
@@ -9,6 +9,7 @@ path = "lib.rs"
 [dependencies]
 anyhow = { workspace = true }
 assert_cmd = { workspace = true }
+base64 = { workspace = true }
 codex-core = { workspace = true }
 codex-protocol = { workspace = true }
 notify = { workspace = true }
diff --git a/codex-rs/core/tests/common/responses.rs b/codex-rs/core/tests/common/responses.rs
@@ -2,6 +2,7 @@ use std::sync::Arc;
 use std::sync::Mutex;
 
 use anyhow::Result;
+use base64::Engine;
 use serde_json::Value;
 use wiremock::BodyPrintLimit;
 use wiremock::Match;
@@ -297,12 +298,18 @@ pub fn ev_reasoning_item(id: &str, summary: &[&str], raw_content: &[&str]) -> Va
         .map(|text| serde_json::json!({"type": "summary_text", "text": text}))
         .collect();
 
+    let overhead = "b".repeat(550);
+    let raw_content_joined = raw_content.join("");
+    let encrypted_content =
+        base64::engine::general_purpose::STANDARD.encode(overhead + raw_content_joined.as_str());
+
     let mut event = serde_json::json!({
         "type": "response.output_item.done",
         "item": {
             "type": "reasoning",
             "id": id,
             "summary": summary_entries,
+            "encrypted_content": encrypted_content,
         }
     });
 
diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs

Original file line number	Diff line number	Diff line change
`@@ -74,4 +74,8 @@ impl SessionState {`
`74`	`74`	`pub(crate) fn set_token_usage_full(&mut self, context_window: i64) {`
`75`	`75`	`self.history.set_token_usage_full(context_window);`
`76`	`76`	`}`
	`77`	`+`
	`78`	`+ pub(crate) fn get_total_token_usage(&self) -> i64 {`
	`79`	`+ self.history.get_total_token_usage()`
	`80`	`+ }`
`77`	`81`	`}`
Original file line number	Diff line number	Diff line change
`@@ -296,7 +296,7 @@ fn approx_bytes_for_tokens(tokens: usize) -> usize {`
`296`	`296`	`tokens.saturating_mul(APPROX_BYTES_PER_TOKEN)`
`297`	`297`	`}`
`298`	`298`
`299`		`-fn approx_tokens_from_byte_count(bytes: usize) -> u64 {`
	`299`	`+pub(crate) fn approx_tokens_from_byte_count(bytes: usize) -> u64 {`
`300`	`300`	`let bytes_u64 = bytes as u64;`
`301`	`301`	`bytes_u64.saturating_add((APPROX_BYTES_PER_TOKEN as u64).saturating_sub(1))`
`302`	`302`	`/ (APPROX_BYTES_PER_TOKEN as u64)`