From d2afa6cec361c87627ba536cc7b7cd3829a57c65 Mon Sep 17 00:00:00 2001 From: Ali Mehrabian <69728416+Ali-Meh619@users.noreply.github.com> Date: Mon, 10 Nov 2025 14:42:32 -0800 Subject: [PATCH 1/3] max_tokens & json utils 1) Fixed max_tokens limit in src/utils/model_client_utils.py: reduced from 30,720 to 16,384 to match gpt-4o's limit, preventing API errors. 2) Fixed JSON parsing in src/utils/json_utils.py: rewrote fix_common_json_errors() to handle LaTeX backslashes (\(, \), \to, \lim, etc.) by escaping them correctly, and added a fallback in the exception handler. This resolved JSON parsing errors in task generation and problem solving. --- src/utils/json_utils.py | 119 +++++++++++++++++++++++++++++++- src/utils/model_client_utils.py | 2 +- 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 3d2fd77..435d087 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -44,7 +44,97 @@ def fix_common_json_errors(content: str) -> str: """Fix common JSON syntax errors.""" content = re.sub(r':\s*=\s*"', ':"', content) content = re.sub(r'(\w+):\s*"', r'"\1":"', content) - content = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content) + + # Fix LaTeX backslashes: escape backslashes that are not part of valid JSON escape sequences + # Valid JSON escapes: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX + # LaTeX uses \(, \), \[, \], \epsilon, \delta, etc. which need to be escaped as \\(, \\), etc. 
+ + # Process the content character by character to properly handle string boundaries + result = [] + i = 0 + in_string = False + + while i < len(content): + char = content[i] + + # Track when we enter/exit string values + if char == '"': + # Check if this quote is escaped by counting backslashes before it + # An escaped quote has an odd number of backslashes before it + backslash_count = 0 + j = i - 1 + while j >= 0 and content[j] == '\\': + backslash_count += 1 + j -= 1 + + # If odd number of backslashes, the quote is escaped (part of string content) + if backslash_count % 2 == 1: + result.append(char) + i += 1 + continue + + # This is a real quote - toggle string state + in_string = not in_string + result.append(char) + i += 1 + continue + + # Handle backslashes inside string values + if char == '\\' and in_string: + if i + 1 < len(content): + next_char = content[i + 1] + # Check for valid JSON escape sequences + # For single-character escapes: ", \, /, b, f, n, r, t + # We need to ensure they're not part of a longer sequence (e.g., \to should not match \t) + if next_char in '"\\/': + # Always valid single-char escapes + result.append(char) + result.append(next_char) + i += 2 + continue + elif next_char in 'bfnrt': + # Check if this is a complete escape (not part of longer sequence) + # Valid if followed by non-alphanumeric or end of string + if i + 2 >= len(content) or not (content[i + 2].isalnum() or content[i + 2] == '_'): + # Complete escape sequence (e.g., \t, \n, \r, \b, \f) + result.append(char) + result.append(next_char) + i += 2 + continue + # Otherwise it's part of a longer sequence (e.g., \to, \lim) - escape it + result.append('\\\\') + result.append(next_char) + i += 2 + continue + elif next_char == 'u' and i + 5 < len(content): + # Check for \uXXXX pattern + hex_part = content[i + 2:i + 6] + if all(c in '0123456789abcdefABCDEF' for c in hex_part): + # Valid \uXXXX escape + result.append(char) + result.append(next_char) + result.append(hex_part) + i 
+= 6 + continue + + # Invalid escape sequence (like LaTeX \(, \), \[, \], \epsilon, \to, etc.) + # Double-escape the backslash + result.append('\\\\') + result.append(next_char) + i += 2 + continue + else: + # Backslash at end of content - escape it + result.append('\\\\') + i += 1 + continue + + # Regular character + result.append(char) + i += 1 + + content = ''.join(result) + return re.sub(r",(\s*[}\]])", r"\1", content) @@ -67,6 +157,33 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: log.error(f"Failed to parse JSON response: {e}") log.error(f"Content length: {len(cleaned_content)} characters") + # Try to fix LaTeX backslash issues if the error is about invalid escape + if "Invalid \\escape" in str(e) or "Invalid escape" in str(e): + try: + log.warning("Attempting to fix LaTeX backslash escape issues") + # Apply more aggressive LaTeX backslash fixing + # Escape all backslashes in string values that aren't valid JSON escapes + fixed_content = cleaned_content + + # Use regex to find and fix invalid escapes in string values + # This pattern finds backslashes in string values and escapes invalid ones + def fix_escapes_in_string(match): + """Fix escapes within a JSON string value.""" + string_content = match.group(1) + # Escape backslashes not followed by valid JSON escape chars + fixed = re.sub(r'\\(?!["\\/bfnrtu]|u[0-9a-fA-F]{4})', r'\\\\', string_content) + return f'"{fixed}"' + + # Find string values and fix them + # Pattern: "([^"\\]|\\.)*" - matches JSON string values + fixed_content = re.sub(r'"((?:[^"\\]|\\.)*)"', fix_escapes_in_string, fixed_content) + + result = json.loads(fixed_content) + log.info("Successfully fixed LaTeX backslash issues") + return result if isinstance(result, dict) else {} + except Exception as fix_error: + log.error(f"Failed to fix LaTeX escape issues: {fix_error}") + try: if "Unterminated string" in str(e): last_complete = cleaned_content.rfind('"},') diff --git a/src/utils/model_client_utils.py 
b/src/utils/model_client_utils.py index c8c2ef6..e5486f6 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -20,7 +20,7 @@ ) -MAX_TOKENS = 1024 * 30 +MAX_TOKENS = 1024 * 16 # Reduced to 16384 for gpt-4o compatibility logger = logging.getLogger(__name__) From 16765637c1298a71d0457772803aa32a0073bab6 Mon Sep 17 00:00:00 2001 From: Ali Mehrabian <69728416+Ali-Meh619@users.noreply.github.com> Date: Tue, 11 Nov 2025 10:44:51 -0800 Subject: [PATCH 2/3] Create test_json_utils.py Create Unit test for json parsing --- tests/src/test_json_utils.py | 184 +++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/src/test_json_utils.py diff --git a/tests/src/test_json_utils.py b/tests/src/test_json_utils.py new file mode 100644 index 0000000..b2f645d --- /dev/null +++ b/tests/src/test_json_utils.py @@ -0,0 +1,184 @@ +""" +Tests for the JSON utility functions related to LaTeX backslash handling. + +This module contains tests specifically for the changes made to handle LaTeX +expressions with backslashes in JSON strings. 
The main changes were: +- fix_common_json_errors: Character-by-character processing to correctly escape + LaTeX backslashes while preserving valid JSON escape sequences +- parse_llm_json_response: Fallback mechanism for "Invalid escape" errors + +The tests verify that: +- LaTeX expressions with backslashes (e.g., \\(, \\), \\[, \\], \\to, \\lim) are properly escaped +- Valid JSON escape sequences (e.g., \\n, \\t, \\", \\\\) are preserved +- The fallback mechanism correctly handles invalid escape errors +""" + +import json + +import pytest + +from src.utils.json_utils import ( + fix_common_json_errors, + parse_llm_json_response, +) + + +class TestFixCommonJsonErrorsLaTeX: + """Test cases for fix_common_json_errors function - LaTeX backslash handling.""" + + def test_latex_parentheses_escaped(self): + """Test that LaTeX parentheses are properly escaped.""" + content = '{"thought": "The equation \\(x + y\\) is important"}' + result = fix_common_json_errors(content) + # Should double-escape: \\( becomes \\\\( + assert '\\\\(x + y\\\\)' in result or '\\\\(' in result + # Verify it can be parsed + parsed = json.loads(result) + assert "(" in parsed["thought"] or "\\(" in parsed["thought"] + + def test_latex_brackets_escaped(self): + """Test that LaTeX brackets are properly escaped.""" + content = '{"thought": "The equation \\[x + y = z\\] is important"}' + result = fix_common_json_errors(content) + # Should double-escape: \\[ becomes \\\\[ + assert '\\\\[' in result or '\\[' in result + # Verify it can be parsed + parsed = json.loads(result) + assert "[" in parsed["thought"] or "\\[" in parsed["thought"] + + def test_latex_commands_escaped(self): + """Test that LaTeX commands like \\to, \\lim are properly escaped.""" + content = '{"thought": "As x \\to 0, we have \\lim f(x)"}' + result = fix_common_json_errors(content) + # Should escape \to and \lim + assert '\\to' in result or '\\\\to' in result + # Verify it can be parsed + parsed = json.loads(result) + assert "to" in 
parsed["thought"] or "\\to" in parsed["thought"] + + def test_valid_json_escapes_preserved(self): + """Test that valid JSON escape sequences are preserved.""" + # Use raw string to ensure \n and \t are literal backslash+n and backslash+t + content = r'{"text": "Line 1\nLine 2\tTabbed"}' + result = fix_common_json_errors(content) + # Valid escapes should remain (as single backslash sequences) + assert r'\n' in result or '\n' in result + assert r'\t' in result or '\t' in result + # Verify it can be parsed and produces actual newline/tab characters + parsed = json.loads(result) + assert "\n" in parsed["text"] or "\\n" in parsed["text"] + assert "\t" in parsed["text"] or "\\t" in parsed["text"] + + def test_valid_json_quote_escape_preserved(self): + """Test that escaped quotes are preserved.""" + content = '{"text": "He said \\"hello\\""}' + result = fix_common_json_errors(content) + # Escaped quotes should remain + assert '\\"' in result + # Verify it can be parsed + parsed = json.loads(result) + assert '"' in parsed["text"] + + def test_valid_json_backslash_escape_preserved(self): + """Test that escaped backslashes are preserved.""" + content = '{"path": "C:\\\\Users\\\\file.txt"}' + result = fix_common_json_errors(content) + # Escaped backslashes should remain + assert '\\\\' in result + # Verify it can be parsed + parsed = json.loads(result) + assert "\\" in parsed["path"] + + def test_latex_in_nested_string(self): + """Test LaTeX handling in nested JSON strings.""" + content = '{"outer": {"inner": "The formula \\(a + b\\) is key"}}' + result = fix_common_json_errors(content) + # Should handle LaTeX in nested strings + parsed = json.loads(result) + assert "(" in parsed["outer"]["inner"] or "\\(" in parsed["outer"]["inner"] + + def test_mixed_valid_and_invalid_escapes(self): + """Test handling of mixed valid and invalid escape sequences.""" + # Use raw string to ensure proper escape handling + content = r'{"text": "Valid\nNewline and invalid\to LaTeX"}' + result = 
fix_common_json_errors(content) + # Valid escape should remain, invalid should be escaped + assert r'\n' in result or '\n' in result + # Verify it can be parsed + parsed = json.loads(result) + # Should contain newline character or escaped newline + assert "\n" in parsed["text"] or "\\n" in parsed["text"] + + +class TestParseLlmJsonResponseLaTeX: + """Test cases for parse_llm_json_response function - LaTeX and fallback handling.""" + + def test_parse_json_with_latex(self): + """Test parsing JSON containing LaTeX expressions.""" + content = '{"thought": "The equation \\(x + y\\) is important", "answer": "42"}' + result = parse_llm_json_response(content) + assert "thought" in result + assert "answer" in result + # The thought should contain the LaTeX (possibly escaped) + assert "x + y" in result["thought"] or "(" in result["thought"] + + def test_parse_json_with_latex_complex(self): + """Test parsing complex JSON with multiple LaTeX expressions.""" + content = ( + '{"thought": "We need to solve \\[x^2 + 2x + 1 = 0\\] ' + 'and find \\lim_{x \\to 0} f(x)", "answer": "x = -1"}' + ) + result = parse_llm_json_response(content) + assert "thought" in result + assert "answer" in result + assert "x = -1" in result["answer"] + + def test_parse_json_fallback_on_invalid_escape(self): + """Test that fallback mechanism works for invalid escape errors.""" + # This should trigger the fallback mechanism + content = '{"text": "Invalid escape\\to here"}' + result = parse_llm_json_response(content) + assert "text" in result + # Should successfully parse after fallback fix + + def test_parse_json_with_latex_in_multiple_fields(self): + """Test parsing JSON with LaTeX in multiple fields.""" + content = ( + '{"thought": "Consider \\(a + b\\)", ' + '"solution": "The answer is \\[x = 5\\]", ' + '"reasoning": "Using \\lim_{x \\to 0}"}' + ) + result = parse_llm_json_response(content) + assert "thought" in result + assert "solution" in result + assert "reasoning" in result + + +class 
TestLaTeXEdgeCases: + """Test edge cases for LaTeX backslash handling.""" + + def test_latex_at_string_boundary(self): + """Test LaTeX expressions at string boundaries.""" + content = '{"text": "\\(start\\) and \\(end\\)"}' + result = parse_llm_json_response(content) + assert "text" in result + + def test_latex_with_numbers(self): + """Test LaTeX expressions containing numbers.""" + content = '{"formula": "\\[x^2 + 2x + 1\\]"}' + result = parse_llm_json_response(content) + assert "formula" in result + + def test_escaped_quotes_with_latex(self): + """Test combination of escaped quotes and LaTeX.""" + content = '{"text": "He said \\"Consider \\(x + y\\)\\""}' + result = parse_llm_json_response(content) + assert "text" in result + assert '"' in result["text"] or "Consider" in result["text"] + + def test_newline_characters_in_latex(self): + """Test newline characters near LaTeX expressions.""" + content = '{"text": "Line 1\\n\\(x + y\\)\\nLine 2"}' + result = parse_llm_json_response(content) + assert "text" in result + assert "\n" in result["text"] From ca651c748b92500cf0458160251ed138c0d4022d Mon Sep 17 00:00:00 2001 From: Ali Mehrabian <69728416+Ali-Meh619@users.noreply.github.com> Date: Tue, 11 Nov 2025 10:47:56 -0800 Subject: [PATCH 3/3] Update json_utils.py Removed all trailing whitespace from json_utils.py --- src/utils/json_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 435d087..db962bc 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -44,16 +44,16 @@ def fix_common_json_errors(content: str) -> str: """Fix common JSON syntax errors.""" content = re.sub(r':\s*=\s*"', ':"', content) content = re.sub(r'(\w+):\s*"', r'"\1":"', content) - + # Fix LaTeX backslashes: escape backslashes that are not part of valid JSON escape sequences # Valid JSON escapes: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX # LaTeX uses \(, \), \[, \], \epsilon, \delta, etc. 
which need to be escaped as \\(, \\), etc. - + # Process the content character by character to properly handle string boundaries result = [] i = 0 in_string = False - + while i < len(content): char = content[i] @@ -116,7 +116,7 @@ def fix_common_json_errors(content: str) -> str: result.append(hex_part) i += 6 continue - + # Invalid escape sequence (like LaTeX \(, \), \[, \], \epsilon, \to, etc.) # Double-escape the backslash result.append('\\\\') @@ -132,9 +132,9 @@ def fix_common_json_errors(content: str) -> str: # Regular character result.append(char) i += 1 - + content = ''.join(result) - + return re.sub(r",(\s*[}\]])", r"\1", content) @@ -164,7 +164,7 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: # Apply more aggressive LaTeX backslash fixing # Escape all backslashes in string values that aren't valid JSON escapes fixed_content = cleaned_content - + # Use regex to find and fix invalid escapes in string values # This pattern finds backslashes in string values and escapes invalid ones def fix_escapes_in_string(match):