Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 118 additions & 1 deletion src/utils/json_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,97 @@ def fix_common_json_errors(content: str) -> str:
"""Fix common JSON syntax errors."""
content = re.sub(r':\s*=\s*"', ':"', content)
content = re.sub(r'(\w+):\s*"', r'"\1":"', content)
content = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content)

# Fix LaTeX backslashes: escape backslashes that are not part of valid JSON escape sequences
# Valid JSON escapes: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX
# LaTeX uses \(, \), \[, \], \epsilon, \delta, etc. which need to be escaped as \\(, \\), etc.

# Process the content character by character to properly handle string boundaries
result = []
i = 0
in_string = False

while i < len(content):
char = content[i]

# Track when we enter/exit string values
if char == '"':
# Check if this quote is escaped by counting backslashes before it
# An escaped quote has an odd number of backslashes before it
backslash_count = 0
j = i - 1
while j >= 0 and content[j] == '\\':
backslash_count += 1
j -= 1

# If odd number of backslashes, the quote is escaped (part of string content)
if backslash_count % 2 == 1:
result.append(char)
i += 1
continue

# This is a real quote - toggle string state
in_string = not in_string
result.append(char)
i += 1
continue

# Handle backslashes inside string values
if char == '\\' and in_string:
if i + 1 < len(content):
next_char = content[i + 1]
# Check for valid JSON escape sequences
# For single-character escapes: ", \, /, b, f, n, r, t
# We need to ensure they're not part of a longer sequence (e.g., \to should not match \t)
if next_char in '"\\/':
# Always valid single-char escapes
result.append(char)
result.append(next_char)
i += 2
continue
elif next_char in 'bfnrt':
# Check if this is a complete escape (not part of longer sequence)
# Valid if followed by non-alphanumeric or end of string
if i + 2 >= len(content) or not (content[i + 2].isalnum() or content[i + 2] == '_'):
# Complete escape sequence (e.g., \t, \n, \r, \b, \f)
result.append(char)
result.append(next_char)
i += 2
continue
# Otherwise it's part of a longer sequence (e.g., \to, \lim) - escape it
result.append('\\\\')
result.append(next_char)
i += 2
continue
elif next_char == 'u' and i + 5 < len(content):
# Check for \uXXXX pattern
hex_part = content[i + 2:i + 6]
if all(c in '0123456789abcdefABCDEF' for c in hex_part):
# Valid \uXXXX escape
result.append(char)
result.append(next_char)
result.append(hex_part)
i += 6
continue

# Invalid escape sequence (like LaTeX \(, \), \[, \], \epsilon, \to, etc.)
# Double-escape the backslash
result.append('\\\\')
result.append(next_char)
i += 2
continue
else:
# Backslash at end of content - escape it
result.append('\\\\')
i += 1
continue

# Regular character
result.append(char)
i += 1

content = ''.join(result)

return re.sub(r",(\s*[}\]])", r"\1", content)


Expand All @@ -67,6 +157,33 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]:
log.error(f"Failed to parse JSON response: {e}")
log.error(f"Content length: {len(cleaned_content)} characters")

# Try to fix LaTeX backslash issues if the error is about invalid escape
if "Invalid \\escape" in str(e) or "Invalid escape" in str(e):
try:
log.warning("Attempting to fix LaTeX backslash escape issues")
# Apply more aggressive LaTeX backslash fixing
# Escape all backslashes in string values that aren't valid JSON escapes
fixed_content = cleaned_content

# Use regex to find and fix invalid escapes in string values
# This pattern finds backslashes in string values and escapes invalid ones
def fix_escapes_in_string(match):
"""Fix escapes within a JSON string value."""
string_content = match.group(1)
# Escape backslashes not followed by valid JSON escape chars
fixed = re.sub(r'\\(?!["\\/bfnrtu]|u[0-9a-fA-F]{4})', r'\\\\', string_content)
return f'"{fixed}"'

# Find string values and fix them
# Pattern: "([^"\\]|\\.)*" - matches JSON string values
fixed_content = re.sub(r'"((?:[^"\\]|\\.)*)"', fix_escapes_in_string, fixed_content)

result = json.loads(fixed_content)
log.info("Successfully fixed LaTeX backslash issues")
return result if isinstance(result, dict) else {}
except Exception as fix_error:
log.error(f"Failed to fix LaTeX escape issues: {fix_error}")

try:
if "Unterminated string" in str(e):
last_complete = cleaned_content.rfind('"},')
Expand Down
2 changes: 1 addition & 1 deletion src/utils/model_client_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
)


MAX_TOKENS = 1024 * 30
MAX_TOKENS = 1024 * 16 # Reduced to 16384 for gpt-4o compatibility

logger = logging.getLogger(__name__)

Expand Down
Loading