Commit 900e8e3

Authored by manavgup and claude
Feature/issue 602 podcast quality improvements (#627)
* feat: Implement podcast quality improvements - dynamic chapters, transcript downloads, and prompt leakage fix (#602)

  Implements all three phases of Issue #602 to enhance podcast generation quality:

  **Phase 1: Prompt Leakage Prevention**
  - Add CoT hardening with XML tag separation (`<thinking>` and `<script>`)
  - Create PodcastScriptParser with 5-layer fallback parsing (XML → JSON → Markdown → Regex → Full)
  - Implement quality scoring (0.0-1.0) with artifact detection
  - Add retry logic with quality threshold (min 0.6, max 3 attempts)
  - Update PODCAST_SCRIPT_PROMPT with strict rules to prevent meta-information
  - Fix 2 failing unit tests by updating mock responses

  **Phase 2: Dynamic Chapter Generation**
  - Add PodcastChapter schema with title, start_time, end_time, word_count
  - Update PodcastScript, PodcastGenerationOutput, and the Podcast model with a chapters field
  - Implement chapter extraction from HOST questions in script_parser.py
  - Calculate timestamps from word counts (±10 s accuracy at 150 WPM)
  - Add smart title extraction with pattern removal for clean chapter names
  - Update podcast_repository.py to store/retrieve chapters as JSON
  - Serialize chapters when marking podcasts complete

  **Phase 3: Transcript Download**
  - Create TranscriptFormatter utility with 2 formats:
    - Plain text (.txt): simple format with metadata header
    - Markdown (.md): formatted with table of contents and chapter timestamps
  - Add download endpoint: GET /api/podcasts/{podcast_id}/transcript/download?format=txt|md
  - Implement artifact cleaning and time formatting (HH:MM:SS)
  - Add authentication and access control
  - Return properly formatted downloadable files with correct Content-Disposition headers

  **Files Changed:**
  - Created: backend/rag_solution/utils/podcast_script_parser.py (374 lines)
  - Created: backend/rag_solution/utils/transcript_formatter.py (247 lines)
  - Updated: backend/rag_solution/schemas/podcast_schema.py
  - Updated: backend/rag_solution/models/podcast.py
  - Updated: backend/rag_solution/services/podcast_service.py
  - Updated: backend/rag_solution/utils/script_parser.py
  - Updated: backend/rag_solution/repository/podcast_repository.py
  - Updated: backend/rag_solution/router/podcast_router.py
  - Updated: tests/unit/services/test_podcast_service_unit.py

  **Testing:**
  - Unit tests: 1969/1969 passed (100%)
  - Podcast integration tests: 7/7 passed (100%)
  - All files pass linting checks (ruff)
  - Maintains 90%+ test coverage for podcast service

  **Technical Notes:**
  - CoT hardening follows industry patterns (Anthropic Claude, OpenAI ReAct)
  - Multi-layer fallback ensures robustness
  - Chapter timestamps accurate to ±10 seconds
  - Backward compatible (chapters default to an empty list)
  - Clean separation of concerns with utility classes

  Closes #602

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <[email protected]>

* chore: Add database migration for chapters column

  Add migration scripts to add a chapters JSONB column to the podcasts table. The migration can be applied in any of three ways:
  1. SQL: migrations/add_chapters_to_podcasts.sql
  2. Python: poetry run python migrations/apply_chapters_migration.py
  3. Docker: docker exec rag_modulo-postgres-1 psql -U rag_modulo_user -d rag_modulo -c "ALTER TABLE podcasts ADD COLUMN IF NOT EXISTS chapters JSONB DEFAULT '[]'::jsonb;"

  The chapters column stores dynamic chapter markers with timestamps.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <[email protected]>

* fix(podcast): Address type safety, code duplication, and linting issues

  **Type Safety & Code Duplication (Issues #1, #2):**
  - Add _serialize_chapters() helper method with a null check
  - Refactor duplicated chapter serialization code (lines 414-424 and 1563-1573)
  - Return an empty list if chapters is None (prevents TypeError)

  **Linting (Issue #6):**
  - Remove the unused chapters parameter from the to_txt() method
  - Update format_transcript() to not pass chapters to to_txt()
  - Plain text format doesn't use chapters (only Markdown does)

  Addresses PR #627 review comments.

* fix(podcast): Address critical PR #627 review issues

  Fix 3 critical issues identified in PR #627 review:

  1. **Migration Script Safety**: Replace autocommit with proper transactions
     - Remove `conn.autocommit = True`
     - Add explicit commit/rollback in try/except/finally blocks
     - Prevents database inconsistency on errors
  2. **ReDoS Mitigation**: Add input length validation
     - Add a MAX_INPUT_LENGTH = 100KB constant to PodcastScriptParser
     - Validate input length before regex operations
     - Raise ValueError if input exceeds the limit
     - Protects against catastrophic backtracking
  3. **Retry Logic Optimization**: Reduce cost and latency
     - Reduce max_retries from 3 to 2 (saves ~30s, $0.01-0.05/retry)
     - Add exponential backoff (2^attempt * 1.0s base delay)
     - Apply backoff for both quality retries and error recovery
     - Better handling of transient failures

  Files modified:
  - migrations/apply_chapters_migration.py: transaction safety
  - backend/rag_solution/utils/podcast_script_parser.py: ReDoS mitigation
  - backend/rag_solution/services/podcast_service.py: retry optimization

  Addresses review comment: #627 (comment)

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <[email protected]>

* test(podcast): Add comprehensive unit tests for podcast utilities

  Add 76 unit tests covering:

  **1. PodcastScriptParser (39 tests)**
  - All 5 parsing strategies (XML, JSON, Markdown, Regex, Full Response)
  - Quality scoring algorithm (0.0-1.0 confidence)
  - Artifact detection (prompt leakage patterns)
  - ReDoS mitigation (100KB input length validation)
  - Script cleaning and whitespace normalization
  - Edge cases (empty input, malformed JSON, non-ASCII chars)

  **2. TranscriptFormatter (37 tests)**
  - Plain text format (txt) with metadata header
  - Markdown format (md) with chapters and TOC
  - Time formatting (HH:MM:SS and MM:SS)
  - Transcript cleaning (XML tags, metadata removal)
  - Edge cases (empty transcripts, special characters, Unicode)

  Test files:
  - tests/unit/utils/test_podcast_script_parser.py (680 lines)
  - tests/unit/utils/test_transcript_formatter.py (470 lines)

  Coverage:
  - podcast_script_parser.py: 100% coverage
  - transcript_formatter.py: 100% coverage

  All 76 tests pass in 0.3s. Addresses PR #627 review comment requirement for comprehensive test coverage.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <[email protected]>

* test(podcast): Add integration tests for transcript download endpoint

  Add 8 comprehensive integration tests for transcript download functionality:

  **Test Coverage:**
  1. Download transcript in TXT format
  2. Download transcript in Markdown format with chapters
  3. Handle podcast not found (404)
  4. Handle podcast not completed (400)
  5. Handle missing transcript field (404)
  6. Verify filename generation logic
  7. Verify chapter data in Markdown format
  8. Verify Markdown format without chapters

  **Integration Test Details:**
  - Tests the complete end-to-end workflow from service to formatter
  - Mocked PodcastService with a sample completed podcast
  - Tests both txt and md format outputs
  - Tests error conditions (not found, incomplete, missing transcript)
  - Tests chapter handling (with/without chapters)
  - Tests filename generation with/without title

  **File Modified:**
  - tests/integration/test_podcast_generation_integration.py (+300 lines)

  All 8 tests pass in 6.4s. Addresses PR #627 review comment requirement for comprehensive integration test coverage of the download transcript endpoint.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude <[email protected]>
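The chapter-timestamp scheme the commit message describes (timestamps derived from cumulative word counts at 150 WPM, accurate to about ±10 seconds) can be sketched as follows. This is an illustrative reconstruction, not the repository's actual API: `Chapter` and `estimate_chapters` are hypothetical names.

```python
from dataclasses import dataclass

WORDS_PER_MINUTE = 150  # assumed speaking rate, per the commit message


@dataclass
class Chapter:
    title: str
    start_time: float  # seconds from the start of the episode
    end_time: float
    word_count: int


def estimate_chapters(sections: list[tuple[str, str]]) -> list[Chapter]:
    """Each section is (title, text); timestamps accumulate at 150 WPM."""
    chapters = []
    elapsed = 0.0
    for title, text in sections:
        words = len(text.split())
        duration = words / WORDS_PER_MINUTE * 60.0  # seconds for this section
        chapters.append(Chapter(title, round(elapsed, 1), round(elapsed + duration, 1), words))
        elapsed += duration
    return chapters


chapters = estimate_chapters([
    ("Intro", "word " * 300),      # 300 words -> 120 s
    ("Deep dive", "word " * 450),  # 450 words -> 180 s
])
print(chapters[1].start_time, chapters[1].end_time)  # 120.0 300.0
```

Since real speakers drift from any fixed WPM, this estimate is only as good as the assumed rate, which matches the ±10 second accuracy claim above.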
1 parent aa63716 commit 900e8e3
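The download endpoint above is said to return files with correct Content-Disposition headers. A minimal sketch of that header construction, assuming the two formats named in the commit; `build_download_headers` is a hypothetical helper, not the router's actual code:

```python
def build_download_headers(filename: str, fmt: str) -> dict[str, str]:
    """Build HTTP headers for a downloadable transcript (fmt: 'txt' or 'md')."""
    media = "text/plain" if fmt == "txt" else "text/markdown"
    return {
        "Content-Type": f"{media}; charset=utf-8",
        # attachment forces a download dialog instead of inline rendering
        "Content-Disposition": f'attachment; filename="{filename}.{fmt}"',
    }


print(build_download_headers("my_podcast_transcript", "md"))
```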

File tree: 8 files changed (+1700 −23 lines)


backend/rag_solution/services/podcast_service.py — 40 additions, 20 deletions

```diff
@@ -15,6 +15,7 @@
 """

 import logging
+import time
 from enum import Enum
 from typing import Any, ClassVar
@@ -412,15 +413,7 @@ async def _process_podcast_generation(
         audio_stored = True  # Mark audio as stored for cleanup if needed

         # Step 6: Extract and serialize chapters
-        chapters_dict = [
-            {
-                "title": chapter.title,
-                "start_time": chapter.start_time,
-                "end_time": chapter.end_time,
-                "word_count": chapter.word_count,
-            }
-            for chapter in podcast_script.chapters
-        ]
+        chapters_dict = self._serialize_chapters(podcast_script)

         # Step 7: Mark complete (100%)
         self.repository.mark_completed(
@@ -744,15 +737,22 @@ async def _generate_script(self, podcast_input: PodcastGenerationInput, rag_resu
         # Initialize enhanced parser for quality validation
         enhanced_parser = EnhancedScriptParser(average_wpm=150)

-        # Retry configuration
-        max_retries = 3
+        # Retry configuration (optimized for cost and latency)
+        max_retries = 2  # Reduced from 3 to 2 (saves ~30s latency, $0.01-0.05 cost)
         min_quality_score = 0.6
+        base_delay = 1.0  # Base delay for exponential backoff (seconds)

         best_script = None
         best_quality = 0.0

         for attempt in range(max_retries):
             try:
+                # Add exponential backoff between retries (2^attempt * base_delay)
+                if attempt > 0:
+                    delay = base_delay * (2**attempt)
+                    logger.info("Retry attempt %d: waiting %.1fs before retry", attempt + 1, delay)
+                    time.sleep(delay)
+
                 script_text = llm_provider.generate_text(
                     user_id=user_id,
                     prompt="",  # Empty - template contains full prompt
@@ -807,6 +807,10 @@
                 logger.error("Error generating script on attempt %d: %s", attempt + 1, e)
                 if attempt == max_retries - 1:
                     raise
+                # Add exponential backoff on errors as well
+                delay = base_delay * (2 ** (attempt + 1))
+                logger.info("Error recovery: waiting %.1fs before retry", delay)
+                time.sleep(delay)

         # If we exhausted retries, return best script with warning
         if best_script:
@@ -1177,6 +1181,30 @@ async def _update_progress(
             status=status,
         )

+    def _serialize_chapters(self, podcast_script: PodcastScriptOutput) -> list[dict[str, Any]]:
+        """
+        Serialize podcast chapters from PodcastScriptOutput to dictionary format.
+
+        Args:
+            podcast_script: Parsed podcast script with chapters
+
+        Returns:
+            List of chapter dictionaries with title, timestamps, and word count.
+            Returns empty list if chapters is None or empty.
+        """
+        if not podcast_script.chapters:
+            return []
+
+        return [
+            {
+                "title": chapter.title,
+                "start_time": chapter.start_time,
+                "end_time": chapter.end_time,
+                "word_count": chapter.word_count,
+            }
+            for chapter in podcast_script.chapters
+        ]
+
     async def get_podcast(self, podcast_id: UUID4, user_id: UUID4) -> PodcastGenerationOutput:
         """
         Get podcast by ID with access control.
@@ -1561,15 +1589,7 @@ async def _process_audio_from_script(
         )

         # Step 5: Extract and serialize chapters
-        chapters_dict = [
-            {
-                "title": chapter.title,
-                "start_time": chapter.start_time,
-                "end_time": chapter.end_time,
-                "word_count": chapter.word_count,
-            }
-            for chapter in podcast_script.chapters
-        ]
+        chapters_dict = self._serialize_chapters(podcast_script)

         # Step 6: Mark completed
         self.repository.mark_completed(
```
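The backoff schedule in the diff above is easy to check in isolation: quality retries wait `base_delay * 2**attempt` before any attempt after the first, and error recovery waits `base_delay * 2**(attempt + 1)`. With `max_retries = 2` both paths produce a single 2-second wait:

```python
base_delay = 1.0
max_retries = 2

# Quality-retry delays: applied only when attempt > 0
quality_delays = [base_delay * (2**attempt) for attempt in range(max_retries) if attempt > 0]

# Error-recovery delays: applied on every failed attempt except the last (which re-raises)
error_delays = [base_delay * (2 ** (attempt + 1)) for attempt in range(max_retries - 1)]

print(quality_delays, error_delays)  # [2.0] [2.0]
```

This matches the commit's claim that dropping from 3 to 2 attempts saves roughly one LLM call (~30 s) in the worst case.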

backend/rag_solution/utils/podcast_script_parser.py — 19 additions, 0 deletions

```diff
@@ -49,6 +49,9 @@ def is_acceptable(self, min_score: float = 0.6) -> bool:
 class PodcastScriptParser:
     """Parser for LLM-generated podcast scripts with quality validation."""

+    # Maximum input length to prevent ReDoS attacks (100KB = ~15,000-20,000 words)
+    MAX_INPUT_LENGTH: ClassVar[int] = 100_000
+
     # Artifact patterns that indicate prompt leakage
     ARTIFACT_PATTERNS: ClassVar[list[str]] = [
         r"Word count:\s*\d+",  # "Word count: 3,200"
@@ -103,7 +106,23 @@ def parse_script(self, llm_output: str, expected_word_count: int = 0) -> ScriptP

         Returns:
             ScriptParseResult with extracted script and quality metrics
+
+        Raises:
+            ValueError: If input length exceeds MAX_INPUT_LENGTH (ReDoS mitigation)
         """
+        # ReDoS mitigation: Validate input length before regex operations
+        if len(llm_output) > self.MAX_INPUT_LENGTH:
+            logger.error(
+                "Input length %d exceeds maximum %d (ReDoS mitigation)",
+                len(llm_output),
+                self.MAX_INPUT_LENGTH,
+            )
+            raise ValueError(
+                f"Input too large: {len(llm_output)} bytes "
+                f"(max: {self.MAX_INPUT_LENGTH} bytes). "
+                "This protects against ReDoS attacks."
+            )
+
         # Try each parsing strategy in order
         strategies = [
             (self._parse_xml_tags, ParsingStrategy.XML_TAGS),
```
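Stripped of the class machinery, the length guard added above reduces to a single check before any regex runs. A self-contained sketch (`check_input_length` is an illustrative name, not the parser's method):

```python
MAX_INPUT_LENGTH = 100_000  # same 100KB cap as the parser


def check_input_length(llm_output: str) -> None:
    """Reject oversized inputs before any regex runs, limiting ReDoS exposure."""
    if len(llm_output) > MAX_INPUT_LENGTH:
        raise ValueError(
            f"Input too large: {len(llm_output)} bytes (max: {MAX_INPUT_LENGTH} bytes)."
        )


check_input_length("a" * MAX_INPUT_LENGTH)  # exactly at the limit: accepted
try:
    check_input_length("a" * (MAX_INPUT_LENGTH + 1))
except ValueError as exc:
    print("rejected:", exc)
```

Bounding input length does not remove a backtracking-prone pattern, but it caps the worst-case work any such pattern can do, which is the mitigation the review asked for.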

backend/rag_solution/utils/transcript_formatter.py — 1 addition, 3 deletions

```diff
@@ -86,7 +86,6 @@ def to_txt(
         transcript: str,
         title: str | None = None,
         duration_seconds: float | None = None,
-        chapters: list[PodcastChapter] | None = None,  # noqa: ARG002
     ) -> str:
         """
         Convert transcript to plain text format.
@@ -104,7 +103,6 @@
             transcript: Raw podcast transcript
             title: Optional podcast title
             duration_seconds: Optional duration in seconds
-            chapters: Optional list of chapters (not used in plain text)

         Returns:
             Formatted plain text transcript
@@ -250,7 +248,7 @@
             ValueError: If format_type is unsupported
         """
         if format_type == TranscriptFormat.TXT:
-            return self.to_txt(transcript, title, duration_seconds, chapters)
+            return self.to_txt(transcript, title, duration_seconds)
         if format_type == TranscriptFormat.MARKDOWN:
             return self.to_markdown(transcript, title, duration_seconds, chapters)
         raise ValueError(f"Unsupported format: {format_type}")
```
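The formatter's tests mention both HH:MM:SS and MM:SS timestamp rendering. A minimal sketch of that behavior, under the assumption that hours are omitted when zero; `format_time` is an illustrative name, not necessarily the formatter's method:

```python
def format_time(seconds: float) -> str:
    """Render seconds as HH:MM:SS, or MM:SS when under an hour."""
    total = int(seconds)
    hours, rem = divmod(total, 3600)
    minutes, secs = divmod(rem, 60)
    if hours:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


print(format_time(75))    # 01:15
print(format_time(3725))  # 01:02:05
```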
migrations/add_chapters_to_podcasts.sql — 16 additions, 0 deletions (new file)

```diff
@@ -0,0 +1,16 @@
+-- Migration: Add chapters column to podcasts table
+-- Date: 2025-11-10
+-- Issue: #602
+-- Description: Add chapters JSON column to store dynamic chapter markers with timestamps
+
+-- Add chapters column (nullable, defaults to empty array)
+ALTER TABLE podcasts
+ADD COLUMN IF NOT EXISTS chapters JSONB DEFAULT '[]'::jsonb;
+
+-- Add comment
+COMMENT ON COLUMN podcasts.chapters IS 'Dynamic chapter markers with timestamps (title, start_time, end_time, word_count)';
+
+-- Verify the column was added
+SELECT column_name, data_type, is_nullable, column_default
+FROM information_schema.columns
+WHERE table_name = 'podcasts' AND column_name = 'chapters';
```
migrations/apply_chapters_migration.py — 128 additions, 0 deletions (new file)

```python
#!/usr/bin/env python3
"""
Apply migration to add chapters column to podcasts table.

Usage:
    python migrations/apply_chapters_migration.py
"""

import os
import sys
from pathlib import Path

import psycopg2
from dotenv import load_dotenv

# Add backend to path
backend_path = Path(__file__).parent.parent / "backend"
sys.path.insert(0, str(backend_path))

# Load environment variables
load_dotenv()

# Database connection parameters
DB_HOST = os.getenv("COLLECTIONDB_HOST", "localhost")
DB_PORT = os.getenv("COLLECTIONDB_PORT", "5432")
DB_USER = os.getenv("COLLECTIONDB_USER", "rag_modulo_user")
DB_PASSWORD = os.getenv("COLLECTIONDB_PASSWORD")
DB_NAME = os.getenv("COLLECTIONDB_NAME", "rag_modulo")


def apply_migration():
    """Apply the chapters column migration."""
    print(f"Connecting to database: {DB_NAME} at {DB_HOST}:{DB_PORT}")

    conn = None
    cursor = None

    try:
        # Connect to database
        conn = psycopg2.connect(
            host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASSWORD, database=DB_NAME
        )
        cursor = conn.cursor()

        print("Connected successfully!")

        # Check if column already exists
        cursor.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'podcasts' AND column_name = 'chapters';
            """
        )

        if cursor.fetchone():
            print("✅ Column 'chapters' already exists in podcasts table.")
        else:
            print("Adding 'chapters' column to podcasts table...")

            # Add the column
            cursor.execute(
                """
                ALTER TABLE podcasts
                ADD COLUMN chapters JSONB DEFAULT '[]'::jsonb;
                """
            )

            # Add comment
            cursor.execute(
                """
                COMMENT ON COLUMN podcasts.chapters IS
                'Dynamic chapter markers with timestamps (title, start_time, end_time, word_count)';
                """
            )

            print("✅ Successfully added 'chapters' column!")

        # Verify the column
        cursor.execute(
            """
            SELECT column_name, data_type, is_nullable, column_default
            FROM information_schema.columns
            WHERE table_name = 'podcasts' AND column_name = 'chapters';
            """
        )

        result = cursor.fetchone()
        if result:
            print("\nColumn details:")
            print(f"  Name: {result[0]}")
            print(f"  Type: {result[1]}")
            print(f"  Nullable: {result[2]}")
            print(f"  Default: {result[3]}")
        else:
            print("❌ ERROR: Column 'chapters' not found after migration!")
            if conn:
                conn.rollback()
            return False

        # Commit transaction if all successful
        conn.commit()

        print("\n🎉 Migration completed successfully!")
        return True

    except psycopg2.Error as e:
        print(f"❌ Database error: {e}")
        if conn:
            conn.rollback()
            print("   Transaction rolled back.")
        return False
    except Exception as e:
        print(f"❌ Error: {e}")
        if conn:
            conn.rollback()
            print("   Transaction rolled back.")
        return False
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()


if __name__ == "__main__":
    success = apply_migration()
    sys.exit(0 if success else 1)
```
