From b8a5748a6e472e02119e505a7d1c60a317ef53d2 Mon Sep 17 00:00:00 2001
From: Chien Yuan Chang
Date: Wed, 19 Nov 2025 14:21:38 -0800
Subject: [PATCH] docs: review notebooks/content_extraction.ipynb

---
 notebooks/content_extraction.ipynb | 103 ++++++++++++++---------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/notebooks/content_extraction.ipynb b/notebooks/content_extraction.ipynb
index dc69346..565a397 100644
--- a/notebooks/content_extraction.ipynb
+++ b/notebooks/content_extraction.ipynb
@@ -19,7 +19,7 @@
 "metadata": {},
 "source": [
 "## Prerequisites\n",
- "1. Ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n",
+ "1. Please ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n",
 "2. Install the required packages to run this sample."
 ]
 },
@@ -40,12 +40,12 @@
 "\n",
 "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class that provides functions to interact with the Content Understanding API. Prior to the official release of the Content Understanding SDK, it serves as a lightweight SDK.\n",
 ">\n",
- "> Fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the details from your Azure AI Service.\n",
+ "> Please fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the details from your Azure AI Service.\n",
 "\n",
 "> āš ļø Important:\n",
- "You must update the code below to use your preferred Azure authentication method.\n",
+ "Please update the code below to use your preferred Azure authentication method.\n",
 "Look for the `# IMPORTANT` comments in the code and modify those sections accordingly.\n",
- "Skipping this step may cause the sample to not run correctly.\n",
+ "Skipping this step may cause the sample not to run correctly.\n",
 "\n",
 "> āš ļø Note: While using a subscription key is supported, it is strongly recommended to use a token provider with Azure Active Directory (AAD) for enhanced security in production environments."
 ]
 },
@@ -74,7 +74,7 @@
 "\n",
 "# For authentication, you can use either token-based auth or subscription key; only one is required\n",
 "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n",
- "# IMPORTANT: Replace with your actual subscription key or set it in your \".env\" file if not using token authentication\n",
+ "# IMPORTANT: Please replace with your actual subscription key or set it in your \".env\" file if not using token authentication\n",
 "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n",
 "API_VERSION = \"2025-11-01\"\n",
 "\n",
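The cell above only reads the endpoint and key; as a rough sketch of how the recommended AAD token-provider path might be wired into the utility client — where the `AzureContentUnderstandingClient` keyword arguments and import path are assumptions, so check `../python/content_understanding_client.py` for the real constructor signature:

```python
# Sketch only: constructor arguments below are assumptions, not the
# confirmed API of content_understanding_client.py.
import os

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv

from content_understanding_client import AzureContentUnderstandingClient  # assumed import path

load_dotenv()  # reads AZURE_AI_ENDPOINT / AZURE_AI_API_KEY from a local .env file

# AAD token provider scoped to Cognitive Services (the recommended production path).
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)

client = AzureContentUnderstandingClient(
    endpoint=os.getenv("AZURE_AI_ENDPOINT"),
    api_version="2025-11-01",
    token_provider=token_provider,                   # preferred: AAD
    subscription_key=os.getenv("AZURE_AI_API_KEY"),  # fallback: key auth
)
```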
@@ -113,20 +113,20 @@
 "source": [
 "## Configure Model Deployments for Prebuilt Analyzers\n",
 "\n",
- "> **šŸ’” Note:** This step is only required **once per Azure Content Understanding resource**, unless the GPT deployment has been changed. You can skip this section if:\n",
+ "> **šŸ’” Note:** This step is required only **once per Azure Content Understanding resource**, unless the GPT deployment has been changed. You can skip this section if:\n",
 "> - This configuration has already been run once for your resource, or\n",
- "> - Your administrator has already configured the model deployments for you\n",
+ "> - Your administrator has already configured the model deployments for you.\n",
 "\n",
- "Before using prebuilt analyzers, you need to configure the default model deployment mappings. This tells Content Understanding which model deployments to use.\n",
+ "Before using prebuilt analyzers, please configure the default model deployment mappings. This tells Content Understanding which model deployments to use.\n",
 "\n",
 "**Model Requirements:**\n",
- "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`)\n",
- "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`)\n",
- "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings\n",
+ "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`).\n",
+ "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`).\n",
+ "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings.\n",
 "\n",
 "**Prerequisites:**\n",
- "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry (see README.md for instructions)\n",
- "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names"
+ "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry (see README.md for instructions).\n",
+ "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names."
 ]
 },
@@ -135,7 +135,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# Get model deployment names from environment variables\n",
 "# Get model deployment names from environment variables\n",
 "GPT_4_1_DEPLOYMENT = os.getenv(\"GPT_4_1_DEPLOYMENT\")\n",
 "GPT_4_1_MINI_DEPLOYMENT = os.getenv(\"GPT_4_1_MINI_DEPLOYMENT\")\n",
 "TEXT_EMBEDDING_3_LARGE_DEPLOYMENT = os.getenv(\"TEXT_EMBEDDING_3_LARGE_DEPLOYMENT\")\n",
@@ -161,7 +160,7 @@
 "    print(\"   GPT_4_1_DEPLOYMENT=\")\n",
 "    print(\"   GPT_4_1_MINI_DEPLOYMENT=\")\n",
 "    print(\"   TEXT_EMBEDDING_3_LARGE_DEPLOYMENT=\")\n",
- "    print(\"   3. Restart the kernel and run this cell again\")\n",
+ "    print(\"   3. Restart the kernel and run this cell again.\")\n",
 "else:\n",
 "    print(f\"šŸ“‹ Configuring default model deployments...\")\n",
 "    print(f\"   GPT-4.1 deployment: {GPT_4_1_DEPLOYMENT}\")\n",
@@ -183,8 +182,8 @@
 "    except Exception as e:\n",
 "        print(f\"āŒ Failed to configure defaults: {e}\")\n",
 "        print(f\"   This may happen if:\")\n",
- "        print(f\"   - One or more deployment names don't exist in your Azure AI Foundry project\")\n",
- "        print(f\"   - You don't have permission to update defaults\")\n",
+ "        print(f\"   - One or more deployment names don't exist in your Azure AI Foundry project.\")\n",
+ "        print(f\"   - You don't have permission to update defaults.\")\n",
 "        raise"
 ]
 },
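The configuration cell is only partially visible in the hunks above. As a heavily hedged sketch of the flow it documents — read the three deployment names, fail fast if any is missing, then register them as resource-wide defaults — where `client.update_analyzer_defaults` is a hypothetical stand-in for whatever method the utility client actually exposes:

```python
import os

# The three deployments described under "Model Requirements" above.
required = {
    "GPT_4_1_DEPLOYMENT": os.getenv("GPT_4_1_DEPLOYMENT"),
    "GPT_4_1_MINI_DEPLOYMENT": os.getenv("GPT_4_1_MINI_DEPLOYMENT"),
    "TEXT_EMBEDDING_3_LARGE_DEPLOYMENT": os.getenv("TEXT_EMBEDDING_3_LARGE_DEPLOYMENT"),
}

# Fail fast with an actionable message, mirroring the checks in the cell above.
missing = [name for name, value in required.items() if not value]
if missing:
    raise RuntimeError(
        f"Missing in .env: {', '.join(missing)}. "
        "Add the deployment names and restart the kernel."
    )

# Hypothetical method name: registers the mappings once per resource.
client.update_analyzer_defaults(
    gpt_4_1=required["GPT_4_1_DEPLOYMENT"],
    gpt_4_1_mini=required["GPT_4_1_MINI_DEPLOYMENT"],
    text_embedding_3_large=required["TEXT_EMBEDDING_3_LARGE_DEPLOYMENT"],
)
```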
@@ -198,12 +197,12 @@
 "\n",
 "Key capabilities include:\n",
 "1. **Content Analysis:** Extracts text (printed and handwritten), selection marks, barcodes (12+ types), mathematical formulas (LaTeX), hyperlinks, and annotations.\n",
- "2. **Figure Analysis:** Generates descriptions for images/charts/diagrams, converts charts to Chart.js syntax, and diagrams to Mermaid.js syntax.\n",
+ "2. **Figure Analysis:** Generates descriptions for images, charts, and diagrams; converts charts to Chart.js syntax and diagrams to Mermaid.js syntax.\n",
 "3. **Structure Analysis:** Identifies paragraphs with contextual roles (title, section heading, page header/footer), detects tables with complex layouts (merged cells, multi-page), and maps hierarchical sections.\n",
 "4. **GitHub Flavored Markdown:** Outputs richly formatted markdown that preserves document structure for LLM comprehension and AI-powered analysis.\n",
 "5. **Broad Format Support:** Processes PDFs, images, Office documents (Word, Excel, PowerPoint), text files (HTML, Markdown), structured files (XML, JSON, CSV), and email formats (EML, MSG).\n",
 "\n",
- "For detailed information about document elements and markdown representation, see [Document elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/elements) and [Document markdown](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/markdown).\n",
+ "For detailed information about document elements and markdown representation, please see [Document elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/elements) and [Document markdown](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/markdown).\n",
 "\n",
 "> **Note:** Figure analysis (descriptions and chart/diagram analysis) is only supported for PDF and image file formats."
 ]
 },
@@ -263,11 +262,11 @@
 "    print(f\"   Table {table_counter}: {row_count} rows x {col_count} columns\")\n",
 "    table_counter += 1\n",
 "else:\n",
- "    print(\"\\nšŸ“š Document Information: Not available for this content type\")\n",
+ "    print(\"\\nšŸ“š Document Information: Not available for this content type.\")\n",
 "    \n",
 "# Save the result\n",
 "saved_json_path = save_json_to_file(result, filename_prefix=\"content_analyzers_analyze_binary\")\n",
- "print(f\"\\nšŸ“‹ Full analysis result saved. Review the complete JSON at: {saved_json_path}\")"
+ "print(f\"\\nšŸ“‹ Full analysis result saved. Please review the complete JSON at: {saved_json_path}\")"
 ]
 },
@@ -336,11 +335,11 @@
 "    print(f\"   Table {table_counter}: {row_count} rows x {col_count} columns\")\n",
 "    table_counter += 1\n",
 "else:\n",
- "    print(\"\\nšŸ“š Document Information: Not available for this content type\")\n",
+ "    print(\"\\nšŸ“š Document Information: Not available for this content type.\")\n",
 "    \n",
 "# Save the result\n",
 "saved_json_path = save_json_to_file(result, filename_prefix=\"content_analyzers_url_document\")\n",
- "print(f\"\\nšŸ“‹ Full analysis result saved. Review the complete JSON at: {saved_json_path}\")"
+ "print(f\"\\nšŸ“‹ Full analysis result saved. Please review the complete JSON at: {saved_json_path}\")"
 ]
 },
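As a rough sketch of post-processing the saved document result, assuming the result JSON nests its contents under `result.contents` with per-table `rowCount`/`columnCount` fields — the actual key names may differ, so inspect the saved JSON to confirm:

```python
import json

# Load the JSON written by save_json_to_file above.
with open(saved_json_path, encoding="utf-8") as f:
    result = json.load(f)

# Assumed layout: result -> contents[] -> tables[] with rowCount/columnCount.
for content in result.get("result", {}).get("contents", []):
    for idx, table in enumerate(content.get("tables", []), start=1):
        rows = table.get("rowCount")
        cols = table.get("columnCount")
        print(f"Table {idx}: {rows} rows x {cols} columns")
```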
@@ -353,12 +352,12 @@
 "\n",
 "Key features include:\n",
 "1. **Transcription:** Converts conversational audio into searchable text with sentence-level and word-level timestamps.\n",
 "2. **Speaker Diarization:** Distinguishes between speakers in a conversation, attributing parts of the transcript to specific speakers (e.g., \"Speaker 1\", \"Speaker 2\").\n",
- "3. **Timing Information:** Precise timing data in milliseconds (startTimeMs, endTimeMs) for each phrase, crucial for audio-text synchronization.\n",
+ "3. **Timing Information:** Provides precise timing data in milliseconds (startTimeMs, endTimeMs) for each phrase, essential for audio-text synchronization.\n",
 "4. **Summary Generation:** Automatically generates a summary of the conversation for quick understanding.\n",
 "5. **Multilingual Support:** Supports automatic language detection and multilingual transcription across multiple locales.\n",
- "6. **Markdown Output:** Structured markdown format with WebVTT transcripts preserving speaker identification and timing.\n",
+ "6. **Markdown Output:** Provides a structured markdown format with WebVTT transcripts preserving speaker identification and timing.\n",
 "\n",
- "For detailed information about audio markdown format and capabilities, see [Audio overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/audio/overview) and [AudioVisual markdown representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/markdown)."
+ "For detailed information about audio markdown format and capabilities, please see [Audio overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/audio/overview) and [AudioVisual markdown representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/markdown)."
 ]
 },
@@ -438,7 +437,7 @@
 "\n",
 "# Save the result\n",
 "saved_json_path = save_json_to_file(analysis_result, filename_prefix=\"content_analyzers_audio\")\n",
- "print(f\"\\nšŸ“‹ Full analysis result saved. Review the complete JSON at: {saved_json_path}\")\n"
+ "print(f\"\\nšŸ“‹ Full analysis result saved. Please review the complete JSON at: {saved_json_path}\")\n"
 ]
 },
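Since phrases carry `startTimeMs`/`endTimeMs` offsets while WebVTT cues use `HH:MM:SS.mmm` timestamps, a small helper for the conversion might look like this — the phrase dictionary shape is an assumption based on the field names above:

```python
def ms_to_vtt(ms: int) -> str:
    """Convert a millisecond offset to a WebVTT HH:MM:SS.mmm timestamp."""
    hours, rem = divmod(ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, millis = divmod(rem, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"

def phrase_to_cue(phrase: dict) -> str:
    # Assumed phrase shape: {"startTimeMs", "endTimeMs", "speaker", "text"}.
    start = ms_to_vtt(phrase["startTimeMs"])
    end = ms_to_vtt(phrase["endTimeMs"])
    return f"{start} --> {end}\n<v {phrase.get('speaker', 'Speaker')}>{phrase['text']}"

print(phrase_to_cue({"startTimeMs": 400, "endTimeMs": 2150,
                     "speaker": "Speaker 1", "text": "Hello and welcome."}))
# 00:00:00.400 --> 00:00:02.150
# <v Speaker 1>Hello and welcome.
```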
@@ -446,17 +445,17 @@
 "## Video Content\n",
- "The `prebuilt-videoSearch` analyzer provides comprehensive analysis of video content, combining visual frame extraction, audio transcription, and AI-powered insights. It transforms raw video into RAG-ready structured output in both Markdown and JSON formats, enabling applications like media asset management, content categorization, and retrieval-augmented generation.\n",
+ "The `prebuilt-videoSearch` analyzer provides comprehensive analysis of video content, combining visual frame extraction, audio transcription, and AI-powered insights. It transforms raw video into RAG-ready structured output in both Markdown and JSON formats, enabling applications such as media asset management, content categorization, and retrieval-augmented generation.\n",
 "\n",
 "Key features include:\n",
 "1. **Transcription with Diarization:** Converts audio to searchable WebVTT transcripts with speaker identification and multilingual support (same language handling as audio).\n",
 "2. **Key Frame Extraction:** Intelligently extracts representative frames (~1 FPS) from each scene, embedded as markdown image references with timestamps (e.g., `![](keyFrame.400.jpg)`).\n",
 "3. **Shot Detection:** Identifies video segment boundaries aligned with camera cuts and scene transitions, providing timestamps in `cameraShotTimesMs`.\n",
 "4. **Segment-based Analysis:** Analyzes multiple frames per segment to identify actions, events, and themes rather than individual frame analysis.\n",
- "5. **Custom Field Extraction:** Define business-specific fields (brands, categories, sentiment) that the generative model extracts from visual and audio content.\n",
+ "5. **Custom Field Extraction:** Lets you define business-specific fields (brands, categories, sentiment) that the generative model extracts from visual and audio content.\n",
 "6. **Structured Output:** Content organized in GitHub Flavored Markdown with precise temporal alignment and JSON with detailed metadata.\n",
 "\n",
- "For detailed information about video capabilities, elements, and markdown format, see [Video overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/overview), [Video elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/elements), and [AudioVisual markdown representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/markdown)."
+ "For detailed information about video capabilities, elements, and markdown format, please see [Video overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/overview), [Video elements](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/elements), and [AudioVisual markdown representation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/markdown)."
 ]
 },
@@ -481,18 +480,18 @@
 "    \"\"\"Save keyframe image to output file using pytest naming convention.\n",
 "\n",
 "    Args:\n",
- "        image_content: The binary image content to save\n",
- "        keyframe_id: The keyframe ID (e.g., \"keyframes/733\")\n",
- "        test_name: Name of the test case (e.g., function name)\n",
- "        test_py_file_dir: Directory where pytest files are located\n",
- "        identifier: Optional unique identifier to avoid conflicts (e.g., analyzer_id)\n",
- "        output_dir: Directory name to save the output file (default: \"test_output\")\n",
+ "        image_content: The binary image content to save.\n",
+ "        keyframe_id: The keyframe ID (e.g., \"keyframes/733\").\n",
+ "        test_name: Name of the test case (e.g., function name).\n",
+ "        test_py_file_dir: Directory where pytest files are located.\n",
+ "        identifier: Optional unique identifier to avoid conflicts (e.g., analyzer_id).\n",
+ "        output_dir: Directory name to save the output file (default: \"test_output\").\n",
 "\n",
 "    Returns:\n",
- "        str: Path to the saved image file\n",
+ "        str: Path to the saved image file.\n",
 "\n",
 "    Raises:\n",
- "        OSError: If there are issues creating directory or writing file\n",
+ "        OSError: If there are issues creating the directory or writing the file.\n",
 "    \"\"\"\n",
 "    # Generate timestamp and frame ID\n",
 "    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
@@ -503,7 +502,7 @@
 "        # Fallback: use as-is if no slash found\n",
 "        frame_id = keyframe_id\n",
 "\n",
- "    # Create output directory if it doesn't exist\n",
+ "    # Create output directory if it does not exist\n",
 "    output_dir_path = os.path.join(test_py_file_dir, output_dir)\n",
 "    os.makedirs(output_dir_path, exist_ok=True)\n",
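Building on the `![](keyFrame.400.jpg)` convention shown above, a sketch for recovering keyframe offsets from the returned markdown — assuming the filename pattern `keyFrame.<milliseconds>.jpg` generalizes beyond the single example given:

```python
import re

# Key frames are embedded in the result markdown as image references like
# ![](keyFrame.400.jpg), where 400 is assumed to be a millisecond offset.
KEYFRAME_RE = re.compile(r"!\[\]\(keyFrame\.(\d+)\.jpg\)")

def keyframe_times_ms(markdown: str) -> list[int]:
    """Return the millisecond offsets of all key frames referenced in markdown."""
    return [int(ms) for ms in KEYFRAME_RE.findall(markdown)]

sample = "## Segment 1\n![](keyFrame.400.jpg)\n![](keyFrame.2000.jpg)"
print(keyframe_times_ms(sample))  # [400, 2000]
```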
@@ -605,7 +604,7 @@
 "\n",
 "# Save the result\n",
 "saved_json_path = save_json_to_file(analysis_result, filename_prefix=\"content_analyzers_video\")\n",
- "print(f\"\\nšŸ“‹ Full analysis result saved. Review the complete JSON at: {saved_json_path}\")\n",
+ "print(f\"\\nšŸ“‹ Full analysis result saved. Please review the complete JSON at: {saved_json_path}\")\n",
 "\n",
 "# Keyframe Processing\n",
 "def extract_keyframe_ids(analysis_result: Dict[str, Any]) -> list[str]:\n",
@@ -613,9 +612,9 @@
 "    \"\"\"\n",
 "    Extract all keyframe IDs from the analysis result.\n",
 "\n",
 "    Args:\n",
- "        analysis_result: The analysis result from the analyzer\n",
+ "        analysis_result: The analysis result from the analyzer.\n",
 "    Returns:\n",
- "        List of keyframe IDs (e.g., 'keyframes/1000', 'keyframes/2000')\n",
+ "        List of keyframe IDs (e.g., 'keyframes/1000', 'keyframes/2000').\n",
 "    \"\"\"\n",
 "    print(\"Starting keyframe extraction from analysis result...\")\n",
 "    keyframe_ids = []\n",
@@ -675,21 +674,21 @@
 "source": [
 "## Congratulations!\n",
 "\n",
- "You've successfully learned how to extract content from multimodal files using Azure Content Understanding! You explored:\n",
+ "You've successfully learned how to extract content from multimodal files using Azure Content Understanding! In this notebook, you explored:\n",
 "\n",
- "- **Document extraction** with the `prebuilt-documentSearch` analyzer\n",
- "- **Audio transcription** with speaker diarization using `prebuilt-audioSearch`\n",
- "- **Video analysis** with keyframe extraction using `prebuilt-videoSearch`\n",
+ "- **Document extraction** with the `prebuilt-documentSearch` analyzer.\n",
+ "- **Audio transcription** with speaker diarization using `prebuilt-audioSearch`.\n",
+ "- **Video analysis** with keyframe extraction using `prebuilt-videoSearch`.\n",
 "\n",
 "### Learn More\n",
 "\n",
- "To dive deeper into Azure Content Understanding capabilities:\n",
+ "To dive deeper into Azure Content Understanding capabilities, please explore:\n",
 "\n",
- "- **[Content Understanding Overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/overview)** - Comprehensive introduction to the service\n",
- "- **[What's New](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/whats-new)** - Latest features and updates\n",
- "- **[Content Extraction Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/overview#content-extraction)** - Detailed documentation on extraction capabilities\n",
+ "- **[Content Understanding Overview](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/overview)** - Comprehensive introduction to the service.\n",
+ "- **[What's New](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/whats-new)** - Latest features and updates.\n",
+ "- **[Content Extraction Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/overview#content-extraction)** - Detailed documentation on extraction capabilities.\n",
 "\n",
- "Explore other notebooks in this repository to learn about custom analyzers, field extraction, and advanced scenarios!"
+ "Feel free to explore other notebooks in this repository to learn about custom analyzers, field extraction, and advanced scenarios!"
 ]
 }
],
@@ -714,4 +713,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
\ No newline at end of file