diff --git a/notebooks/classifier.ipynb b/notebooks/classifier.ipynb
index ef75327..8da0234 100644
--- a/notebooks/classifier.ipynb
+++ b/notebooks/classifier.ipynb
@@ -7,9 +7,9 @@
     "# Azure AI Content Understanding - Classifier and Analyzer Demo\n",
     "\n",
     "This notebook demonstrates how to use the Azure AI Content Understanding service to:\n",
-    "1. Create a classifier for document categorization\n",
-    "2. Create a custom analyzer to extract specific fields\n",
-    "3. Combine the classifier and analyzers to classify, optionally split, and analyze documents within a flexible processing pipeline\n",
+    "1. Create a classifier for document categorization.\n",
+    "2. Create a custom analyzer to extract specific fields.\n",
+    "3. Combine the classifier and analyzers to classify, optionally split, and analyze documents within a flexible processing pipeline.\n",
     "\n",
     "For more detailed information before getting started, please refer to the official documentation:\n",
     "[Understanding Classifiers in Azure AI Services](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/classifier)\n",
@@ -36,7 +36,7 @@
     "\n",
     "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class that provides functions to interact with the Content Understanding API. Prior to the official release of the Content Understanding SDK, it serves as a lightweight SDK.\n",
     ">\n",
-    "> Fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the details from your Azure AI Service.\n",
+    "> Please fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the details from your Azure AI Service.\n",
     "\n",
     "> ⚠️ Important:\n",
     "You must update the code below to use your preferred Azure authentication method.\n",
@@ -110,20 +110,20 @@
    "source": [
     "## Configure Model Deployments for Prebuilt Analyzers\n",
     "\n",
-    "> **💡 Note:** This step is only required **once per Azure Content Understanding resource**, unless the GPT deployment has been changed. You can skip this section if:\n",
-    "> - This configuration has already been run once for your resource, or\n",
-    "> - Your administrator has already configured the model deployments for you\n",
+    "> **💡 Note:** This step is required **only once per Azure Content Understanding resource**, unless the GPT deployment has been changed. You may skip this section if:\n",
+    "> - This configuration has already been completed for your resource, or\n",
+    "> - Your administrator has already set up the model deployments for you.\n",
     "\n",
     "Before using prebuilt analyzers, you need to configure the default model deployment mappings. This tells Content Understanding which model deployments to use.\n",
     "\n",
     "**Model Requirements:**\n",
-    "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`)\n",
-    "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`)\n",
-    "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings\n",
+    "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`).\n",
+    "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`).\n",
+    "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings.\n",
     "\n",
     "**Prerequisites:**\n",
-    "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry\n",
-    "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names"
+    "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry.\n",
+    "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names."
    ]
   },
   {
@@ -152,12 +152,12 @@
     " print(f\" - {deployment}\")\n",
     " print(\"\\n Prebuilt analyzers require GPT-4.1, GPT-4.1-mini, and text-embedding-3-large deployments.\")\n",
     " print(\" Please:\")\n",
-    " print(\" 1. Deploy all three models in Azure AI Foundry\")\n",
+    " print(\" 1. Deploy all three models in Azure AI Foundry.\")\n",
     " print(\" 2. Add the following to notebooks/.env:\")\n",
     " print(\" GPT_4_1_DEPLOYMENT=\")\n",
     " print(\" GPT_4_1_MINI_DEPLOYMENT=\")\n",
     " print(\" TEXT_EMBEDDING_3_LARGE_DEPLOYMENT=\")\n",
-    " print(\" 3. Restart the kernel and run this cell again\")\n",
+    " print(\" 3. Restart the kernel and run this cell again.\")\n",
     "else:\n",
     " print(f\"📋 Configuring default model deployments...\")\n",
     " print(f\" GPT-4.1 deployment: {GPT_4_1_DEPLOYMENT}\")\n",
@@ -179,8 +179,8 @@
     " except Exception as e:\n",
     " print(f\"❌ Failed to configure defaults: {e}\")\n",
     " print(f\" This may happen if:\")\n",
-    " print(f\" - One or more deployment names don't exist in your Azure AI Foundry project\")\n",
-    " print(f\" - You don't have permission to update defaults\")\n",
+    " print(f\" - One or more deployment names don't exist in your Azure AI Foundry project.\")\n",
+    " print(f\" - You don't have permission to update defaults.\")\n",
     " raise\n"
    ]
   },
   {
@@ -189,19 +189,19 @@
    "metadata": {},
    "source": [
     "## Create a Basic Classifier\n",
-    "Classify document from URL using begin_classify API.\n",
+    "Classify a document from a URL using the `begin_classify` API.\n",
     "\n",
     "High-level steps:\n",
-    "1. Create a custom classifier\n",
-    "2. Classify a document from a remote URL\n",
-    "3. Save the classification result to a file\n",
-    "4. Clean up the created classifier\n",
+    "1. Create a custom classifier.\n",
+    "2. Classify a document from a remote URL.\n",
+    "3. Save the classification result to a file.\n",
+    "4. Clean up the created classifier.\n",
     "\n",
-    "In Azure AI Content Understanding, classification is integrated directly into the analyzer operation rather than requiring a separate API. To create a classifier, you define **`contentCategories`** within the analyzer's configuration, specifying up to 200 category names and descriptions that the service will use to categorize your input files. \n",
+    "In Azure AI Content Understanding, classification is integrated directly into the analyzer operation rather than requiring a separate API. To create a classifier, you define **`contentCategories`** within the analyzer's configuration, specifying up to 200 category names and descriptions that the service will use to categorize your input files.\n",
     "\n",
-    "The **`enableSegment`** parameter controls how the classifier handles multi-document files: when set to `true`, it automatically splits and classifies different document types within a single file (useful for processing combined documents like a loan application package containing multiple forms), while setting it to `false` treats the entire file as a single document. \n",
+    "The **`enableSegment`** parameter controls how the classifier handles multi-document files: when set to `true`, it automatically splits and classifies different document types within a single file (useful for processing combined documents like a loan application package containing multiple forms). When set to `false`, it treats the entire file as a single document.\n",
     "\n",
-    "For more detailed information about classification capabilities, best practices, and advanced scenarios, see the [Content Understanding classification documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/classifier)."
+    "For more detailed information about classification capabilities, best practices, and advanced scenarios, please see the [Content Understanding classification documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/classifier)."
    ]
   },
   {
@@ -319,9 +319,9 @@
     " print(f\" Segment ID: {segment.get('segmentId', 'N/A')}\")\n",
     " print(\"=\" * 50)\n",
     " else:\n",
-    " print(\"No contents available in analysis result\")\n",
+    " print(\"No contents available in analysis result.\")\n",
     "else:\n",
-    " print(\"No analysis result available\")"
+    " print(\"No analysis result available.\")"
    ]
   },
   {
@@ -348,7 +348,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Clean up the created analyzer \n",
+    "## Clean up the created analyzer\n",
     "After the demo completes, the classifier is automatically deleted to prevent resource accumulation."
    ]
   },
@@ -386,7 +386,7 @@
     "# Define custom analyzer as a dictionary\n",
     "custom_analyzer = {\n",
     " \"baseAnalyzerId\": \"prebuilt-document\",\n",
-    " \"description\": \"Loan application analyzer - extracts key information from loan applications\",\n",
+    " \"description\": \"Loan application analyzer - extracts key information from loan applications.\",\n",
     " \"config\": {\n",
     " \"returnDetails\": True,\n",
     " \"enableLayout\": True,\n",
@@ -447,7 +447,7 @@
    "source": [
     "## Create an Enhanced Classifier with Custom Analyzer\n",
     "\n",
-    "Now create a new classifier that uses the prebuilt invoice analyzer for invoices and the custom analyzer for loan application documents.\n",
+    "Now, create a new classifier that uses the prebuilt invoice analyzer for invoices and the custom analyzer for loan application documents.\n",
     "This combines document classification with field extraction in one operation."
    ]
   },
@@ -573,7 +573,6 @@
     " else:\n",
     " print(f\" (No custom fields extracted for this category)\")\n",
     " \n",
-    " \n",
     " print(\"\\n\" + \"=\" * 80)\n",
     " \n",
     " # Display document information for the first segment\n",
@@ -586,9 +585,9 @@
     " unit = first_content.get(\"unit\", \"units\")\n",
     " print(f\"Page dimensions: {pages[0].get('width')} x {pages[0].get('height')} {unit}\")\n",
     " else:\n",
-    " print(\"No contents available in enhanced analysis result\")\n",
+    " print(\"No contents available in enhanced analysis result.\")\n",
     "else:\n",
-    " print(\"No enhanced analysis result available\")"
+    " print(\"No enhanced analysis result available.\")"
    ]
   },
   {
@@ -673,4 +672,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
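
For the `.env` prerequisite in the "Configure Model Deployments" cell, a minimal sketch of loading and validating the three deployment names before running the configuration code; it assumes the `python-dotenv` package, and the variable names are exactly the ones the notebook cell prints:

```python
# Sketch: load and validate the deployment names the configuration cell expects.
# Assumes python-dotenv is installed; the three variable names come from the
# notebook's own error message.
import os

from dotenv import load_dotenv

load_dotenv()  # reads notebooks/.env when run from the notebooks/ directory

REQUIRED = (
    "GPT_4_1_DEPLOYMENT",
    "GPT_4_1_MINI_DEPLOYMENT",
    "TEXT_EMBEDDING_3_LARGE_DEPLOYMENT",
)

missing = [name for name in REQUIRED if not os.getenv(name)]
if missing:
    # Mirrors the cell's guidance: deploy the three models in Azure AI Foundry,
    # add their deployment names to notebooks/.env, then restart the kernel.
    raise RuntimeError(f"Missing in notebooks/.env: {', '.join(missing)}")

GPT_4_1_DEPLOYMENT = os.environ["GPT_4_1_DEPLOYMENT"]
GPT_4_1_MINI_DEPLOYMENT = os.environ["GPT_4_1_MINI_DEPLOYMENT"]
TEXT_EMBEDDING_3_LARGE_DEPLOYMENT = os.environ["TEXT_EMBEDDING_3_LARGE_DEPLOYMENT"]
```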
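The "Create a Basic Classifier" section describes the shape of a classifier: `contentCategories` (up to 200 name/description pairs) defined within the analyzer's configuration, plus the `enableSegment` switch. A sketch of such a definition under that description; the exact field placement and the category names are illustrative assumptions, and only `contentCategories`, `enableSegment`, and `begin_classify` are named by the notebook itself:

```python
# Sketch of a classifier defined as an analyzer configuration, per the
# markdown above. Field placement is an assumption; contentCategories and
# enableSegment are the fields the notebook names.
basic_classifier = {
    "description": "Routes the documents found in a combined submission",
    "config": {
        # True: split a multi-document file and classify each piece.
        # False: treat the whole file as one document.
        "enableSegment": True,
        # Up to 200 categories; the service classifies using name + description.
        "contentCategories": [
            {"name": "Loan_Application", "description": "A completed loan application form"},
            {"name": "Invoice", "description": "A bill requesting payment for goods or services"},
        ],
    },
}

# The notebook then classifies a remote document with the begin_classify API
# (named in the markdown); the surrounding helper-call pattern is assumed:
# response = client.begin_classify(classifier_id, file_url="<remote document URL>")
# result = client.poll_result(response)  # hypothetical polling helper
```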
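The enhanced-classifier section pairs routing with extraction: invoices go to `prebuilt-invoice`, loan applications to the custom analyzer built on `prebuilt-document`. A sketch of what that pairing could look like; the per-category `analyzerId` key is an assumed binding mechanism, not a schema detail confirmed by this diff, and the analyzer IDs are placeholders:

```python
# The custom analyzer from the diff (abbreviated), plus an enhanced classifier
# that routes each category to an analyzer. "analyzerId" on a category is an
# assumed way to bind the two.
custom_analyzer = {
    "baseAnalyzerId": "prebuilt-document",
    "description": "Loan application analyzer - extracts key information from loan applications.",
    "config": {
        "returnDetails": True,
        "enableLayout": True,
        # ...remaining config and the loan-application field schema...
    },
}

enhanced_classifier = {
    "description": "Classify, optionally split, and analyze in one operation",
    "config": {
        "enableSegment": True,
        "contentCategories": [
            {
                "name": "Invoice",
                "description": "A bill requesting payment",
                "analyzerId": "prebuilt-invoice",  # assumed binding to the prebuilt analyzer
            },
            {
                "name": "Loan_Application",
                "description": "A completed loan application form",
                "analyzerId": "loan-application-analyzer",  # assumed ID for the custom analyzer above
            },
        ],
    },
}
```

Each returned segment then carries its category plus whatever fields the routed analyzer extracted, which is what the result-printing cells later in the diff walk through.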