# Update samples from sdk #100
```diff
@@ -164,3 +164,4 @@ cython_debug/
 # VSCode
 .vscode
 .azure
+test_output/
```
```diff
@@ -1 +1,37 @@
-AZURE_AI_ENDPOINT=
+# Azure Content Understanding Service Configuration
+# Copy this file to <repository-root>/.env and update with your actual values
+
+# Your Azure Content Understanding service endpoint
+# Example: https://your-resource-name.services.ai.azure.com/
+# If you need help to create one, please see the Prerequisites section in:
+# https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=document#prerequisites
+# As of 2025/05, 2025-05-01-preview is only available in the regions documented in
+# Content Understanding region and language support (https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/language-region-support).
+
+# Azure Content Understanding Test Configuration
+
+# Required for Content Understanding SDK and testing
+AZURE_CONTENT_UNDERSTANDING_ENDPOINT=https://your-resource-name.services.ai.azure.com/
+
+# Authentication Options:
+# Option 1: Use Azure Key (FOR TESTING ONLY - Less secure)
+# Set this value if you want to use key-based authentication
+# WARNING: Keys are less secure and should only be used for testing/development
+# Leave empty to use DefaultAzureCredential (recommended)
+AZURE_CONTENT_UNDERSTANDING_KEY=
+
+# Option 2: Use DefaultAzureCredential (RECOMMENDED for production and development)
+# If AZURE_CONTENT_UNDERSTANDING_KEY is empty, the script will use DefaultAzureCredential
+#
+# Most common development scenario:
+# 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
+# 2. Login: az login
+# 3. Run the script (no additional configuration needed)
+#
+# This also supports:
+# - Environment variables (AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID)
+# - Managed Identity (for Azure-hosted applications)
+# - Visual Studio Code authentication
+# - Azure PowerShell authentication
+# For more info: https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme#defaultazurecredential
```
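For context on how these settings are consumed, here is a minimal sketch of the lookup-and-fallback logic the template describes (variable names come from this file; the surrounding script is illustrative):

```python
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load <repository-root>/.env into the process environment
load_dotenv()

endpoint = os.environ.get("AZURE_CONTENT_UNDERSTANDING_ENDPOINT")
if not endpoint:
    raise ValueError("AZURE_CONTENT_UNDERSTANDING_ENDPOINT is not set")

# Option 1 (key) if a key is present; otherwise Option 2 (DefaultAzureCredential),
# which picks up az login, environment variables, or managed identity.
key = os.getenv("AZURE_CONTENT_UNDERSTANDING_KEY")
credential = AzureKeyCredential(key) if key else DefaultAzureCredential()
```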
```diff
@@ -57,7 +57,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"analyzer_template = \"../analyzer_templates/receipt.json\"\n",
 "training_docs_folder = \"../data/document_training\""
 ]
},
```
```diff
@@ -88,30 +87,44 @@
 "import json\n",
 "import os\n",
 "import sys\n",
-"from pathlib import Path\n",
-"from dotenv import find_dotenv, load_dotenv\n",
-"from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
+"import uuid\n",
+"from dotenv import load_dotenv\n",
+"from azure.storage.blob import ContainerSasPermissions\n",
+"from azure.core.credentials import AzureKeyCredential\n",
+"from azure.identity import DefaultAzureCredential\n",
+"from azure.ai.contentunderstanding.aio import ContentUnderstandingClient\n",
+"from azure.ai.contentunderstanding.models import (\n",
+"    ContentAnalyzer,\n",
+"    FieldSchema,\n",
+"    FieldDefinition,\n",
+"    FieldType,\n",
+"    GenerationMethod,\n",
+"    AnalysisMode,\n",
+"    ProcessingLocation,\n",
+")\n",
 "\n",
-"# Import utility package from the Python samples root directory\n",
-"parent_dir = Path(Path.cwd()).parent\n",
-"sys.path.append(str(parent_dir))\n",
-"from python.content_understanding_client import AzureContentUnderstandingClient\n",
+"# Add the parent directory to the Python path to import the sample_helper module\n",
+"sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'python'))\n",
+"from extension.document_processor import DocumentProcessor\n",
+"from extension.sample_helper import extract_operation_id_from_poller, PollerType, save_json_to_file\n",
 "\n",
-"load_dotenv(find_dotenv())\n",
+"load_dotenv()\n",
 "logging.basicConfig(level=logging.INFO)\n",
 "\n",
-"credential = DefaultAzureCredential()\n",
-"token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n",
-"\n",
-"client = AzureContentUnderstandingClient(\n",
-"    endpoint=os.getenv(\"AZURE_AI_ENDPOINT\"),\n",
-"    api_version=os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\"),\n",
-"    # IMPORTANT: Comment out token_provider if using subscription key\n",
-"    token_provider=token_provider,\n",
-"    # IMPORTANT: Uncomment this if using subscription key\n",
-"    # subscription_key=os.getenv(\"AZURE_AI_API_KEY\"),\n",
-"    x_ms_useragent=\"azure-ai-content-understanding-python/analyzer_training\", # This header is used for sample usage telemetry; please comment out this line if you want to opt out.\n",
-")"
+"endpoint = os.environ.get(\"AZURE_CONTENT_UNDERSTANDING_ENDPOINT\")\n",
+"# Return AzureKeyCredential if AZURE_CONTENT_UNDERSTANDING_KEY is set, otherwise DefaultAzureCredential\n",
+"key = os.getenv(\"AZURE_CONTENT_UNDERSTANDING_KEY\")\n",
+"credential = AzureKeyCredential(key) if key else DefaultAzureCredential()\n",
+"# Create the ContentUnderstandingClient\n",
+"client = ContentUnderstandingClient(endpoint=endpoint, credential=credential)\n",
+"print(\"✅ ContentUnderstandingClient created successfully\")\n",
+"\n",
+"try:\n",
+"    processor = DocumentProcessor(client)\n",
+"    print(\"✅ DocumentProcessor created successfully\")\n",
+"except Exception as e:\n",
+"    print(f\"❌ Failed to create DocumentProcessor: {e}\")\n",
+"    raise"
 ]
},
{
```
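Note that the client now comes from the `.aio` namespace, which is why the notebook `await`s every operation; Jupyter supplies a running event loop. Outside a notebook, the same calls would presumably need `asyncio.run`, roughly as in this sketch (key-based auth shown only to keep it short):

```python
import asyncio
import os

from azure.ai.contentunderstanding.aio import ContentUnderstandingClient
from azure.core.credentials import AzureKeyCredential


async def main() -> None:
    client = ContentUnderstandingClient(
        endpoint=os.environ["AZURE_CONTENT_UNDERSTANDING_ENDPOINT"],
        credential=AzureKeyCredential(os.environ["AZURE_CONTENT_UNDERSTANDING_KEY"]),
    )
    with open("../data/receipt.png", "rb") as f:
        data = f.read()
    async with client:  # closes the underlying transport on exit
        poller = await client.content_analyzers.begin_analyze_binary(
            analyzer_id="prebuilt-documentAnalyzer",
            input=data,
            content_type="application/octet-stream",
        )
        result = await poller.result()
        print(result.as_dict())


asyncio.run(main())
```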
```diff
@@ -133,26 +146,29 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# Load reference storage configuration from environment\n",
+"training_data_path = os.getenv(\"TRAINING_DATA_PATH\") or f\"training_data_{uuid.uuid4().hex[:8]}\"\n",
 "training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
 "\n",
+"if not training_data_path.endswith(\"/\"):\n",
+"    training_data_path += \"/\"\n",
+"\n",
 "if not training_data_sas_url:\n",
 "    TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n",
 "    TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n",
-"    if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not training_data_sas_url:\n",
-"        raise ValueError(\n",
-"            \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n",
+"    print(f\"TRAINING_DATA_STORAGE_ACCOUNT_NAME: {TRAINING_DATA_STORAGE_ACCOUNT_NAME}\")\n",
+"    print(f\"TRAINING_DATA_CONTAINER_NAME: {TRAINING_DATA_CONTAINER_NAME}\")\n",
+"\n",
+"    if TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME:\n",
+"        # We require \"Write\" permission to upload, modify, or append blobs\n",
+"        training_data_sas_url = processor.generate_container_sas_url(\n",
+"            account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
+"            container_name=TRAINING_DATA_CONTAINER_NAME,\n",
+"            permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
+"            expiry_hours=1,\n",
+"        )\n",
-"    from azure.storage.blob import ContainerSasPermissions\n",
-"    # Requires \"Write\" (critical for upload/modify/append) along with \"Read\" and \"List\" for viewing/listing blobs.\n",
-"    training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
-"        account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
-"        container_name=TRAINING_DATA_CONTAINER_NAME,\n",
-"        permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
-"        expiry_hours=1,\n",
-"    )\n",
-"\n",
-"training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n",
-"\n",
-"await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
+"\n",
+"await processor.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
 ]
},
{
```
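`DocumentProcessor.generate_container_sas_url` is a repo-local helper, not an SDK method; with the public azure-storage-blob API an equivalent would look roughly like the sketch below (fetching the account key is an assumption — the real helper might use a user delegation key instead):

```python
from datetime import datetime, timedelta, timezone

from azure.storage.blob import ContainerSasPermissions, generate_container_sas


def make_container_sas_url(
    account_name: str,
    container_name: str,
    account_key: str,
    expiry_hours: int = 1,
) -> str:
    """Return the container URL with a short-lived SAS token appended."""
    sas_token = generate_container_sas(
        account_name=account_name,
        container_name=container_name,
        account_key=account_key,
        # Write is required to upload training blobs; Read/List to inspect them
        permission=ContainerSasPermissions(read=True, write=True, list=True),
        expiry=datetime.now(timezone.utc) + timedelta(hours=expiry_hours),
    )
    return f"https://{account_name}.blob.core.windows.net/{container_name}?{sas_token}"
```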
```diff
@@ -162,7 +178,7 @@
 "## Create Analyzer with Defined Schema\n",
 "Before creating the analyzer, fill in the constant `ANALYZER_ID` with a relevant name for your task. In this example, we generate a unique suffix so that this cell can be run multiple times to create different analyzers.\n",
 "\n",
-"We use **training_data_sas_url** and **training_data_path** as set in the [.env](./.env) file and used in the previous step."
+"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** as set in the [.env](./.env) file and used in the previous step."
 ]
},
{
```

> **Collaborator:** We changed to lower-case in #60 (comment).

> **Collaborator:** @changjian-wang I saw your new commits. Sorry for the confusion; I was referring to this specific line. I mean we're using the lower-case `training_data_sas_url` and `training_data_path` as variables within the notebook, so I think we may not need to mention the .env file here, e.g. "We use training_data_sas_url and training_data_path set in the previous step."
```diff
@@ -171,24 +187,80 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import uuid\n",
-"CUSTOM_ANALYZER_ID = \"train-sample-\" + str(uuid.uuid4())\n",
+"import datetime\n",
+"\n",
+"analyzer_id = f\"analyzer-training-sample-{datetime.now().strftime('%Y%m%d')}-{datetime.now().strftime('%H%M%S')}-{uuid.uuid4().hex[:8]}\"\n",
+"\n",
+"content_analyzer = ContentAnalyzer(\n",
+"    base_analyzer_id=\"prebuilt-documentAnalyzer\",\n",
+"    description=\"Extract useful information from receipt\",\n",
+"    field_schema=FieldSchema(\n",
+"        name=\"receipt schema\",\n",
+"        description=\"Schema for receipt\",\n",
+"        fields={\n",
+"            \"MerchantName\": FieldDefinition(\n",
+"                type=FieldType.STRING,\n",
+"                method=GenerationMethod.EXTRACT,\n",
+"                description=\"\"\n",
+"            ),\n",
+"            \"Items\": FieldDefinition(\n",
+"                type=FieldType.ARRAY,\n",
+"                method=GenerationMethod.GENERATE,\n",
+"                description=\"\",\n",
+"                items_property={\n",
+"                    \"type\": \"object\",\n",
+"                    \"method\": \"extract\",\n",
+"                    \"properties\": {\n",
+"                        \"Quantity\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        },\n",
+"                        \"Name\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        },\n",
+"                        \"Price\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        }\n",
+"                    }\n",
+"                }\n",
+"            ),\n",
+"            \"TotalPrice\": FieldDefinition(\n",
+"                type=FieldType.STRING,\n",
+"                method=GenerationMethod.EXTRACT,\n",
+"                description=\"\"\n",
+"            )\n",
+"        }\n",
+"    ),\n",
+"    mode=AnalysisMode.STANDARD,\n",
+"    processing_location=ProcessingLocation.GEOGRAPHY,\n",
+"    tags={\"demo_type\": \"get_result\"},\n",
+"    training_data={\n",
+"        \"kind\": \"blob\",\n",
+"        \"containerUrl\": training_data_sas_url,\n",
+"        \"prefix\": training_data_path\n",
+"    },\n",
+")\n",
+"print(f\"🔧 Creating custom analyzer '{analyzer_id}'...\")\n",
+"poller = await client.content_analyzers.begin_create_or_replace(\n",
+"    analyzer_id=analyzer_id,\n",
+"    resource=content_analyzer,\n",
+")\n",
 "\n",
-"response = client.begin_create_analyzer(\n",
-"    CUSTOM_ANALYZER_ID,\n",
-"    analyzer_template_path=analyzer_template,\n",
-"    training_storage_container_sas_url=training_data_sas_url,\n",
-"    training_storage_container_path_prefix=training_data_path,\n",
+"# Extract operation ID from the poller\n",
+"operation_id = extract_operation_id_from_poller(\n",
+"    poller, PollerType.ANALYZER_CREATION\n",
 ")\n",
-"result = client.poll_result(response)\n",
-"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",
-"    logging.info(f\"Analyzer details for {result['result']['analyzerId']}\")\n",
-"    logging.info(json.dumps(result, indent=2))\n",
-"else:\n",
-"    logging.warning(\n",
-"        \"An issue was encountered when trying to create the analyzer. \"\n",
-"        \"Please double-check your deployment and configurations for potential problems.\"\n",
-"    )"
+"print(f\"📋 Extracted creation operation ID: {operation_id}\")\n",
+"\n",
+"# Wait for the analyzer to be created\n",
+"print(f\"⏳ Waiting for analyzer creation to complete...\")\n",
+"await poller.result()\n",
+"print(f\"✅ Analyzer '{analyzer_id}' created successfully!\")"
 ]
},
{
```
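One reviewer-level nit on this hunk: the cell adds `import datetime` but then calls `datetime.now()`, which raises `AttributeError` because `now` lives on the `datetime` class, not the module. A corrected, self-contained version of the ID generation would be:

```python
import uuid
from datetime import datetime

# e.g. "analyzer-training-sample-20250611-142530-1a2b3c4d"
now = datetime.now()
analyzer_id = (
    f"analyzer-training-sample-{now.strftime('%Y%m%d')}-"
    f"{now.strftime('%H%M%S')}-{uuid.uuid4().hex[:8]}"
)
```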
```diff
@@ -205,10 +277,53 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"response = client.begin_analyze(CUSTOM_ANALYZER_ID, file_location='../data/receipt.png')\n",
-"result_json = client.poll_result(response)\n",
+"file_path = \"../data/receipt.png\"\n",
+"print(f\"📄 Reading document file: {file_path}\")\n",
+"with open(file_path, \"rb\") as f:\n",
+"    data_content = f.read()\n",
+"\n",
+"# Begin document analysis operation\n",
+"print(f\"🔍 Starting document analysis with analyzer '{analyzer_id}'...\")\n",
+"analysis_poller = await client.content_analyzers.begin_analyze_binary(\n",
+"    analyzer_id=analyzer_id,\n",
+"    input=data_content,\n",
+"    content_type=\"application/octet-stream\")\n",
+"\n",
+"# Wait for analysis completion\n",
+"print(f\"⏳ Waiting for document analysis to complete...\")\n",
+"analysis_result = await analysis_poller.result()\n",
+"print(f\"✅ Document analysis completed successfully!\")\n",
+"\n",
+"# Extract operation ID for get_result\n",
+"analysis_operation_id = extract_operation_id_from_poller(\n",
+"    analysis_poller, PollerType.ANALYZE_CALL\n",
+")\n",
+"print(f\"📋 Extracted analysis operation ID: {analysis_operation_id}\")\n",
+"\n",
+"# Get the analysis result using the operation ID\n",
+"print(\n",
+"    f\"🔍 Getting analysis result using operation ID '{analysis_operation_id}'...\"\n",
+")\n",
+"operation_status = await client.content_analyzers.get_result(\n",
+"    operation_id=analysis_operation_id,\n",
+")\n",
+"\n",
+"print(f\"✅ Analysis result retrieved successfully!\")\n",
+"print(f\"   Operation ID: {operation_status.id}\")\n",
+"print(f\"   Status: {operation_status.status}\")\n",
 "\n",
-"logging.info(json.dumps(result_json, indent=2))"
+"# The actual analysis result is in operation_status.result\n",
+"operation_result = operation_status.result\n",
+"if operation_result is None:\n",
+"    print(\"⚠️ No analysis result available\")\n",
+"\n",
+"print(f\"📄 Analysis Result: {json.dumps(operation_result.as_dict())}\")\n",
+"\n",
+"# Save the analysis result to a file\n",
+"saved_file_path = save_json_to_file(\n",
+"    result=operation_result.as_dict(),\n",
+"    filename_prefix=\"analyzer_training_get_result\",\n",
+")"
 ]
},
{
```
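A second nit: the new cell prints a warning when `operation_status.result` is `None` but then calls `operation_result.as_dict()` unconditionally, which would raise `AttributeError` in exactly that case. A guarded version might read (`save_json_to_file` is the repo helper imported earlier):

```python
import json

operation_result = operation_status.result
if operation_result is None:
    raise RuntimeError("No analysis result available")

result_dict = operation_result.as_dict()
print(f"📄 Analysis Result: {json.dumps(result_dict)}")

# Save the analysis result to a file for later inspection
saved_file_path = save_json_to_file(
    result=result_dict,
    filename_prefix="analyzer_training_get_result",
)
```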
```diff
@@ -225,13 +340,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"client.delete_analyzer(CUSTOM_ANALYZER_ID)"
+"client.content_analyzers.delete(analyzer_id)"
 ]
 }
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "py312",
 "language": "python",
 "name": "python3"
 },
```
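Since `client` here is the async (`aio`) variant, `content_analyzers.delete` presumably returns a coroutine like every other call in this notebook and should be awaited:

```python
# Clean up the trained analyzer once it is no longer needed
await client.content_analyzers.delete(analyzer_id)
```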
```diff
@@ -245,7 +360,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.12"
+"version": "3.12.3"
 }
 },
 "nbformat": 4,
```
> We had some module errors in the auto check. We will need to add the missing modules to requirements.txt.
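Based on the imports this PR introduces, the entries in question would presumably be along these lines (package names inferred from the import namespaces; versions unverified):

```text
azure-ai-contentunderstanding
azure-identity
azure-storage-blob
python-dotenv
```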