# Update samples from sdk #100
```diff
@@ -164,3 +164,4 @@ cython_debug/
 # VSCode
 .vscode
 .azure
+test_output/
```
```diff
@@ -1 +1,37 @@
-AZURE_AI_ENDPOINT=
+# Azure Content Understanding Service Configuration
+# Copy this file to <repository-root>/.env and update with your actual values
+
+# Your Azure Content Understanding service endpoint
+# Example: https://your-resource-name.services.ai.azure.com/
+# If you need help to create one, please see the Prerequisites section in:
+# https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=document#prerequisites
+# As of 2025/05, 2025-05-01-preview is only available in the regions documented in
+# Content Understanding region and language support (https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/language-region-support).
+
+# Azure Content Understanding Test Configuration
+
+# Required for Content Understanding SDK and testing
+AZURE_CONTENT_UNDERSTANDING_ENDPOINT=https://your-resource-name.services.ai.azure.com/
+
+# Authentication Options:
+# Option 1: Use Azure Key (FOR TESTING ONLY - Less secure)
+# Set this value if you want to use key-based authentication
+# WARNING: Keys are less secure and should only be used for testing/development
+# Leave empty to use DefaultAzureCredential (recommended)
+AZURE_CONTENT_UNDERSTANDING_KEY=
+
+# Option 2: Use DefaultAzureCredential (RECOMMENDED for production and development)
+# If AZURE_CONTENT_UNDERSTANDING_KEY is empty, the script will use DefaultAzureCredential
+#
+# Most common development scenario:
+# 1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
+# 2. Login: az login
+# 3. Run the script (no additional configuration needed)
+#
+# This also supports:
+# - Environment variables (AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID)
+# - Managed Identity (for Azure-hosted applications)
+# - Visual Studio Code authentication
+# - Azure PowerShell authentication
+# For more info: https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme#defaultazurecredential
```
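For context on how these settings are consumed, here is a minimal sketch of the lookup-and-fallback logic the template describes (variable names come from this file; the surrounding script is illustrative):

```python
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load <repository-root>/.env into the process environment
load_dotenv()

endpoint = os.environ.get("AZURE_CONTENT_UNDERSTANDING_ENDPOINT")
if not endpoint:
    raise ValueError("AZURE_CONTENT_UNDERSTANDING_ENDPOINT is not set")

# Option 1 (key) if a key is present; otherwise Option 2 (DefaultAzureCredential),
# which picks up az login, environment variables, or managed identity.
key = os.getenv("AZURE_CONTENT_UNDERSTANDING_KEY")
credential = AzureKeyCredential(key) if key else DefaultAzureCredential()
```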
```diff
@@ -57,7 +57,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"analyzer_template = \"../analyzer_templates/receipt.json\"\n",
 "training_docs_folder = \"../data/document_training\""
 ]
},
```
```diff
@@ -88,30 +87,44 @@
 "import json\n",
 "import os\n",
 "import sys\n",
-"from pathlib import Path\n",
-"from dotenv import find_dotenv, load_dotenv\n",
-"from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n",
+"import uuid\n",
+"from dotenv import load_dotenv\n",
+"from azure.storage.blob import ContainerSasPermissions\n",
+"from azure.core.credentials import AzureKeyCredential\n",
+"from azure.identity import DefaultAzureCredential\n",
+"from azure.ai.contentunderstanding.aio import ContentUnderstandingClient\n",
+"from azure.ai.contentunderstanding.models import (\n",
+"    ContentAnalyzer,\n",
+"    FieldSchema,\n",
+"    FieldDefinition,\n",
+"    FieldType,\n",
+"    GenerationMethod,\n",
+"    AnalysisMode,\n",
+"    ProcessingLocation,\n",
+")\n",
 "\n",
-"# Import utility package from the Python samples root directory\n",
-"parent_dir = Path(Path.cwd()).parent\n",
-"sys.path.append(str(parent_dir))\n",
-"from python.content_understanding_client import AzureContentUnderstandingClient\n",
+"# Add the parent directory to the Python path to import the sample_helper module\n",
+"sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'python'))\n",
+"from extension.document_processor import DocumentProcessor\n",
+"from extension.sample_helper import extract_operation_id_from_poller, PollerType, save_json_to_file\n",
 "\n",
-"load_dotenv(find_dotenv())\n",
+"load_dotenv()\n",
 "logging.basicConfig(level=logging.INFO)\n",
 "\n",
-"credential = DefaultAzureCredential()\n",
-"token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n",
-"\n",
-"client = AzureContentUnderstandingClient(\n",
-"    endpoint=os.getenv(\"AZURE_AI_ENDPOINT\"),\n",
-"    api_version=os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\"),\n",
-"    # IMPORTANT: Comment out token_provider if using subscription key\n",
-"    token_provider=token_provider,\n",
-"    # IMPORTANT: Uncomment this if using subscription key\n",
-"    # subscription_key=os.getenv(\"AZURE_AI_API_KEY\"),\n",
-"    x_ms_useragent=\"azure-ai-content-understanding-python/analyzer_training\", # This header is used for sample usage telemetry; please comment out this line if you want to opt out.\n",
-")"
+"endpoint = os.environ.get(\"AZURE_CONTENT_UNDERSTANDING_ENDPOINT\")\n",
+"# Return AzureKeyCredential if AZURE_CONTENT_UNDERSTANDING_KEY is set, otherwise DefaultAzureCredential\n",
+"key = os.getenv(\"AZURE_CONTENT_UNDERSTANDING_KEY\")\n",
+"credential = AzureKeyCredential(key) if key else DefaultAzureCredential()\n",
+"# Create the ContentUnderstandingClient\n",
+"client = ContentUnderstandingClient(endpoint=endpoint, credential=credential)\n",
+"print(\"✅ ContentUnderstandingClient created successfully\")\n",
+"\n",
+"try:\n",
+"    processor = DocumentProcessor(client)\n",
+"    print(\"✅ DocumentProcessor created successfully\")\n",
+"except Exception as e:\n",
+"    print(f\"❌ Failed to create DocumentProcessor: {e}\")\n",
+"    raise"
 ]
},
{
```
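Note that the client now comes from the `.aio` namespace, which is why the notebook `await`s every operation; Jupyter supplies a running event loop. Outside a notebook, the same calls would presumably need `asyncio.run`, roughly as in this sketch (key-based auth shown only to keep it short):

```python
import asyncio
import os

from azure.ai.contentunderstanding.aio import ContentUnderstandingClient
from azure.core.credentials import AzureKeyCredential


async def main() -> None:
    client = ContentUnderstandingClient(
        endpoint=os.environ["AZURE_CONTENT_UNDERSTANDING_ENDPOINT"],
        credential=AzureKeyCredential(os.environ["AZURE_CONTENT_UNDERSTANDING_KEY"]),
    )
    with open("../data/receipt.png", "rb") as f:
        data = f.read()
    async with client:  # closes the underlying transport on exit
        poller = await client.content_analyzers.begin_analyze_binary(
            analyzer_id="prebuilt-documentAnalyzer",
            input=data,
            content_type="application/octet-stream",
        )
        result = await poller.result()
        print(result.as_dict())


asyncio.run(main())
```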
```diff
@@ -133,26 +146,29 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# Load reference storage configuration from environment\n",
+"training_data_path = os.getenv(\"TRAINING_DATA_PATH\") or f\"training_data_{uuid.uuid4().hex[:8]}\"\n",
 "training_data_sas_url = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
 "\n",
+"if not training_data_path.endswith(\"/\"):\n",
+"    training_data_path += \"/\"\n",
+"\n",
 "if not training_data_sas_url:\n",
 "    TRAINING_DATA_STORAGE_ACCOUNT_NAME = os.getenv(\"TRAINING_DATA_STORAGE_ACCOUNT_NAME\")\n",
 "    TRAINING_DATA_CONTAINER_NAME = os.getenv(\"TRAINING_DATA_CONTAINER_NAME\")\n",
-"    if not TRAINING_DATA_STORAGE_ACCOUNT_NAME and not training_data_sas_url:\n",
-"        raise ValueError(\n",
-"            \"Please set either TRAINING_DATA_SAS_URL or both TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME environment variables.\"\n",
+"    print(f\"TRAINING_DATA_STORAGE_ACCOUNT_NAME: {TRAINING_DATA_STORAGE_ACCOUNT_NAME}\")\n",
+"    print(f\"TRAINING_DATA_CONTAINER_NAME: {TRAINING_DATA_CONTAINER_NAME}\")\n",
+"\n",
+"    if TRAINING_DATA_STORAGE_ACCOUNT_NAME and TRAINING_DATA_CONTAINER_NAME:\n",
+"        # We require \"Write\" permission to upload, modify, or append blobs\n",
+"        training_data_sas_url = processor.generate_container_sas_url(\n",
+"            account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
+"            container_name=TRAINING_DATA_CONTAINER_NAME,\n",
+"            permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
+"            expiry_hours=1,\n",
+"        )\n",
-"    from azure.storage.blob import ContainerSasPermissions\n",
-"    # Requires \"Write\" (critical for upload/modify/append) along with \"Read\" and \"List\" for viewing/listing blobs.\n",
-"    training_data_sas_url = AzureContentUnderstandingClient.generate_temp_container_sas_url(\n",
-"        account_name=TRAINING_DATA_STORAGE_ACCOUNT_NAME,\n",
-"        container_name=TRAINING_DATA_CONTAINER_NAME,\n",
-"        permissions=ContainerSasPermissions(read=True, write=True, list=True),\n",
-"        expiry_hours=1,\n",
-"    )\n",
-"\n",
-"training_data_path = os.getenv(\"TRAINING_DATA_PATH\")\n",
-"\n",
-"await client.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
+"\n",
+"await processor.generate_training_data_on_blob(training_docs_folder, training_data_sas_url, training_data_path)"
 ]
},
{
```
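`DocumentProcessor.generate_container_sas_url` is a repo-local helper, not an SDK method; with the public azure-storage-blob API an equivalent would look roughly like the sketch below (fetching the account key is an assumption — the real helper might use a user delegation key instead):

```python
from datetime import datetime, timedelta, timezone

from azure.storage.blob import ContainerSasPermissions, generate_container_sas


def make_container_sas_url(
    account_name: str,
    container_name: str,
    account_key: str,
    expiry_hours: int = 1,
) -> str:
    """Return the container URL with a short-lived SAS token appended."""
    sas_token = generate_container_sas(
        account_name=account_name,
        container_name=container_name,
        account_key=account_key,
        # Write is required to upload training blobs; Read/List to inspect them
        permission=ContainerSasPermissions(read=True, write=True, list=True),
        expiry=datetime.now(timezone.utc) + timedelta(hours=expiry_hours),
    )
    return f"https://{account_name}.blob.core.windows.net/{container_name}?{sas_token}"
```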
```diff
@@ -162,7 +178,7 @@
 "## Create Analyzer with Defined Schema\n",
 "Before creating the analyzer, fill in the constant `ANALYZER_ID` with a relevant name for your task. In this example, we generate a unique suffix so that this cell can be run multiple times to create different analyzers.\n",
 "\n",
-"We use **training_data_sas_url** and **training_data_path** as set in the [.env](./.env) file and used in the previous step."
+"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** as set in the [.env](./.env) file and used in the previous step."
 ]
},
{
```

> **Collaborator:** We changed to lower-case in #60 (comment).

> **Collaborator:** @changjian-wang I saw your new commits. Sorry for the confusion; I was referring to this specific line. I mean we're using the lower-case `training_data_sas_url` and `training_data_path` as variables within the notebook, so I think we may not need to mention the .env file here, e.g. "We use training_data_sas_url and training_data_path set in the previous step."
```diff
@@ -171,24 +187,80 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import uuid\n",
-"CUSTOM_ANALYZER_ID = \"train-sample-\" + str(uuid.uuid4())\n",
+"import datetime\n",
+"\n",
+"analyzer_id = f\"analyzer-training-sample-{datetime.now().strftime('%Y%m%d')}-{datetime.now().strftime('%H%M%S')}-{uuid.uuid4().hex[:8]}\"\n",
+"\n",
+"content_analyzer = ContentAnalyzer(\n",
+"    base_analyzer_id=\"prebuilt-documentAnalyzer\",\n",
+"    description=\"Extract useful information from receipt\",\n",
+"    field_schema=FieldSchema(\n",
+"        name=\"receipt schema\",\n",
+"        description=\"Schema for receipt\",\n",
+"        fields={\n",
+"            \"MerchantName\": FieldDefinition(\n",
+"                type=FieldType.STRING,\n",
+"                method=GenerationMethod.EXTRACT,\n",
+"                description=\"\"\n",
+"            ),\n",
+"            \"Items\": FieldDefinition(\n",
+"                type=FieldType.ARRAY,\n",
+"                method=GenerationMethod.GENERATE,\n",
+"                description=\"\",\n",
+"                items_property={\n",
+"                    \"type\": \"object\",\n",
+"                    \"method\": \"extract\",\n",
+"                    \"properties\": {\n",
+"                        \"Quantity\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        },\n",
+"                        \"Name\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        },\n",
+"                        \"Price\": {\n",
+"                            \"type\": \"string\",\n",
+"                            \"method\": \"extract\",\n",
+"                            \"description\": \"\"\n",
+"                        }\n",
+"                    }\n",
+"                }\n",
+"            ),\n",
+"            \"TotalPrice\": FieldDefinition(\n",
+"                type=FieldType.STRING,\n",
+"                method=GenerationMethod.EXTRACT,\n",
+"                description=\"\"\n",
+"            )\n",
+"        }\n",
+"    ),\n",
+"    mode=AnalysisMode.STANDARD,\n",
+"    processing_location=ProcessingLocation.GEOGRAPHY,\n",
+"    tags={\"demo_type\": \"get_result\"},\n",
+"    training_data={\n",
+"        \"kind\": \"blob\",\n",
+"        \"containerUrl\": training_data_sas_url,\n",
+"        \"prefix\": training_data_path\n",
+"    },\n",
+")\n",
+"print(f\"🔧 Creating custom analyzer '{analyzer_id}'...\")\n",
+"poller = await client.content_analyzers.begin_create_or_replace(\n",
+"    analyzer_id=analyzer_id,\n",
+"    resource=content_analyzer,\n",
+")\n",
 "\n",
-"response = client.begin_create_analyzer(\n",
-"    CUSTOM_ANALYZER_ID,\n",
-"    analyzer_template_path=analyzer_template,\n",
-"    training_storage_container_sas_url=training_data_sas_url,\n",
-"    training_storage_container_path_prefix=training_data_path,\n",
+"# Extract operation ID from the poller\n",
+"operation_id = extract_operation_id_from_poller(\n",
+"    poller, PollerType.ANALYZER_CREATION\n",
 ")\n",
-"result = client.poll_result(response)\n",
-"if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",
-"    logging.info(f\"Analyzer details for {result['result']['analyzerId']}\")\n",
-"    logging.info(json.dumps(result, indent=2))\n",
-"else:\n",
-"    logging.warning(\n",
-"        \"An issue was encountered when trying to create the analyzer. \"\n",
-"        \"Please double-check your deployment and configurations for potential problems.\"\n",
-"    )"
+"print(f\"📋 Extracted creation operation ID: {operation_id}\")\n",
+"\n",
+"# Wait for the analyzer to be created\n",
+"print(f\"⏳ Waiting for analyzer creation to complete...\")\n",
+"await poller.result()\n",
+"print(f\"✅ Analyzer '{analyzer_id}' created successfully!\")"
 ]
},
{
```
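One reviewer-level nit on this hunk: the cell adds `import datetime` but then calls `datetime.now()`, which raises `AttributeError` because `now` lives on the `datetime` class, not the module. A corrected, self-contained version of the ID generation would be:

```python
import uuid
from datetime import datetime

# e.g. "analyzer-training-sample-20250611-142530-1a2b3c4d"
now = datetime.now()
analyzer_id = (
    f"analyzer-training-sample-{now.strftime('%Y%m%d')}-"
    f"{now.strftime('%H%M%S')}-{uuid.uuid4().hex[:8]}"
)
```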
```diff
@@ -205,10 +277,53 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"response = client.begin_analyze(CUSTOM_ANALYZER_ID, file_location='../data/receipt.png')\n",
-"result_json = client.poll_result(response)\n",
+"file_path = \"../data/receipt.png\"\n",
+"print(f\"📄 Reading document file: {file_path}\")\n",
+"with open(file_path, \"rb\") as f:\n",
+"    data_content = f.read()\n",
+"\n",
+"# Begin document analysis operation\n",
+"print(f\"🔍 Starting document analysis with analyzer '{analyzer_id}'...\")\n",
+"analysis_poller = await client.content_analyzers.begin_analyze_binary(\n",
+"    analyzer_id=analyzer_id,\n",
+"    input=data_content,\n",
+"    content_type=\"application/octet-stream\")\n",
+"\n",
+"# Wait for analysis completion\n",
+"print(f\"⏳ Waiting for document analysis to complete...\")\n",
+"analysis_result = await analysis_poller.result()\n",
+"print(f\"✅ Document analysis completed successfully!\")\n",
+"\n",
+"# Extract operation ID for get_result\n",
+"analysis_operation_id = extract_operation_id_from_poller(\n",
+"    analysis_poller, PollerType.ANALYZE_CALL\n",
+")\n",
+"print(f\"📋 Extracted analysis operation ID: {analysis_operation_id}\")\n",
+"\n",
+"# Get the analysis result using the operation ID\n",
+"print(\n",
+"    f\"🔍 Getting analysis result using operation ID '{analysis_operation_id}'...\"\n",
+")\n",
+"operation_status = await client.content_analyzers.get_result(\n",
+"    operation_id=analysis_operation_id,\n",
+")\n",
+"\n",
+"print(f\"✅ Analysis result retrieved successfully!\")\n",
+"print(f\"   Operation ID: {operation_status.id}\")\n",
+"print(f\"   Status: {operation_status.status}\")\n",
 "\n",
-"logging.info(json.dumps(result_json, indent=2))"
+"# The actual analysis result is in operation_status.result\n",
+"operation_result = operation_status.result\n",
+"if operation_result is None:\n",
+"    print(\"⚠️ No analysis result available\")\n",
+"\n",
+"print(f\"📄 Analysis Result: {json.dumps(operation_result.as_dict())}\")\n",
+"\n",
+"# Save the analysis result to a file\n",
+"saved_file_path = save_json_to_file(\n",
+"    result=operation_result.as_dict(),\n",
+"    filename_prefix=\"analyzer_training_get_result\",\n",
+")"
 ]
},
{
```
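A second nit: the new cell prints a warning when `operation_status.result` is `None` but then calls `operation_result.as_dict()` unconditionally, which would raise `AttributeError` in exactly that case. A guarded version might read (`save_json_to_file` is the repo helper imported earlier):

```python
import json

operation_result = operation_status.result
if operation_result is None:
    raise RuntimeError("No analysis result available")

result_dict = operation_result.as_dict()
print(f"📄 Analysis Result: {json.dumps(result_dict)}")

# Save the analysis result to a file for later inspection
saved_file_path = save_json_to_file(
    result=result_dict,
    filename_prefix="analyzer_training_get_result",
)
```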
```diff
@@ -225,13 +340,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"client.delete_analyzer(CUSTOM_ANALYZER_ID)"
+"client.content_analyzers.delete(analyzer_id)"
 ]
 }
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "py312",
 "language": "python",
 "name": "python3"
 },
```
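Since `client` here is the async (`aio`) variant, `content_analyzers.delete` presumably returns a coroutine like every other call in this notebook and should be awaited:

```python
# Clean up the trained analyzer once it is no longer needed
await client.content_analyzers.delete(analyzer_id)
```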
```diff
@@ -245,7 +360,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.12"
+"version": "3.12.3"
 }
 },
 "nbformat": 4,
```
> We had some module errors in the auto check. We will need to add the missing modules to requirements.txt.
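Based on the imports this PR introduces, the entries in question would presumably be along these lines (package names inferred from the import namespaces; versions unverified):

```text
azure-ai-contentunderstanding
azure-identity
azure-storage-blob
python-dotenv
```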