From ce27b2bcd1aeb0896f1946670f69cff758809533 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Mon, 10 Feb 2025 23:08:56 -0800 Subject: [PATCH 1/3] Suggested changes (code untested) to tutorial. --- docparse/async_tutorial.mdx | 42 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docparse/async_tutorial.mdx b/docparse/async_tutorial.mdx index b171050..c38aec7 100644 --- a/docparse/async_tutorial.mdx +++ b/docparse/async_tutorial.mdx @@ -13,9 +13,9 @@ The following functions can be used together to send files to DocParse to partit Here’s an example of how you can process multiple files at the same time: ```python +import os import time from aryn_sdk.partition import partition_file_async_submit, partition_file_async_result -import os ## Get a list of all files you are interested in parsing files = os.walk()[2] @@ -25,28 +25,28 @@ task_ids = [None] * len(files) ## and create a list of running tasks for i, file_name in enumerate(files): try: - task_ids[i] = partition_file_async_submit(open(file_name, 'rb'))["task_id"] + task_ids[i] = partition_file_async_submit(file_name))["task_id"] except Exception as e: print(f"Failed to submit {f}: {e}") results = [None] * len(files) ## Wait for all tasks to finish -for i, task_id in enumerate(task_ids): - while True: +count = len(task_ids) +while count > 0: + for i, task_id in enumerate(task_ids): result = partition_file_async_result(task_id) - # if particular task is done, break - if result["status"] != "pending": + status = result["status"] + # if particular task is done, note it + if status != "pending": print(f"Task {task_id} done.") - break - - # else sleep - time.sleep(1) - - if result["status"] == "done": - results[i] = result - -## print the results will be None if task failed + count -= 1 + if status == "done": + results[i] = result + else: + time.sleep(1) + +## print the results; will be None if task failed for result in results: print(result) ``` @@ -56,8 +56,8 @@ for 
result in results: Optionally, you can also set a webhook for Aryn's services to call when your task is completed: ```python -f = open("path/to/my/file.docx") -partition_file_async_submit(f, webhook_url="https://example.com/alert") +fn = "path/to/my/file.docx" +partition_file_async_submit(fn, webhook_url="https://example.com/alert") ``` Aryn will POST a request containing a body like the below to the webhook URL: @@ -66,6 +66,8 @@ Aryn will POST a request containing a body like the below to the webhook URL: {"done": [{"task_id": "aryn:t-47gpd3604e5tz79z1jro5fc"}]} ``` +## List + If you want to list all the asynchronous partition_file tasks that are running and not yet completed in your account, you can call the following function: ```python @@ -78,6 +80,8 @@ If you want to list all the asynchronous partition_file tasks that are running a 'aryn:t-luldbdt5d2kn8cact61mao8': {'state': 'pending'}} ``` +## Cancel + If you want to cancel a particular asynchronous partition_file task you can call the following function: ```python @@ -87,7 +91,7 @@ True ## Using cURL -You can also call the async APIs directly without python through cURL: +You can also call the async APIs directly without Python through cURL: ```bash curl -X POST https://api.aryn.cloud/v1/async/submit/document/partition -F "file=@path/to/file.pdf" -F 'options={"use_ocr": true}' -H "Authorization: Bearer MY_ARYN_API_KEY" @@ -109,4 +113,4 @@ You can cancel a pending task with the command below: ```bash curl -X POST https://api.aryn.cloud/v1/async/cancel/aryn:t-bipwgqesiqixfz2kyew3j8d -H "Authorization: Bearer MY_ARYN_API_KEY" -``` \ No newline at end of file +``` From 7fd3f6a3e72b10de12ca238ac6f6edbab7c9d29d Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Mon, 10 Feb 2025 23:24:08 -0800 Subject: [PATCH 2/3] fixup --- docparse/async_tutorial.mdx | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/docparse/async_tutorial.mdx b/docparse/async_tutorial.mdx index 
c38aec7..73f98b2 100644 --- a/docparse/async_tutorial.mdx +++ b/docparse/async_tutorial.mdx @@ -25,9 +25,9 @@ task_ids = [None] * len(files) ## and create a list of running tasks for i, file_name in enumerate(files): try: - task_ids[i] = partition_file_async_submit(file_name))["task_id"] + task_ids[i] = partition_file_async_submit(file_name))["task_id"] except Exception as e: - print(f"Failed to submit {f}: {e}") + print(f"Failed to submit {f}: {e}") results = [None] * len(files) @@ -35,16 +35,18 @@ results = [None] * len(files) count = len(task_ids) while count > 0: for i, task_id in enumerate(task_ids): - result = partition_file_async_result(task_id) - status = result["status"] - # if particular task is done, note it - if status != "pending": - print(f"Task {task_id} done.") - count -= 1 - if status == "done": - results[i] = result - else: - time.sleep(1) + if task_id: + result = partition_file_async_result(task_id) + status = result["status"] + # if particular task is done, note it + if status != "pending": + print(f"Task {task_id} done.") + count -= 1 + task_ids[i] == None + if status == "done": + results[i] = result + else: + time.sleep(1) ## print the results; will be None if task failed for result in results: From 774e4824427d1175841cc8a9b705aad93587d7a9 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Mon, 10 Feb 2025 23:49:27 -0800 Subject: [PATCH 3/3] Improved waiting via maps (needs testing). 
--- docparse/async_tutorial.mdx | 45 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/docparse/async_tutorial.mdx b/docparse/async_tutorial.mdx index 73f98b2..002793e 100644 --- a/docparse/async_tutorial.mdx +++ b/docparse/async_tutorial.mdx @@ -19,38 +19,37 @@ from aryn_sdk.partition import partition_file_async_submit, partition_file_async ## Get a list of all files you are interested in parsing files = os.walk()[2] -task_ids = [None] * len(files) ## Iterate through all the files to submit a task to partition each file ## and create a list of running tasks -for i, file_name in enumerate(files): +task_to_file = {} +for file_name in files: try: - task_ids[i] = partition_file_async_submit(file_name))["task_id"] + task_id = partition_file_async_submit(file_name)["task_id"] + task_to_file[task_id] = file_name except Exception as e: - print(f"Failed to submit {f}: {e}") - -results = [None] * len(files) + print(f"Failed to submit {file_name}: {e}") ## Wait for all tasks to finish -count = len(task_ids) -while count > 0: - for i, task_id in enumerate(task_ids): - if task_id: - result = partition_file_async_result(task_id) - status = result["status"] - # if particular task is done, note it - if status != "pending": - print(f"Task {task_id} done.") - count -= 1 - task_ids[i] == None - if status == "done": - results[i] = result - else: - time.sleep(1) +file_to_result = {} +while task_to_file: + retired = [] + for task_id, file_name in task_to_file.items(): + result = partition_file_async_result(task_id) + status = result["status"] + # if particular task is done, note it + if status != "pending": + print(f"Task {task_id} for {file_name} done.") + retired.append(task_id) + file_to_result[file_name] = result if status == "done" else None + else: + time.sleep(1) + for task_id in retired: + del task_to_file[task_id] ## print the results; will be None if task failed -for result in results: - print(result) +for 
file_name, result in file_to_result.items(): + print(file_name, result) ``` ## Using a webHook