@@ -24,103 +24,103 @@ def doc_path() -> Path:
2424 return Path (__file__ ).resolve ().parents [2 ] / "_sample_docs"
2525
2626
27- @pytest .mark .parametrize ("split_pdf" , [True , False ])
28- @pytest .mark .parametrize ("strategy" , ["fast" , "ocr_only" , "hi_res" ])
29- def test_partition_strategies (split_pdf , strategy , client , doc_path ):
30- filename = "layout-parser-paper-fast.pdf"
31- with open (doc_path / filename , "rb" ) as f :
32- files = shared .Files (
33- content = f .read (),
34- file_name = filename ,
35- )
36-
37- req = operations .PartitionRequest (
38- partition_parameters = shared .PartitionParameters (
39- files = files ,
40- strategy = strategy ,
41- languages = ["eng" ],
42- split_pdf_page = split_pdf ,
43- )
44- )
45-
46- response = client .general .partition (
47- request = req
48- )
49- assert response .status_code == 200
50- assert len (response .elements )
51-
52-
53- @pytest .mark .parametrize ("split_pdf" , [True , False ])
54- @pytest .mark .parametrize ("error" , [(500 , ServerError ), (403 , SDKError ), (422 , HTTPValidationError )])
55- def test_partition_handling_server_error (error , split_pdf , monkeypatch , doc_path ):
56- """
57- Mock different error responses, assert that the client throws the correct error
58- """
59- filename = "layout-parser-paper-fast.pdf"
60- import httpx
61-
62- error_code , sdk_raises = error
63-
64- # Create the mock response
65- json_data = {"detail" : "An error occurred" }
66- response = httpx .Response (
67- status_code = error_code ,
68- headers = {'Content-Type' : 'application/json' },
69- content = json .dumps (json_data ),
70- request = httpx .Request ("POST" , "http://mock-request" ),
71- )
72-
73- monkeypatch .setattr (httpx .AsyncClient , "send" , lambda * args , ** kwargs : response )
74- monkeypatch .setattr (httpx .Client , "send" , lambda * args , ** kwargs : response )
75-
76- # initialize client after patching
77- client = UnstructuredClient (
78- api_key_auth = os .getenv ("UNSTRUCTURED_API_KEY" ),
79- retry_config = RetryConfig ("backoff" , BackoffStrategy (1 , 10 , 1.5 , 30 ), False ),
80- )
81-
82- with open (doc_path / filename , "rb" ) as f :
83- files = shared .Files (
84- content = f .read (),
85- file_name = filename ,
86- )
87-
88- req = operations .PartitionRequest (
89- partition_parameters = shared .PartitionParameters (
90- files = files ,
91- strategy = "fast" ,
92- languages = ["eng" ],
93- split_pdf_page = split_pdf ,
94- )
95- )
96-
97- with pytest .raises (sdk_raises ):
98- response = client .general .partition (
99- request = req
100- )
101-
102-
103- @pytest .mark .asyncio
104- async def test_partition_async_returns_elements (client , doc_path ):
105- filename = "layout-parser-paper.pdf"
106- with open (doc_path / filename , "rb" ) as f :
107- files = shared .Files (
108- content = f .read (),
109- file_name = filename ,
110- )
111-
112- req = operations .PartitionRequest (
113- partition_parameters = shared .PartitionParameters (
114- files = files ,
115- strategy = "fast" ,
116- languages = ["eng" ],
117- split_pdf_page = True ,
118- )
119- )
120-
121- response = await client .general .partition_async (request = req )
122- assert response .status_code == 200
123- assert len (response .elements )
27+ # @pytest.mark.parametrize("split_pdf", [True, False])
28+ # @pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
29+ # def test_partition_strategies(split_pdf, strategy, client, doc_path):
30+ # filename = "layout-parser-paper-fast.pdf"
31+ # with open(doc_path / filename, "rb") as f:
32+ # files = shared.Files(
33+ # content=f.read(),
34+ # file_name=filename,
35+ # )
36+
37+ # req = operations.PartitionRequest(
38+ # partition_parameters=shared.PartitionParameters(
39+ # files=files,
40+ # strategy=strategy,
41+ # languages=["eng"],
42+ # split_pdf_page=split_pdf,
43+ # )
44+ # )
45+
46+ # response = client.general.partition(
47+ # request=req
48+ # )
49+ # assert response.status_code == 200
50+ # assert len(response.elements)
51+
52+
53+ # @pytest.mark.parametrize("split_pdf", [True, False])
54+ # @pytest.mark.parametrize("error", [(500, ServerError), (403, SDKError), (422, HTTPValidationError)])
55+ # def test_partition_handling_server_error(error, split_pdf, monkeypatch, doc_path):
56+ # """
57+ # Mock different error responses, assert that the client throws the correct error
58+ # """
59+ # filename = "layout-parser-paper-fast.pdf"
60+ # import httpx
61+
62+ # error_code, sdk_raises = error
63+
64+ # # Create the mock response
65+ # json_data = {"detail": "An error occurred"}
66+ # response = httpx.Response(
67+ # status_code=error_code,
68+ # headers={'Content-Type': 'application/json'},
69+ # content=json.dumps(json_data),
70+ # request=httpx.Request("POST", "http://mock-request"),
71+ # )
72+
73+ # monkeypatch.setattr(httpx.AsyncClient, "send", lambda *args, **kwargs: response)
74+ # monkeypatch.setattr(httpx.Client, "send", lambda *args, **kwargs: response)
75+
76+ # # initialize client after patching
77+ # client = UnstructuredClient(
78+ # api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
79+ # retry_config=RetryConfig("backoff", BackoffStrategy(1, 10, 1.5, 30), False),
80+ # )
81+
82+ # with open(doc_path / filename, "rb") as f:
83+ # files = shared.Files(
84+ # content=f.read(),
85+ # file_name=filename,
86+ # )
87+
88+ # req = operations.PartitionRequest(
89+ # partition_parameters=shared.PartitionParameters(
90+ # files=files,
91+ # strategy="fast",
92+ # languages=["eng"],
93+ # split_pdf_page=split_pdf,
94+ # )
95+ # )
96+
97+ # with pytest.raises(sdk_raises):
98+ # response = client.general.partition(
99+ # request=req
100+ # )
101+
102+
103+ # @pytest.mark.asyncio
104+ # async def test_partition_async_returns_elements(client, doc_path):
105+ # filename = "layout-parser-paper.pdf"
106+ # with open(doc_path / filename, "rb") as f:
107+ # files = shared.Files(
108+ # content=f.read(),
109+ # file_name=filename,
110+ # )
111+
112+ # req = operations.PartitionRequest(
113+ # partition_parameters=shared.PartitionParameters(
114+ # files=files,
115+ # strategy="fast",
116+ # languages=["eng"],
117+ # split_pdf_page=True,
118+ # )
119+ # )
120+
121+ # response = await client.general.partition_async(request=req)
122+ # assert response.status_code == 200
123+ # assert len(response.elements)
124124
125125
126126@pytest .mark .asyncio
@@ -257,88 +257,88 @@ def test_partition_strategy_vlm_openai(split_pdf, vlm_model, vlm_model_provider,
257257 assert response .elements [0 ]["metadata" ]["partitioner_type" ] == "vlm_partition"
258258
259259
260- @pytest .mark .parametrize ("split_pdf" , [True , False ])
261- @pytest .mark .parametrize ("vlm_model" ,
262- [
263- "us.amazon.nova-pro-v1:0" ,
264- "us.amazon.nova-lite-v1:0" ,
265- "us.anthropic.claude-3-5-sonnet-20241022-v2:0" ,
266- "us.anthropic.claude-3-opus-20240229-v1:0" ,
267- "us.anthropic.claude-3-haiku-20240307-v1:0" ,
268- "us.anthropic.claude-3-sonnet-20240229-v1:0" ,
269- "us.meta.llama3-2-90b-instruct-v1:0" ,
270- "us.meta.llama3-2-11b-instruct-v1:0" ,
271- ]
272- )
273- @pytest .mark .parametrize ("vlm_model_provider" , ["bedrock" ])
274- @pytest .mark .parametrize (
275- "filename" ,
276- [
277- "layout-parser-paper-fast.pdf" ,
278- "fake-power-point.ppt" ,
279- "embedded-images-tables.jpg" ,
280- ]
281- )
282- def test_partition_strategy_vlm_bedrock (split_pdf , vlm_model , vlm_model_provider , client , doc_path , filename ):
283- with open (doc_path / filename , "rb" ) as f :
284- files = shared .Files (
285- content = f .read (),
286- file_name = filename ,
287- )
288-
289- req = operations .PartitionRequest (
290- partition_parameters = shared .PartitionParameters (
291- files = files ,
292- strategy = "vlm" ,
293- vlm_model = vlm_model ,
294- vlm_model_provider = vlm_model_provider ,
295- languages = ["eng" ],
296- split_pdf_page = split_pdf ,
297- )
298- )
299-
300- response = client .general .partition (
301- request = req
302- )
303- assert response .status_code == 200
304- assert len (response .elements ) > 0
305- assert response .elements [0 ]["metadata" ]["partitioner_type" ] == "vlm_partition"
306-
307- @pytest .mark .parametrize ("split_pdf" , [True , False ])
308- @pytest .mark .parametrize ("vlm_model" , ["claude-3-5-sonnet-20241022" ,])
309- @pytest .mark .parametrize ("vlm_model_provider" , ["anthropic" ])
310- @pytest .mark .parametrize (
311- "filename" ,
312- [
313- "layout-parser-paper-fast.pdf" ,
314- "fake-power-point.ppt" ,
315- "embedded-images-tables.jpg" ,
316- ]
317- )
318- def test_partition_strategy_vlm_anthropic (split_pdf , vlm_model , vlm_model_provider , client , doc_path , filename ):
319- with open (doc_path / filename , "rb" ) as f :
320- files = shared .Files (
321- content = f .read (),
322- file_name = filename ,
323- )
324-
325- req = operations .PartitionRequest (
326- partition_parameters = shared .PartitionParameters (
327- files = files ,
328- strategy = "vlm" ,
329- vlm_model = vlm_model ,
330- vlm_model_provider = vlm_model_provider ,
331- languages = ["eng" ],
332- split_pdf_page = split_pdf ,
333- )
334- )
335-
336- response = client .general .partition (
337- request = req
338- )
339- assert response .status_code == 200
340- assert len (response .elements ) > 0
341- assert response .elements [0 ]["metadata" ]["partitioner_type" ] == "vlm_partition"
260+ # @pytest.mark.parametrize("split_pdf", [True, False])
261+ # @pytest.mark.parametrize("vlm_model",
262+ # [
263+ # "us.amazon.nova-pro-v1:0",
264+ # "us.amazon.nova-lite-v1:0",
265+ # "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
266+ # "us.anthropic.claude-3-opus-20240229-v1:0",
267+ # "us.anthropic.claude-3-haiku-20240307-v1:0",
268+ # "us.anthropic.claude-3-sonnet-20240229-v1:0",
269+ # "us.meta.llama3-2-90b-instruct-v1:0",
270+ # "us.meta.llama3-2-11b-instruct-v1:0",
271+ # ]
272+ # )
273+ # @pytest.mark.parametrize("vlm_model_provider", ["bedrock"])
274+ # @pytest.mark.parametrize(
275+ # "filename",
276+ # [
277+ # "layout-parser-paper-fast.pdf",
278+ # "fake-power-point.ppt",
279+ # "embedded-images-tables.jpg",
280+ # ]
281+ # )
282+ # def test_partition_strategy_vlm_bedrock(split_pdf, vlm_model, vlm_model_provider, client, doc_path, filename):
283+ # with open(doc_path / filename, "rb") as f:
284+ # files = shared.Files(
285+ # content=f.read(),
286+ # file_name=filename,
287+ # )
288+
289+ # req = operations.PartitionRequest(
290+ # partition_parameters=shared.PartitionParameters(
291+ # files=files,
292+ # strategy="vlm",
293+ # vlm_model=vlm_model,
294+ # vlm_model_provider=vlm_model_provider,
295+ # languages=["eng"],
296+ # split_pdf_page=split_pdf,
297+ # )
298+ # )
299+
300+ # response = client.general.partition(
301+ # request=req
302+ # )
303+ # assert response.status_code == 200
304+ # assert len(response.elements) > 0
305+ # assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
306+
307+ # @pytest.mark.parametrize("split_pdf", [True, False])
308+ # @pytest.mark.parametrize("vlm_model", ["claude-3-5-sonnet-20241022",])
309+ # @pytest.mark.parametrize("vlm_model_provider", ["anthropic"])
310+ # @pytest.mark.parametrize(
311+ # "filename",
312+ # [
313+ # "layout-parser-paper-fast.pdf",
314+ # "fake-power-point.ppt",
315+ # "embedded-images-tables.jpg",
316+ # ]
317+ # )
318+ # def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provider, client, doc_path, filename):
319+ # with open(doc_path / filename, "rb") as f:
320+ # files = shared.Files(
321+ # content=f.read(),
322+ # file_name=filename,
323+ # )
324+
325+ # req = operations.PartitionRequest(
326+ # partition_parameters=shared.PartitionParameters(
327+ # files=files,
328+ # strategy="vlm",
329+ # vlm_model=vlm_model,
330+ # vlm_model_provider=vlm_model_provider,
331+ # languages=["eng"],
332+ # split_pdf_page=split_pdf,
333+ # )
334+ # )
335+
336+ # response = client.general.partition(
337+ # request=req
338+ # )
339+ # assert response.status_code == 200
340+ # assert len(response.elements) > 0
341+ # assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
342342
343343
344344def test_returns_422_for_invalid_pdf (
0 commit comments