diff --git a/test_unstructured_inference/models/test_yolov8.py b/test_unstructured_inference/models/test_yolov8.py new file mode 100644 index 00000000..7616b1ac --- /dev/null +++ b/test_unstructured_inference/models/test_yolov8.py @@ -0,0 +1,99 @@ +import os + +import pytest + +from unstructured_inference.inference.layout import process_file_with_model + + +@pytest.mark.slow() +def test_layout_yolov8_local_parsing_image(): + filename = os.path.join("sample-docs", "test-image.jpg") + # NOTE(benjamin) keep_output = True create a file for each image in + # localstorage for visualization of the result + document_layout = process_file_with_model(filename, model_name="yolov8s", is_image=True) + # NOTE(benjamin) The example image should result in one page result + assert len(document_layout.pages) == 1 + # NOTE(benjamin) The example sent to the test contains 13 detections + types_known = ["Text", "Section-header", "Page-header"] + known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known] + assert len(known_regions) == 13 + assert hasattr( + document_layout.pages[0].elements[0], + "prob", + ) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities + assert isinstance( + document_layout.pages[0].elements[0].prob, + float, + ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float + + +@pytest.mark.slow() +def test_layout_yolov8_local_parsing_pdf(): + filename = os.path.join("sample-docs", "loremipsum.pdf") + document_layout = process_file_with_model(filename, model_name="yolov8s") + assert len(document_layout.pages) == 1 + # NOTE(benjamin) The example sent to the test contains 5 text detections + text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"] + assert len(text_elements) == 5 + assert hasattr( + document_layout.pages[0].elements[0], + "prob", + ) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities + assert isinstance( + document_layout.pages[0].elements[0].prob, + float, + ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float + + +@pytest.mark.slow() +def test_layout_yolov8_local_parsing_empty_pdf(): + filename = os.path.join("sample-docs", "empty-document.pdf") + document_layout = process_file_with_model(filename, model_name="yolov8s") + assert len(document_layout.pages) == 1 + # NOTE(benjamin) The example sent to the test contains 0 detections + assert len(document_layout.pages[0].elements) == 0 + + +######################## +# ONLY SHORT TESTS BELOW +######################## + + +def test_layout_yolov8_local_parsing_image_soft(): + filename = os.path.join("sample-docs", "example_table.jpg") + # NOTE(benjamin) keep_output = True create a file for each image in + # localstorage for visualization of the result + document_layout = process_file_with_model(filename, model_name="yolov8s", is_image=True) + # NOTE(benjamin) The example image should result in one page result + assert len(document_layout.pages) == 1 + # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model + assert len(document_layout.pages[0].elements) > 0 + assert hasattr( + document_layout.pages[0].elements[0], + "prob", + ) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities + assert isinstance( + document_layout.pages[0].elements[0].prob, + float, + ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float + + +def test_layout_yolov8_local_parsing_pdf_soft(): + filename = os.path.join("sample-docs", "loremipsum.pdf") + document_layout = process_file_with_model(filename, model_name="yolov8s") + assert len(document_layout.pages) == 1 + # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model + assert len(document_layout.pages[0].elements) > 0 + assert hasattr( + document_layout.pages[0].elements[0], + "prob", + ) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities + + +def test_layout_yolov8_local_parsing_empty_pdf_soft(): + filename = os.path.join("sample-docs", "empty-document.pdf") + document_layout = process_file_with_model(filename, model_name="yolov8s") + assert len(document_layout.pages) == 1 + # NOTE(benjamin) The example sent to the test contains 0 detections + text_elements_page_1 = [el for el in document_layout.pages[0].elements if el.type != "Image"] + assert len(text_elements_page_1) == 0 diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index d8139ed2..56c9f43d 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -8,6 +8,7 @@ class AnnotationResult(Enum): class Source(Enum): YOLOX = "yolox" + YOLOv8 = "yolov8" DETECTRON2_ONNX = "detectron2_onnx" DETECTRON2_LP = "detectron2_lp" CHIPPER = "chipper" diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py index 26336e23..74752a4d 100644 --- a/unstructured_inference/models/base.py +++ b/unstructured_inference/models/base.py @@ -26,6 +26,12 @@ from unstructured_inference.models.yolox import ( UnstructuredYoloXModel, ) +from unstructured_inference.models.yolov8 import ( + MODEL_TYPES as YOLOV8_MODEL_TYPES, +) +from unstructured_inference.models.yolov8 import ( + UnstructuredYolov8Model, +) DEFAULT_MODEL = "yolox" @@ -35,6 +41,7 @@ **{name: UnstructuredDetectronModel for name in DETECTRON2_MODEL_TYPES}, **{name: UnstructuredDetectronONNXModel for name in DETECTRON2_ONNX_MODEL_TYPES}, **{name: UnstructuredYoloXModel for name in YOLOX_MODEL_TYPES}, + **{name: UnstructuredYolov8Model for name in YOLOV8_MODEL_TYPES}, **{name: UnstructuredChipperModel for name in CHIPPER_MODEL_TYPES}, "super_gradients": UnstructuredSuperGradients, } @@ -65,6 +72,8 @@ def get_model(model_name: Optional[str] = None) -> UnstructuredModel: initialize_params = DETECTRON2_ONNX_MODEL_TYPES[model_name] elif model_name in YOLOX_MODEL_TYPES: initialize_params = YOLOX_MODEL_TYPES[model_name] + elif model_name in YOLOV8_MODEL_TYPES: + initialize_params = YOLOV8_MODEL_TYPES[model_name] elif model_name in CHIPPER_MODEL_TYPES: initialize_params = CHIPPER_MODEL_TYPES[model_name] else: diff --git a/unstructured_inference/models/yolov8.py b/unstructured_inference/models/yolov8.py new file mode 100644 index 00000000..c1ab0601 --- /dev/null +++ b/unstructured_inference/models/yolov8.py @@ -0,0 +1,95 @@ +from typing import List, cast + +import numpy as np +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision.ops import nms + +from unstructured_inference.constants import ElementType, Source +from unstructured_inference.inference.layoutelement import LayoutElement +from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel +from unstructured_inference.utils import LazyDict, LazyEvaluateInfo +from ultralytics import YOLO + +YOLOv8_LABEL_MAP = { + 0: ElementType.CAPTION, + 1: ElementType.FOOTNOTE, + 2: ElementType.FORMULA, + 3: ElementType.LIST_ITEM, + 4: ElementType.PAGE_FOOTER, + 5: ElementType.PAGE_HEADER, + 6: ElementType.PICTURE, + 7: ElementType.SECTION_HEADER, + 8: ElementType.TABLE, + 9: ElementType.TEXT, + 10: ElementType.TITLE, +} + +model = YOLO('/home/joao/yolov8n/weights/best.pt') +MODEL_TYPES = { + "yolov8n": LazyDict( + model_path=LazyEvaluateInfo( + hf_hub_download, + "neuralshift/doc-layout-yolov8n", + "weights/best.pt", + ), + label_map=YOLOv8_LABEL_MAP, + ), + "yolov8s": LazyDict( + model_path=LazyEvaluateInfo( + hf_hub_download, + "neuralshift/doc-layout-yolov8s", + "weights/best.pt", + ), + label_map=YOLOv8_LABEL_MAP, + ), +} + + +class UnstructuredYolov8Model(UnstructuredObjectDetectionModel): + def predict(self, x: Image): + """Predict using Yolov8 model.""" + super().predict(x) + return self.image_processing(x) + + def initialize(self, model_path: str, label_map: dict): + """Start inference session for Yolov8 model.""" + self.model = YOLO(model=model_path) + self.layout_classes = label_map + + def image_processing( + self, + image: Image = None, + ) -> List[LayoutElement]: + """Method runing Yolov8 for layout detection, returns a list of + LayoutElement + ---------- + image + Image to process + """ + input_shape = (640, 640) + processed_image = image.resize(input_shape, Image.BILINEAR) + ratio = np.array(input_shape) / np.array(image.size) + + # NMS + boxes = self.model(processed_image, verbose=False)[0].boxes + valid_boxes = nms(boxes.xyxy, boxes.conf, 0.1) + boxes = boxes[valid_boxes] + boxes = boxes[boxes.conf > 0.3] + + regions = sorted([ + LayoutElement.from_coords( + box.xyxy[0][0].item() / ratio[0], + box.xyxy[0][1].item() / ratio[1], + box.xyxy[0][2].item() / ratio[0], + box.xyxy[0][3].item() / ratio[1], + text=None, + type=self.layout_classes[int(box.cls.item())], + prob=box.conf.item(), + source=Source.YOLOv8, + ) for box in boxes + ], key=lambda element: element.bbox.y1) + + page_layout = cast(List[LayoutElement], regions) # TODO(benjamin): encode image as base64? + + return page_layout