Source code for aws_textract_pipeline.segment

# -*- coding: utf-8 -*-

"""
This module provides utilities to segment a document into components.

See:

- :class:`SegmentPdfResult`
- :func:`segment_pdf`
"""

import typing as T
import io
import dataclasses

import fitz
from .vendor.better_dataclasses import DataClass


@dataclasses.dataclass
class SegmentPdfResult(DataClass):
    """
    Container for the per-page output of :func:`segment_pdf`.

    Both lists default to empty. :func:`segment_pdf` appends one entry per
    page to each of them, in page order.

    :param page_pdf_list: ``fitz.Document`` objects, one single-page PDF
        for each page.
    :param page_image_list: ``fitz.Pixmap`` objects, one rendered image
        for each page.

    Writing a page PDF (a ``fitz.Document``) to disk::

        >>> res = SegmentPdfResult(...)
        >>> page = res.page_pdf_list[0]
        >>> page.save("/path/to/save/page.pdf")

    Writing a page image (a ``fitz.Pixmap``) to disk::

        >>> res = SegmentPdfResult(...)
        >>> pixmap = res.page_image_list[0]
        >>> pixmap.save("/path/to/save/image.png", output="png")

    Reading the pixel dimensions of a page image::

        >>> pixmap.width
        >>> pixmap.height
    """

    page_pdf_list: T.List[fitz.Document] = dataclasses.field(default_factory=list)
    page_image_list: T.List[fitz.Pixmap] = dataclasses.field(default_factory=list)


def segment_pdf(
    pdf_content: bytes,
    dpi: int = 200,
) -> SegmentPdfResult:
    """
    Split a PDF into single-page PDF documents and per-page images.

    The input is first re-serialized with ``clean=True, garbage=4`` and
    re-opened; every page is then taken from that re-opened copy.

    :param pdf_content: PDF content in bytes.
    :param dpi: DPI passed to ``page.get_pixmap`` when rendering each page.

    :return: a :class:`SegmentPdfResult` whose ``page_pdf_list`` and
        ``page_image_list`` each hold one entry per page, in page order.
    """
    # load the caller's PDF from memory
    source_doc = fitz.Document(stream=pdf_content)

    # Re-serialize through an in-memory buffer and re-open the result, to
    # repair any issues (hopefully) before we hit them.
    # See https://github.com/pymupdf/PyMuPDF/issues/856
    with io.BytesIO() as scratch:
        scratch.write(source_doc.write(clean=True, garbage=4))
        repaired_bytes = scratch.getvalue()
    repaired_doc = fitz.Document(stream=repaired_bytes)

    single_page_docs = []
    page_pixmaps = []
    for index, page in enumerate(repaired_doc):
        # copy exactly this one page into a fresh, empty document
        one_page_doc = fitz.Document()
        one_page_doc.insert_pdf(
            repaired_doc,
            from_page=index,
            to_page=index,
        )
        single_page_docs.append(one_page_doc)

        # render the same page as an image at the requested DPI
        page_pixmaps.append(page.get_pixmap(dpi=dpi))

    return SegmentPdfResult(
        page_pdf_list=single_page_docs,
        page_image_list=page_pixmaps,
    )


def segment_word(
    word_content: bytes,
):  # pragma: no cover
    """
    Placeholder for segmenting a Word document; not implemented.

    :param word_content: presumably the Word document content in bytes
        (unused by this stub).

    :raises NotImplementedError: always.
    """
    # Name the entry point in the message so the caller can tell which
    # document format is unsupported.
    raise NotImplementedError("segment_word is not implemented")


def segment_excel(
    excel_content: bytes,
):  # pragma: no cover
    """
    Placeholder for segmenting an Excel document; not implemented.

    :param excel_content: presumably the Excel document content in bytes
        (unused by this stub).

    :raises NotImplementedError: always.
    """
    raise NotImplementedError("segment_excel is not implemented")


def segment_ppt(
    ppt_content: bytes,
):  # pragma: no cover
    """
    Placeholder for segmenting a PowerPoint document; not implemented.

    :param ppt_content: presumably the PowerPoint document content in
        bytes (unused by this stub).

    :raises NotImplementedError: always.
    """
    raise NotImplementedError("segment_ppt is not implemented")