Source code for aws_textract_pipeline.workspace

# -*- coding: utf-8 -*-

"""
Define the S3 workspace layout: the root directory and the per-stage folders
and object paths used by the pipeline.
"""

import dataclasses

from s3pathlib import S3Path


@dataclasses.dataclass
class Workspace:
    """
    Map one S3 root directory onto the folders used by each pipeline stage.

    Every directory property and path getter below is derived from
    ``s3dir_uri``; nothing else is stored on the instance.

    :param s3dir_uri: the root S3 directory URI. All the S3 paths are
        relative to this directory.
    """

    s3dir_uri: str

    # fmt: off
    def _stage_dir(self, folder: str) -> S3Path:
        """Return ``<root>/<folder>/`` as a directory-style S3 path."""
        stage = self.s3dir.joinpath(folder)
        return stage.to_dir()

    # --- directories ---------------------------------------------------
    @property
    def s3dir(self) -> S3Path:
        """Root directory, parsed from ``s3dir_uri``."""
        root = S3Path.from_s3_uri(self.s3dir_uri)
        return root.to_dir()

    @property
    def s3dir_landing(self) -> S3Path:
        """``010-landing/`` under the root."""
        return self._stage_dir("010-landing")

    @property
    def s3dir_raw(self) -> S3Path:
        """``020-raw/`` under the root."""
        return self._stage_dir("020-raw")

    @property
    def s3dir_component(self) -> S3Path:
        """``030-component/`` under the root."""
        return self._stage_dir("030-component")

    @property
    def s3dir_image(self) -> S3Path:
        """``040-image/`` under the root."""
        return self._stage_dir("040-image")

    @property
    def s3dir_textract_output(self) -> S3Path:
        """``050-textract-output/`` under the root."""
        return self._stage_dir("050-textract-output")

    @property
    def s3dir_text(self) -> S3Path:
        """``060-text/`` under the root."""
        return self._stage_dir("060-text")

    @property
    def s3dir_json(self) -> S3Path:
        """``070-json/`` under the root."""
        return self._stage_dir("070-json")

    @property
    def s3dir_extracted_data(self) -> S3Path:
        """``080-extracted-data/`` under the root."""
        return self._stage_dir("080-extracted-data")

    @property
    def s3dir_extracted_data_hil_output(self) -> S3Path:
        """``090-extracted-data-hil-output/`` under the root."""
        return self._stage_dir("090-extracted-data-hil-output")

    @property
    def s3dir_extracted_data_hil_post_process(self) -> S3Path:
        """``090-extracted-data-hil-post-process/`` under the root."""
        # NOTE(review): this folder reuses the "090-" prefix of the HIL output
        # folder instead of continuing the numbering -- confirm it is intended.
        return self._stage_dir("090-extracted-data-hil-post-process")

    # --- per-document / per-component paths ----------------------------
    def get_raw_s3path(self, doc_id: str) -> S3Path:
        """Return ``<s3dir_raw>/<doc_id>``."""
        parent = self.s3dir_raw
        return parent.joinpath(doc_id)

    def get_component_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_component>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_component
        return parent.joinpath(doc_id, comp_id)

    def get_image_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_image>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_image
        return parent.joinpath(doc_id, comp_id)

    def get_textract_output_s3dir(self, doc_id: str, comp_id: str) -> S3Path:
        """Return the directory ``<s3dir_textract_output>/<doc_id>/<comp_id>/``."""
        parent = self.s3dir_textract_output
        return parent.joinpath(doc_id, comp_id).to_dir()

    def get_text_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_text>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_text
        return parent.joinpath(doc_id, comp_id)

    def get_json_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_json>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_json
        return parent.joinpath(doc_id, comp_id)

    def get_extracted_data_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_extracted_data>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_extracted_data
        return parent.joinpath(doc_id, comp_id)

    def get_extracted_data_hil_output_s3dir(self, doc_id: str, comp_id: str) -> S3Path:
        """Return the directory ``<s3dir_extracted_data_hil_output>/<doc_id>/<comp_id>/``."""
        parent = self.s3dir_extracted_data_hil_output
        return parent.joinpath(doc_id, comp_id).to_dir()

    def get_extracted_data_hil_post_process_s3path(self, doc_id: str, comp_id: str) -> S3Path:
        """Return ``<s3dir_extracted_data_hil_post_process>/<doc_id>/<comp_id>``."""
        parent = self.s3dir_extracted_data_hil_post_process
        return parent.joinpath(doc_id, comp_id)
# fmt: on