# Source code for algomancy_data.file

"""Lightweight file abstractions used by extractors.

This module defines simple wrappers around uploaded or on-disk files and
provides helpers to normalize their contents to strings/JSON that the
extractors can consume.
"""

from abc import ABC
from io import BytesIO
from typing import Dict
import pandas as pd
import json
import base64

from .schema import FileExtension



[docs]
class File(ABC):
    """Base file representation with name, extension, and content source.

    ``content`` holds the file's text. It is taken from the ``content``
    argument when one is given; otherwise it is read from ``path``. When
    neither is supplied, ``content`` stays ``None``.
    """

    def __init__(
        self,
        name: str,
        extension: FileExtension,
        path: str = None,
        content: str = None,
    ):
        """Store the file's identity and resolve its content.

        Args:
            name: Name of the file.
            extension: Extension of the file.
            path: Filesystem path to read from when ``content`` is ``None``.
            content: Ready-to-use content; takes precedence over ``path``.
        """
        self.name: str = name
        self.path: str | None = path
        self.extension: FileExtension = extension
        self.content: str | None = None

        if content is not None:
            self.content = content
        elif path is not None:
            # Dispatches dynamically, so a subclass override of
            # read_contents_from_path() runs here, during construction.
            self.content = self.read_contents_from_path()

    def read_contents_from_path(self) -> str:
        """Read textual contents of the file from ``self.path``.

        The file is decoded as UTF-8 explicitly rather than with the
        platform's default encoding, so the result does not depend on the
        machine's locale.

        Returns:
            The complete text of the file.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(self.path, "r", encoding="utf-8") as f:
            return f.read()





[docs]
class CSVFile(File):
    """CSV file backed by uploader content or a filesystem path.

    Uploader ``content`` is a string of the form ``<header>,<base64 data>``;
    the base64 part is decoded to UTF-8 text and stored as ``self.content``.
    Without ``content`` the text is read from ``path`` by the base class.
    """

    def __init__(
        self,
        name: str,
        path: str = None,
        content: str = None,
    ):
        """Create the file from uploader ``content`` or from ``path``.

        Args:
            name: Name of the file.
            path: Filesystem path; only read when ``content`` is ``None``.
            content: Data-URI string from the uploader; takes precedence
                over ``path``.
        """
        # Only let the base class read the path when it is the content source.
        # Previously the file was read from disk and then immediately replaced
        # by the decoded upload (and construction failed if the path was
        # unreadable, even though the upload would have been used anyway).
        source_path = path if content is None else None
        super().__init__(name, FileExtension.CSV, source_path, None)
        self.path = path
        if content is not None:
            self.content = self._set_content_from_uploader(content)

    @staticmethod
    def _set_content_from_uploader(content: str) -> str:
        """Decode data-URI CSV content and return it as UTF-8 text.

        Args:
            content: String whose part after the first comma is the
                base64-encoded file; the part before the comma is ignored.

        Returns:
            The decoded CSV text.

        Raises:
            ValueError: If ``content`` contains no comma, the payload cannot
                be base64-decoded, or the decoded bytes are not valid UTF-8
                (``binascii.Error`` and ``UnicodeDecodeError`` are both
                ``ValueError`` subclasses).
        """
        # Split on the first comma only: header before it, payload after it.
        _, encoded = content.split(",", 1)
        raw_bytes = base64.b64decode(encoded)
        return raw_bytes.decode("utf-8")




[docs]
class JSONFile(File):
    """JSON file backed by uploader content or a filesystem path.

    Uploader ``content`` is a string of the form ``<header>,<base64 data>``;
    the base64 part is decoded, parsed as JSON and re-serialized, and the
    resulting string is stored as ``self.content``. Without ``content`` the
    text is read from ``path`` by the base class, unchanged.
    """

    def __init__(
        self,
        name: str,
        path: str = None,
        content: str = None,
    ):
        """Create the file from uploader ``content`` or from ``path``.

        Args:
            name: Name of the file.
            path: Filesystem path; only read when ``content`` is ``None``.
            content: Data-URI string from the uploader; takes precedence
                over ``path``.
        """
        # Only let the base class read the path when it is the content source.
        # Previously the file was read from disk and then immediately replaced
        # by the decoded upload (and construction failed if the path was
        # unreadable, even though the upload would have been used anyway).
        source_path = path if content is None else None
        super().__init__(name, FileExtension.JSON, source_path, None)
        self.path = path
        if content is not None:
            self.content = self._set_content_from_uploader(content)

    @staticmethod
    def _set_content_from_uploader(content: str) -> str:
        """Decode data-URI JSON content and return a canonical JSON string.

        Args:
            content: String whose part after the first comma is the
                base64-encoded file; the part before the comma is ignored.

        Returns:
            The parsed document re-serialized with ``json.dumps`` defaults.

        Raises:
            ValueError: If ``content`` contains no comma, the payload cannot
                be base64-decoded, or the decoded bytes are not valid JSON
                (``binascii.Error`` and ``json.JSONDecodeError`` are both
                ``ValueError`` subclasses).
        """
        # Split on the first comma only: header before it, payload after it.
        _, encoded = content.split(",", 1)
        raw_bytes = base64.b64decode(encoded)

        # Parsing validates the upload; dumping it again normalizes the text.
        return json.dumps(json.loads(raw_bytes))




[docs]
class XLSXFile(File):
    """Excel file that exposes its sheets as a JSON payload.

    The content string contains JSON with metadata (sheet names and order)
    and a ``sheets`` mapping where each sheet is a list of records.
    ``index_to_sheet_name`` maps each sheet's position in the workbook to
    its name and is filled whenever a workbook is processed.
    """

    def __init__(self, name: str, path: str = None, content: str = None):
        """Create the file from uploader ``content`` or from ``path``.

        Args:
            name: Name of the file.
            path: Filesystem path; only read when ``content`` is ``None``.
            content: Data-URI string from the uploader; takes precedence
                over ``path``.
        """
        # Must exist BEFORE the base constructor runs: File.__init__ may call
        # read_contents_from_path(), which fills this mapping. Assigning the
        # empty dict afterwards wiped the mapping for path-backed files.
        self.index_to_sheet_name: Dict[int, str] = {}

        # Only let the base class read the path when it is the content source,
        # so a workbook on disk is not parsed just to be replaced by the upload.
        source_path = path if content is None else None
        super().__init__(name, FileExtension.XLSX, source_path, None)
        self.path = path
        if content is not None:
            self.content = self._set_content_from_uploader(content)

    def _set_content_from_uploader(self, content: str) -> str:
        """Decode data-URI XLSX content and convert it to the JSON payload.

        The part of ``content`` after the first comma is base64-decoded and
        opened as a workbook; every sheet is converted to JSON so the XLSX
        extractor can access the appropriate sheet.

        Args:
            content: String whose part after the first comma is the
                base64-encoded workbook; the part before it is ignored.

        Returns:
            The JSON payload produced by ``_process_excel_file``.
        """
        # Split on the first comma only: header before it, payload after it.
        _, encoded = content.split(",", 1)
        workbook_bytes = base64.b64decode(encoded)

        # The workbook is binary, hence BytesIO; the context manager closes
        # the ExcelFile once all sheets have been read.
        with pd.ExcelFile(BytesIO(workbook_bytes)) as excel_file:
            return self._process_excel_file(excel_file)

    def read_contents_from_path(self) -> str:
        """Read and convert the Excel file to the standard JSON payload."""
        # Close the workbook handle as soon as the sheets have been read.
        with pd.ExcelFile(self.path) as excel_file:
            return self._process_excel_file(excel_file)

    def _process_excel_file(self, excel_file):
        """Convert all sheets in the given ``ExcelFile`` to a JSON payload.

        Also replaces ``self.index_to_sheet_name`` with the index-to-name
        mapping of the processed workbook.

        Args:
            excel_file: An open ``pandas.ExcelFile``; it is not closed here.

        Returns:
            A JSON string with a ``metadata`` object (``sheet_count``,
            ``sheet_names``, ``index_to_sheet_name``) and a ``sheets`` object
            mapping each sheet name to its list of row records.
        """
        sheet_names = excel_file.sheet_names

        # Store mapping of index to sheet name
        self.index_to_sheet_name = dict(enumerate(sheet_names))

        # Serialize each sheet with pandas, then parse it back into plain
        # lists/dicts so it can be embedded in the combined payload.
        all_sheets_data = {}
        for sheet_name in sheet_names:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)
            all_sheets_data[sheet_name] = json.loads(df.to_json(orient="records"))

        # Create a combined structure with metadata and all sheets
        result = {
            "metadata": {
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names,
                # json.dumps writes these integer keys as strings.
                "index_to_sheet_name": self.index_to_sheet_name,
            },
            "sheets": all_sheets_data,
        }

        # Return as JSON string
        return json.dumps(result)