"""Lightweight file abstractions used by extractors.
This module defines simple wrappers around uploaded or on-disk files and
provides helpers to normalize their contents to strings/JSON that the
extractors can consume.
"""
from abc import ABC
from io import BytesIO
from typing import Dict
import pandas as pd
import json
import base64
from .schema import FileExtension
[docs]
class File(ABC):
    """Base file representation with name, extension, and content source.

    ``content`` takes precedence over ``path``: the file is read from disk
    only when no content is supplied. When neither is given, ``content``
    stays ``None``.
    """

    def __init__(
        self,
        name: str,
        extension: FileExtension,
        path: str = None,
        content: str = None,
    ):
        """Store the file's identity and resolve its textual content.

        Args:
            name: Name of the file.
            extension: Extension tag describing the file type.
            path: Optional filesystem location to read the content from.
            content: Optional already-available content; when given, ``path``
                is stored but not read.
        """
        self.name: str = name
        self.path: str | None = path
        self.extension: FileExtension = extension
        self.content: str | None = None
        if content is not None:
            self.content = content
        elif path is not None:
            # Goes through the (overridable) reader so subclasses can supply
            # their own on-disk parsing.
            self.content = self.read_contents_from_path()

    def read_contents_from_path(self) -> str:
        """Read textual contents of the file from ``self.path``.

        Returns:
            The complete file contents as a string.
        """
        # Explicit UTF-8 so the result does not depend on the platform's
        # locale default encoding.
        with open(self.path, "r", encoding="utf-8") as f:
            return f.read()
[docs]
class CSVFile(File):
    """CSV file backed by uploader content or a filesystem path.

    Uploader content is expected in ``<header>,<base64 payload>`` form (a data
    URI) and is decoded to UTF-8 text. When both ``content`` and ``path`` are
    given, the uploaded content wins and the path is stored but not read.
    """

    def __init__(
        self,
        name: str,
        path: str = None,
        content: str = None,
    ):
        """Build the CSV file from uploader content or from ``path``.

        Args:
            name: Name of the file.
            path: Optional filesystem location of the CSV.
            content: Optional data-URI string produced by the uploader.
        """
        # Decode before delegating: File.__init__ only reads ``path`` when it
        # receives no content, so this avoids a pointless (and possibly
        # failing) disk read whose result would be overwritten anyway.
        text = None
        if content is not None:
            text = self._set_content_from_uploader(content)
        super().__init__(name, FileExtension.CSV, path, text)

    @staticmethod
    def _set_content_from_uploader(content: str) -> str:
        """Decode data-URI CSV content and return it as UTF-8 text.

        Args:
            content: String of the form ``<header>,<base64 payload>``.

        Returns:
            The decoded CSV text.

        Raises:
            ValueError: If ``content`` contains no comma separating the header
                from the payload.
            UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
        """
        # Everything up to the first comma is the data-URI header; only the
        # base64 payload after it is needed.
        _, encoded = content.split(",", 1)
        return base64.b64decode(encoded).decode("utf-8")
[docs]
class JSONFile(File):
    """JSON file backed by uploader content or a filesystem path.

    Uploader content is expected in ``<header>,<base64 payload>`` form (a data
    URI); it is parsed and re-serialized with ``json.dumps``. When both
    ``content`` and ``path`` are given, the uploaded content wins and the path
    is stored but not read.
    """

    def __init__(
        self,
        name: str,
        path: str = None,
        content: str = None,
    ):
        """Build the JSON file from uploader content or from ``path``.

        Args:
            name: Name of the file.
            path: Optional filesystem location of the JSON document.
            content: Optional data-URI string produced by the uploader.
        """
        # Decode before delegating: File.__init__ only reads ``path`` when it
        # receives no content, so this avoids a pointless (and possibly
        # failing) disk read whose result would be overwritten anyway.
        text = None
        if content is not None:
            text = self._set_content_from_uploader(content)
        super().__init__(name, FileExtension.JSON, path, text)

    @staticmethod
    def _set_content_from_uploader(content: str) -> str:
        """Decode data-URI JSON content and return a canonical JSON string.

        Args:
            content: String of the form ``<header>,<base64 payload>``.

        Returns:
            The payload re-serialized by ``json.dumps``.

        Raises:
            ValueError: If ``content`` contains no comma separating the header
                from the payload.
            json.JSONDecodeError: If the decoded payload is not valid JSON.
        """
        # Everything up to the first comma is the data-URI header; only the
        # base64 payload after it is needed.
        _, encoded = content.split(",", 1)
        # Round-trip through the parser so invalid JSON fails here and the
        # stored text has a uniform serialization.
        parsed = json.loads(base64.b64decode(encoded))
        return json.dumps(parsed)
[docs]
class XLSXFile(File):
    """Excel file that exposes its sheets as a JSON payload.

    The content string contains JSON with metadata (sheet names and order)
    and a ``sheets`` mapping where each sheet is a list of records.
    ``index_to_sheet_name`` maps each sheet's position to its name and is
    refreshed every time a workbook is processed.
    """

    def __init__(self, name: str, path: str = None, content: str = None):
        """Build the Excel file from uploader content or from ``path``.

        Args:
            name: Name of the file.
            path: Optional filesystem location of the workbook.
            content: Optional data-URI string produced by the uploader; when
                given, ``path`` is stored but not read.
        """
        # Initialise the mapping BEFORE any workbook is processed. Processing
        # (triggered from File.__init__ for path-backed files) fills it in, so
        # resetting it afterwards would wipe the freshly built mapping.
        self.index_to_sheet_name: Dict[int, str] = {}
        payload = None
        if content is not None:
            payload = self._set_content_from_uploader(content)
        # File.__init__ reads ``path`` only when it receives no content.
        super().__init__(name, FileExtension.XLSX, path, payload)

    def _set_content_from_uploader(self, content: str) -> str:
        """Convert data-URI workbook content to the standard JSON payload.

        The base64 payload following the first comma is decoded and parsed as
        an Excel workbook; every sheet is converted to JSON so the XLSX
        extractor can access the appropriate sheet.

        Args:
            content: String of the form ``<header>,<base64 payload>``.

        Returns:
            JSON string with ``metadata`` and ``sheets`` entries.

        Raises:
            ValueError: If ``content`` contains no comma separating the header
                from the payload.
        """
        _, encoded = content.split(",", 1)
        workbook_bytes = base64.b64decode(encoded)
        # The workbook is binary, hence BytesIO; the context manager closes
        # the parser once all sheets have been read.
        with pd.ExcelFile(BytesIO(workbook_bytes)) as excel_file:
            return self._process_excel_file(excel_file)

    def read_contents_from_path(self) -> str:
        """Read and convert the Excel file to the standard JSON payload."""
        # Close the underlying file handle as soon as the sheets are read.
        with pd.ExcelFile(self.path) as excel_file:
            return self._process_excel_file(excel_file)

    def _process_excel_file(self, excel_file):
        """Convert all sheets in the given ``ExcelFile`` to a JSON payload.

        Also updates ``self.index_to_sheet_name`` for the processed workbook.

        Args:
            excel_file: An open ``pandas.ExcelFile``.

        Returns:
            JSON string of the form ``{"metadata": {...}, "sheets": {...}}``.
            Because JSON object keys are strings, the integer keys of
            ``index_to_sheet_name`` appear as strings inside the payload.
        """
        sheet_names = excel_file.sheet_names
        # Store mapping of index to sheet name
        self.index_to_sheet_name = dict(enumerate(sheet_names))
        # Round-trip each sheet through pandas' JSON writer so cell values
        # become plain JSON-compatible records.
        sheets = {
            sheet_name: json.loads(
                pd.read_excel(excel_file, sheet_name=sheet_name).to_json(
                    orient="records"
                )
            )
            for sheet_name in sheet_names
        }
        # Create a combined structure with metadata and all sheets
        result = {
            "metadata": {
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names,
                "index_to_sheet_name": self.index_to_sheet_name,
            },
            "sheets": sheets,
        }
        return json.dumps(result)