# expense_monitor/app/services/ocr.py

from __future__ import annotations

import logging
import re
from pathlib import Path

import pytesseract
from PIL import Image, ImageOps

# Amount patterns, tried in order; the first one that matches wins.
# Group 1 is the amount as written in the text ('.' or ',' as decimal separator).
AMOUNT_REGEXES = [
    # Amount that directly follows a total keyword (Polish "suma"/"razem" or
    # English "total"), with an optional colon. The leading \b keeps the
    # keyword from matching inside a longer word such as "subtotal".
    re.compile(r'\b(?:suma|total|razem)\s*[:]?\s*(\d+[\.,]\d{2})', re.I),
    # Fallback: first standalone number with exactly two decimals. The
    # lookbehind/lookahead reject fragments of longer dotted numbers, so a
    # date like "12.03.2024" is no longer read as the amount "12.03".
    re.compile(r'(?<![\d.,])(\d+[\.,]\d{2})(?![.,]?\d)'),
]
# ISO date (YYYY-MM-DD) or day-first style date (DD.MM.YYYY, DD/MM/YYYY, DD-MM-YYYY).
DATE_REGEX = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})')
# Tax id introduced by "NIP" or "TIN": 8-20 characters of digits, dashes and
# spaces. The capture may carry surrounding spaces; callers should strip it.
NIP_REGEX = re.compile(r'(?:NIP|TIN)\s*[:]?\s*([0-9\- ]{8,20})', re.I)


class OCRResult(dict):
    """Plain ``dict`` of extracted fields with a convenience ``status`` accessor."""

    @property
    def status(self) -> str:
        """Return the value stored under ``'status'``, or ``'pending'`` if absent."""
        if 'status' in self:
            return self['status']
        return 'pending'


class OCRService:
    """Best-effort extraction of expense fields from images via Tesseract OCR."""

    def extract(self, file_path: Path) -> OCRResult:
        """Run OCR on ``file_path`` and pull out vendor, amount, date and tax id.

        Args:
            file_path: Path of the file to inspect. Only ``.jpg``, ``.jpeg``,
                ``.png``, ``.heic`` and ``.webp`` (case-insensitive) are sent
                to OCR.

        Returns:
            An ``OCRResult`` that always carries the keys ``status``,
            ``title``, ``vendor``, ``amount``, ``purchase_date``, ``tax_id``
            and ``raw_text``. ``status`` is ``'review'`` when a vendor line or
            an amount was found and ``'pending'`` otherwise; ``'pending'`` is
            also returned for unsupported suffixes and for any failure while
            opening or reading the image (the failure is logged, not raised).
        """
        if file_path.suffix.lower() not in {'.jpg', '.jpeg', '.png', '.heic', '.webp'}:
            return self._pending(file_path)
        try:
            # Image.open keeps the underlying file open; the context manager
            # releases the handle once OCR is done instead of leaking it.
            with Image.open(file_path) as source:
                # Apply the EXIF orientation so the image is upright before OCR.
                upright = ImageOps.exif_transpose(source)
                # NOTE(review): the amount/tax-id patterns include Polish
                # keywords but OCR runs with the English model only - confirm
                # whether 'pol' language data should be used as well.
                text = pytesseract.image_to_string(upright, lang='eng')
        except Exception:
            # Deliberate best-effort boundary: an unreadable image or a broken
            # OCR setup must not fail the caller, but it should leave a trace.
            logging.getLogger(__name__).warning(
                'OCR failed for %s', file_path, exc_info=True
            )
            return self._pending(file_path)

        # The first non-empty line is taken as the vendor name (capped at 255 chars).
        vendor = next(
            (line.strip()[:255] for line in text.splitlines() if line.strip()),
            '',
        )
        amount = self._find_amount(text)
        date_match = DATE_REGEX.search(text)
        nip_match = NIP_REGEX.search(text)
        return OCRResult(
            status='review' if amount or vendor else 'pending',
            title=vendor or file_path.stem,
            vendor=vendor,
            amount=amount,
            purchase_date=date_match.group(1) if date_match else None,
            tax_id=nip_match.group(1).strip() if nip_match else None,
            # Only the first 4000 characters of the OCR text are kept.
            raw_text=text[:4000],
        )

    @staticmethod
    def _find_amount(text: str) -> str | None:
        """Return the first amount matched by ``AMOUNT_REGEXES`` with '.' as separator."""
        for pattern in AMOUNT_REGEXES:
            match = pattern.search(text)
            if match:
                return match.group(1).replace(',', '.')
        return None

    @staticmethod
    def _pending(file_path: Path) -> OCRResult:
        """Build the fallback result used when no OCR text is available.

        Carries the same keys as a successful result so callers can read
        ``tax_id`` and ``raw_text`` without checking the status first.
        """
        return OCRResult(
            status='pending',
            title=file_path.stem,
            vendor='',
            amount=None,
            purchase_date=None,
            tax_id=None,
            raw_text='',
        )