"""Best-effort OCR extraction of receipt fields (vendor, amount, date, tax id) from images."""
from __future__ import annotations

import logging
import re
from pathlib import Path

import pytesseract
from PIL import Image, ImageOps

logger = logging.getLogger(__name__)

# Amount patterns are tried in order; the first pattern that matches wins.
AMOUNT_REGEXES = [
    # Amount introduced by a total keyword. The leading \b prevents the
    # "total" inside "Subtotal" from being taken as the receipt total.
    re.compile(r'\b(?:suma|total|razem)\s*[:]?\s*(\d+[\.,]\d{2})', re.I),
    # Fallback: any standalone number with two decimals. The lookarounds
    # reject fragments of longer dotted/grouped numbers, so a date such as
    # "12.03.2024" is not misread as the amount "12.03".
    re.compile(r'(?<![\d.,])(\d+[\.,]\d{2})(?![.,]?\d)'),
]
# ISO dates (YYYY-MM-DD) or day-first dates separated by '.', '/' or '-'.
DATE_REGEX = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})')
# Tax identifier following a "NIP" or "TIN" label; digits, dashes and spaces.
NIP_REGEX = re.compile(r'(?:NIP|TIN)\s*[:]?\s*([0-9\- ]{8,20})', re.I)

# File suffixes (lower-case) for which OCR is attempted.
_IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png', '.heic', '.webp'})


def _first_amount(text: str) -> str | None:
    """Return the first amount found in *text* with '.' as the decimal separator, or None."""
    for pattern in AMOUNT_REGEXES:
        match = pattern.search(text)
        if match:
            return match.group(1).replace(',', '.')
    return None


class OCRResult(dict):
    """Dictionary of extracted receipt fields with a ``status`` accessor."""

    @property
    def status(self) -> str:
        """Return the ``'status'`` entry, or ``'pending'`` when it is absent."""
        return self.get('status', 'pending')


class OCRService:
    """Extract receipt fields from an image file using Tesseract OCR."""

    @staticmethod
    def _pending(file_path: Path) -> OCRResult:
        """Build the placeholder result used when no OCR text is available."""
        return OCRResult(
            status='pending',
            title=file_path.stem,
            vendor='',
            amount=None,
            purchase_date=None,
        )

    def extract(self, file_path: Path, *, lang: str = 'eng') -> OCRResult:
        """Run OCR on *file_path* and parse vendor, amount, date and tax id.

        Args:
            file_path: Path of the file to process. Files whose suffix is not
                a supported image suffix are not opened.
            lang: Language code(s) passed to ``pytesseract.image_to_string``.

        Returns:
            An ``OCRResult``. When the suffix is unsupported or opening/OCR
            fails, the result has ``status='pending'`` and only the keys
            ``status``, ``title``, ``vendor``, ``amount`` and
            ``purchase_date``. Otherwise it also carries ``tax_id`` and
            ``raw_text`` (truncated to 4000 characters), and ``status`` is
            ``'review'`` if an amount or a vendor was found.
        """
        if file_path.suffix.lower() not in _IMAGE_SUFFIXES:
            return self._pending(file_path)

        try:
            # The context manager closes the underlying file once OCR is done.
            with Image.open(file_path) as image:
                image = ImageOps.exif_transpose(image)
                text = pytesseract.image_to_string(image, lang=lang)
        except Exception:
            # Deliberately best-effort: any failure degrades to a pending
            # result, but the cause is logged instead of silently dropped.
            logger.warning('OCR failed for %s', file_path, exc_info=True)
            return self._pending(file_path)

        lines = [line.strip() for line in text.splitlines() if line.strip()]
        # The first non-empty OCR line is taken as the vendor name.
        vendor = lines[0][:255] if lines else ''
        amount = _first_amount(text)
        date_match = DATE_REGEX.search(text)
        nip_match = NIP_REGEX.search(text)
        # A match made only of spaces/dashes strips to '' -> report None.
        tax_id = (nip_match.group(1).strip() or None) if nip_match else None

        return OCRResult(
            status='review' if amount or vendor else 'pending',
            title=vendor or file_path.stem,
            vendor=vendor,
            amount=amount,
            purchase_date=date_match.group(1) if date_match else None,
            tax_id=tax_id,
            raw_text=text[:4000],
        )