Files
expense_monitor/app/services/ocr.py
Mateusz Gruszczyński 986ffb200a first commit
2026-03-13 15:17:32 +01:00

53 lines
1.8 KiB
Python

from __future__ import annotations
import re
from pathlib import Path
import pytesseract
from PIL import Image, ImageOps
# Patterns tried in order when looking for the receipt total: the first one
# that matches wins, and group 1 holds the number exactly as printed.
AMOUNT_REGEXES = [
    # A number introduced by a total keyword ("suma", "total", "razem"),
    # case-insensitive.  The leading \b keeps "Subtotal" from being read as
    # "total", which would otherwise return the subtotal as the amount.
    re.compile(r'\b(?:suma|total|razem)\s*[:]?\s*(\d+[\.,]\d{2})', re.I),
    # Fallback: the first free-standing number with exactly two decimals.
    # The lookarounds reject fragments of longer digit/separator runs, so
    # the "13.03" inside a date such as "13.03.2026" is not taken for an
    # amount.
    re.compile(r'(?<![\d.,])(\d+[\.,]\d{2})(?![.,]?\d)'),
]
# ISO-style "YYYY-MM-DD", or "DD?MM?YYYY" with '.', '/' or '-' separators.
DATE_REGEX = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})')
# 8-20 digits, dashes or spaces following a "NIP" or "TIN" label
# (case-insensitive); the captured group may carry surrounding spaces.
NIP_REGEX = re.compile(r'(?:NIP|TIN)\s*[:]?\s*([0-9\- ]{8,20})', re.I)
class OCRResult(dict):
    """Dictionary of extracted fields with a convenience ``status`` accessor."""

    @property
    def status(self) -> str:
        """Return the stored ``'status'`` entry, or ``'pending'`` when absent."""
        try:
            return self['status']
        except KeyError:
            return 'pending'
class OCRService:
    """Extract receipt fields from an image file via Tesseract OCR."""

    # File suffixes (lower-case) that are handed to the OCR engine.
    _IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png', '.heic', '.webp'})

    @staticmethod
    def _pending(file_path: Path) -> OCRResult:
        """Build the placeholder result used when no text could be read."""
        return OCRResult(
            status='pending',
            title=file_path.stem,
            vendor='',
            amount=None,
            purchase_date=None,
        )

    def extract(self, file_path: Path) -> OCRResult:
        """Run OCR on ``file_path`` and pull out vendor, amount, date and tax id.

        Args:
            file_path: Location of the file to analyse.  Only files whose
                suffix is one of ``.jpg``, ``.jpeg``, ``.png``, ``.heic`` or
                ``.webp`` (case-insensitive) are passed to the OCR engine.

        Returns:
            An ``OCRResult``.  For unsupported suffixes, or when opening the
            image or running OCR raises, a placeholder with
            ``status='pending'``, the file stem as ``title`` and empty/``None``
            fields is returned.  Otherwise the result holds:

            * ``vendor`` - first non-blank OCR line, cut to 255 characters;
            * ``title`` - the vendor, or the file stem when no vendor was read;
            * ``amount`` - first match of ``AMOUNT_REGEXES`` as a string with
              ``,`` replaced by ``.``, or ``None``;
            * ``purchase_date`` / ``tax_id`` - raw regex captures or ``None``;
            * ``raw_text`` - the first 4000 characters of the OCR output;
            * ``status`` - ``'review'`` if an amount or vendor was found,
              else ``'pending'``.
        """
        if file_path.suffix.lower() not in self._IMAGE_SUFFIXES:
            return self._pending(file_path)

        try:
            # Image.open keeps the underlying file open until the image is
            # closed; the context manager releases the handle even when the
            # transpose or the OCR call raises.
            with Image.open(file_path) as source:
                upright = ImageOps.exif_transpose(source)
                text = pytesseract.image_to_string(upright, lang='eng')
        except Exception:
            # Best effort by design: any decoding/OCR failure degrades to a
            # pending result.  Log it so a broken setup is not hidden.
            import logging

            logging.getLogger(__name__).warning(
                'OCR failed for %s', file_path, exc_info=True
            )
            return self._pending(file_path)

        lines = [line.strip() for line in text.splitlines() if line.strip()]
        vendor = lines[0][:255] if lines else ''

        # Patterns are ordered from most to least specific; stop at first hit.
        amount = None
        for pattern in AMOUNT_REGEXES:
            match = pattern.search(text)
            if match:
                amount = match.group(1).replace(',', '.')
                break

        date_match = DATE_REGEX.search(text)
        nip_match = NIP_REGEX.search(text)
        return OCRResult(
            status='review' if amount or vendor else 'pending',
            title=vendor or file_path.stem,
            vendor=vendor,
            amount=amount,
            purchase_date=date_match.group(1) if date_match else None,
            tax_id=nip_match.group(1).strip() if nip_match else None,
            raw_text=text[:4000],
        )