53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pytesseract
|
|
from PIL import Image, ImageOps
|
|
|
|
# Patterns tried in order to locate the receipt amount; the first pattern
# that matches wins. Group 1 is the number, written with either "." or ","
# as the decimal separator.
AMOUNT_REGEXES = [
    # An amount directly after a total keyword (optionally followed by ":").
    # The leading \b keeps "total" from matching inside a longer word such
    # as "subtotal", which would otherwise pick up the wrong figure.
    re.compile(r'\b(?:suma|total|razem)\s*[:]?\s*(\d+[\.,]\d{2})', re.I),
    # Fallback: any number with exactly two decimals. The lookarounds reject
    # fragments of longer digit groups, so the "12.03" inside a date such as
    # "12.03.2024" is no longer mistaken for an amount.
    re.compile(r'(?<!\d)(\d+[\.,]\d{2})(?![\.,]?\d)'),
]
# Either NNNN-NN-NN, or NN?NN?NNNN where "?" is ".", "/" or "-".
DATE_REGEX = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})')
# A "NIP" or "TIN" label (case-insensitive, optional ":") followed by 8-20
# characters made up of digits, dashes and spaces; group 1 is that run.
NIP_REGEX = re.compile(r'(?:NIP|TIN)\s*[:]?\s*([0-9\- ]{8,20})', re.I)
|
|
|
|
|
|
class OCRResult(dict):
    """Plain dictionary of extracted fields with a convenience status accessor."""

    @property
    def status(self) -> str:
        """Return the value stored under ``'status'``, or ``'pending'`` if the key is absent."""
        try:
            return self['status']
        except KeyError:
            return 'pending'
|
|
|
|
|
|
class OCRService:
    """Run OCR on an image file and pull receipt fields out of the recognized text."""

    def extract(self, file_path: Path) -> OCRResult:
        """Extract vendor, amount, date and tax id from the image at *file_path*.

        Args:
            file_path: Path of the file to process. Only ``.jpg``, ``.jpeg``,
                ``.png``, ``.heic`` and ``.webp`` suffixes (case-insensitive)
                are passed to the OCR engine.

        Returns:
            An ``OCRResult``. For an unsupported suffix, or when opening or
            recognizing the image raises, the result has status ``'pending'``
            and empty fields. Otherwise the status is ``'review'`` if a vendor
            line or an amount was found (``'pending'`` if neither), and the
            result also carries ``tax_id`` and ``raw_text`` (first 4000
            characters of the OCR output).
        """
        if file_path.suffix.lower() not in {'.jpg', '.jpeg', '.png', '.heic', '.webp'}:
            return self._pending(file_path)
        try:
            # The context manager closes the underlying file handle once OCR
            # is done; previously the opened image was never closed.
            with Image.open(file_path) as source:
                # Apply the EXIF orientation so the text is upright for OCR.
                upright = ImageOps.exif_transpose(source)
                # NOTE(review): OCR uses the English model although the amount
                # keywords include Polish words — confirm this is intended.
                text = pytesseract.image_to_string(upright, lang='eng')
        except Exception:
            # Deliberate best-effort: any failure leaves the document pending
            # instead of propagating to the caller.
            return self._pending(file_path)

        lines = [line.strip() for line in text.splitlines() if line.strip()]
        # The first non-blank line is taken as the vendor, capped at 255 chars.
        vendor = lines[0][:255] if lines else ''

        amount = None
        for pattern in AMOUNT_REGEXES:
            match = pattern.search(text)
            if match:
                # Normalize a decimal comma to a dot; the amount stays a string.
                amount = match.group(1).replace(',', '.')
                break

        date_match = DATE_REGEX.search(text)
        nip_match = NIP_REGEX.search(text)
        return OCRResult(
            status='review' if amount or vendor else 'pending',
            title=vendor or file_path.stem,
            vendor=vendor,
            amount=amount,
            purchase_date=date_match.group(1) if date_match else None,
            tax_id=nip_match.group(1).strip() if nip_match else None,
            raw_text=text[:4000],
        )

    @staticmethod
    def _pending(file_path: Path) -> OCRResult:
        """Build the empty 'pending' result titled after the file's stem."""
        return OCRResult(status='pending', title=file_path.stem, vendor='', amount=None, purchase_date=None)
|