first commit
This commit is contained in:
52
app/services/ocr.py
Normal file
52
app/services/ocr.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image, ImageOps
|
||||
|
||||
# Patterns used to locate the receipt total, tried in list order; callers take
# the first pattern that matches.  Group 1 is always the raw amount text
# (digits, a '.' or ',' separator, and exactly two decimals).
AMOUNT_REGEXES = [
    # Amount introduced by a total keyword (Polish "suma"/"razem" or English
    # "total"), optionally followed by a colon.  The leading \b keeps the
    # keyword from matching inside a longer word such as "Subtotal" or
    # "Podsuma", which would otherwise capture the subtotal instead.
    re.compile(r'\b(?:suma|total|razem)\s*[:]?\s*(\d+[\.,]\d{2})', re.I),
    # Fallback: the first free-standing number with two decimals.  The
    # lookarounds reject fragments of longer digit/separator runs, so a dotted
    # date like "12.05.2024" or a value like "12.345" is not misread as an
    # amount.
    re.compile(r'(?<![\d.,])(\d+[\.,]\d{2})(?![.,]?\d)'),
]

# A date written either as 4-2-2 digits joined by '-' or as 2-2-4 digits joined
# by '.', '/' or '-'.  The pattern does not validate the calendar values.
DATE_REGEX = re.compile(r'(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})')

# Tax identifier introduced by "NIP" or "TIN" (case-insensitive); group 1 is a
# run of 8-20 digits, dashes and spaces and may carry trailing whitespace.
NIP_REGEX = re.compile(r'(?:NIP|TIN)\s*[:]?\s*([0-9\- ]{8,20})', re.I)
|
||||
|
||||
|
||||
class OCRResult(dict):
    """Plain ``dict`` of extracted fields with a convenience ``status`` accessor."""

    @property
    def status(self) -> str:
        """Return the value stored under ``'status'``, or ``'pending'`` if unset."""
        return self['status'] if 'status' in self else 'pending'
|
||||
|
||||
|
||||
class OCRService:
    """Run Tesseract OCR on an image file and pull out basic receipt fields."""

    # Lower-cased file suffixes that are handed to Pillow; any other suffix is
    # returned as 'pending' without attempting OCR.
    # NOTE(review): '.heic' usually needs an extra Pillow plugin to decode;
    # without one such files presumably land in the failure path - confirm.
    _IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png', '.heic', '.webp'})

    def extract(self, file_path: Path) -> OCRResult:
        """Extract vendor, amount, date and tax id from the image at *file_path*.

        Args:
            file_path: Path of the uploaded file.  Only the suffixes listed in
                ``_IMAGE_SUFFIXES`` are processed.

        Returns:
            An ``OCRResult``.  When the suffix is unsupported or OCR fails, the
            result has ``status='pending'``, ``title`` set to the file stem,
            an empty ``vendor`` and ``None`` for ``amount`` and
            ``purchase_date`` (no ``tax_id`` / ``raw_text`` keys).  Otherwise
            it holds ``status``, ``title``, ``vendor``, ``amount`` (string
            with '.' as decimal separator, or ``None``), ``purchase_date``
            (matched text, or ``None``), ``tax_id`` (or ``None``) and
            ``raw_text`` (first 4000 characters of the OCR output).
        """
        if file_path.suffix.lower() not in self._IMAGE_SUFFIXES:
            return self._pending(file_path)

        try:
            # The context manager closes the underlying file handle even when
            # transposing or OCR raises; a bare Image.open() would leak it.
            with Image.open(file_path) as image:
                upright = ImageOps.exif_transpose(image)
                # NOTE(review): the amount/tax-id keywords are partly Polish
                # while OCR runs with the English model - confirm intended.
                text = pytesseract.image_to_string(upright, lang='eng')
        except Exception:
            # Deliberate best-effort: any decode/OCR failure leaves the
            # document 'pending' for manual entry, but is logged so that a
            # systematic problem does not go unnoticed.
            import logging

            logging.getLogger(__name__).warning(
                'OCR failed for %s', file_path, exc_info=True
            )
            return self._pending(file_path)

        # The first non-blank line of the OCR output is taken as the vendor.
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        vendor = lines[0][:255] if lines else ''
        amount = self._find_amount(text)
        date_match = DATE_REGEX.search(text)
        nip_match = NIP_REGEX.search(text)

        return OCRResult(
            # Anything recognised at all is queued for human review.
            status='review' if amount or vendor else 'pending',
            title=vendor or file_path.stem,
            vendor=vendor,
            amount=amount,
            purchase_date=date_match.group(1) if date_match else None,
            tax_id=nip_match.group(1).strip() if nip_match else None,
            raw_text=text[:4000],
        )

    @staticmethod
    def _pending(file_path: Path) -> OCRResult:
        """Build the placeholder result used when no OCR data is available."""
        return OCRResult(
            status='pending',
            title=file_path.stem,
            vendor='',
            amount=None,
            purchase_date=None,
        )

    @staticmethod
    def _find_amount(text: str) -> str | None:
        """Return the first amount matched by ``AMOUNT_REGEXES``, using '.' as separator."""
        for pattern in AMOUNT_REGEXES:
            match = pattern.search(text)
            if match:
                return match.group(1).replace(',', '.')
        return None
|
||||
Reference in New Issue
Block a user