67 lines
2.2 KiB
Python
67 lines
2.2 KiB
Python
"""OCR via Apache Tika."""
|
||
import logging
|
||
import httpx
|
||
from app.config import TIKA_URL
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
async def extract_text(file_content: bytes, filename: str = "receipt.jpg") -> str:
    """Extract the text of an image by sending it to the Tika server.

    Args:
        file_content: Raw bytes of the file; sent as the body of a PUT
            request to ``{TIKA_URL}/tika``.
        filename: Name of the file. It is only used in the log message.

    Returns:
        The body of the Tika response with surrounding whitespace stripped.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    endpoint = f"{TIKA_URL}/tika"
    request_headers = {
        "Content-Type": "application/octet-stream",
        "X-Tika-OCRskipOcr": "false",
    }

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.put(
            endpoint,
            content=file_content,
            headers=request_headers,
        )
        # Fail loudly on HTTP errors instead of returning an error page as "text".
        response.raise_for_status()

        extracted = response.text.strip()
        logger.info(f"OCR extrait {len(extracted)} caractères de {filename}")
        return extracted
|
||
|
||
|
||
def parse_receipt_ocr(text: str) -> dict:
    """Parse the OCR text of an AGILIS fuel receipt into its key fields.

    The text is scanned line by line and every heuristic below is applied to
    each non-empty line. When several lines match the same field, the last
    match wins.

    Args:
        text: Raw OCR output, one receipt line per text line.

    Returns:
        A dict with the keys "رقم_الايصال" (receipt number), "التاريخ"
        (date, DD/MM/YYYY), "المحطة" (station line), "المنتج" (product),
        "الكمية" (quantity in litres), "السعر" (unit price) and "القيمة"
        (total amount). Fields that are not found keep their default
        ("" or 0.0). "المنتج" is never populated by this parser.
    """
    import re

    # Amounts below this value are treated as a unit price, the rest as a total.
    unit_price_ceiling = 30

    # Compiled once, outside the per-line loop.
    # Receipt number, often formatted like 123456-789.
    receipt_no_re = re.compile(r"(\d{4,6}[\s\-/]\d{3,6})")
    # Date in DD/MM/YYYY format.
    date_re = re.compile(r"(\d{2}/\d{2}/\d{4})")
    # Quantity in litres; a standalone lowercase "l" is accepted as well.
    quantity_re = re.compile(r"(\d+[.,]\d+)\s*(?:L|l\b|ltrs?|litres?)")
    # Amount followed by a currency unit (DH/DHS/DT/د.م). The unit is matched
    # case-insensitively ("Dhs", "dh") and 3 decimals are accepted because
    # dinar amounts are written with millimes ("45,000 DT").
    price_re = re.compile(
        r"(\d+[.,]\d{2,3})\s*(?:DHS?|DT|د\.م|DH)", re.IGNORECASE
    )
    # Station / brand keywords, anchored at the start of a word so that
    # "GASOIL" does not trigger "OIL" and "TWIN" does not trigger "WIN".
    station_re = re.compile(r"\b(?:STATION|AFRIQUIA|TOTAL|SHELL|WIN|OIL|PETRO)")

    result = {
        "رقم_الايصال": "",
        "التاريخ": "",
        "المحطة": "",
        "المنتج": "",
        "الكمية": 0.0,
        "السعر": 0.0,
        "القيمة": 0.0,
    }

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if m := receipt_no_re.search(line):
            result["رقم_الايصال"] = m.group(1).replace(" ", "-")

        if m := date_re.search(line):
            result["التاريخ"] = m.group(1)

        quantity_match = quantity_re.search(line)
        if quantity_match:
            result["الكمية"] = float(quantity_match.group(1).replace(",", "."))

        price_match = price_re.search(line)
        if price_match:
            val = float(price_match.group(1).replace(",", "."))
            # Heuristic split between unit price and total amount.
            if val < unit_price_ceiling:
                result["السعر"] = val
            else:
                result["القيمة"] = val

        # A line that carries a quantity or an amount is a data line
        # ("TOTAL 250,00 DH"), not the station header, so it must not
        # overwrite the station name.
        is_data_line = bool(quantity_match or price_match)
        if not is_data_line and station_re.search(line.upper()):
            result["المحطة"] = line

    return result