gsparc-mezzouna-api/app/ocr.py

67 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""OCR via Apache Tika."""
# Standard library
import logging

# Third-party HTTP client used to talk to the Tika server
import httpx

# TIKA_URL is used below as the base of the `/tika` endpoint URL
from app.config import TIKA_URL

# Module-level logger named after this module
logger = logging.getLogger(__name__)
async def extract_text(file_content: bytes, filename: str = "receipt.jpg") -> str:
    """Extract the text of an image through the Tika OCR endpoint.

    The raw bytes are sent with an HTTP PUT to ``{TIKA_URL}/tika`` and the
    response body, stripped of surrounding whitespace, is returned.

    Args:
        file_content: Raw bytes of the file to run OCR on.
        filename: Name used only in the log message; it is not sent to Tika.

    Returns:
        The response text with leading and trailing whitespace removed.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        # NOTE(review): no "Accept" header is sent, so the response format is
        # whatever the Tika server returns by default -- confirm it is plain text.
        response = await client.put(
            f"{TIKA_URL}/tika",
            content=file_content,
            headers={
                "Content-Type": "application/octet-stream",
                "X-Tika-OCRskipOcr": "false",
            },
        )
        response.raise_for_status()
        text = response.text.strip()
        # Lazy %-style arguments: the message is only formatted when the
        # INFO level is enabled. The rendered text is unchanged.
        logger.info("OCR extrait %d caractères de %s", len(text), filename)
        return text
def parse_receipt_ocr(text: str) -> dict:
    """Parse the OCR text of an AGILIS receipt and extract its key fields.

    The text is scanned line by line and every line is tested against each
    pattern. For all fields except the station, a later matching line
    overwrites an earlier one. For the station, the first matching line is
    kept, so that a trailing "TOTAL ..." amount line cannot replace the
    station header found earlier on the receipt.

    Args:
        text: Raw OCR output, lines separated by "\\n". Empty input yields
            the default result.

    Returns:
        A dict with the following keys (Arabic field names):
            "رقم_الايصال" (receipt number): str, spaces replaced by "-".
            "التاريخ" (date): str in DD/MM/YYYY form as found in the text.
            "المحطة" (station): the full matching line.
            "المنتج" (product): always "" -- this parser never fills it.
            "الكمية" (quantity): float, number followed by a litre unit.
            "السعر" (unit price): float, amounts below 30.
            "القيمة" (total value): float, amounts of 30 or more.
    """
    import re  # local import kept: the module does not import `re` at top level

    result = {
        "رقم_الايصال": "",
        "التاريخ": "",
        "المحطة": "",
        "المنتج": "",
        "الكمية": 0.0,
        "السعر": 0.0,
        "القيمة": 0.0,
    }
    if not text:
        return result

    # Patterns are compiled once, outside the per-line loop.
    # Receipt number, often formatted like 123456-789.
    receipt_no_re = re.compile(r"(\d{4,6}[\s\-/]\d{3,6})")
    # Date in DD/MM/YYYY form.
    date_re = re.compile(r"(\d{2}/\d{2}/\d{4})")
    # Quantity in litres. Case-insensitive, and the unit must end at a word
    # boundary so that a word merely starting with "L" is not taken as a unit.
    quantity_re = re.compile(
        r"(\d+[.,]\d+)\s*(?:litres?|ltrs?|lt|l)\b", re.IGNORECASE
    )
    # Amount followed by a currency marker (DH / DHS / DT / د.م).
    # Case-insensitive so that spellings such as "Dhs" are recognised.
    price_re = re.compile(r"(\d+[.,]\d{2})\s*(?:DHS?|DT|د\.م)\b", re.IGNORECASE)
    # Station keywords must start a word: "OIL" no longer matches inside
    # "GASOIL", while prefixes such as "WINXO" or "PETROM" still match.
    station_re = re.compile(
        r"\b(?:STATION|AFRIQUIA|TOTAL|SHELL|WIN|OIL|PETRO)", re.IGNORECASE
    )

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if m := receipt_no_re.search(line):
            result["رقم_الايصال"] = m.group(1).replace(" ", "-")

        if m := date_re.search(line):
            result["التاريخ"] = m.group(1)

        if m := quantity_re.search(line):
            result["الكمية"] = float(m.group(1).replace(",", "."))

        if m := price_re.search(line):
            val = float(m.group(1).replace(",", "."))
            # Heuristic: small amounts are treated as the unit price,
            # anything else as the receipt total.
            if val < 30:
                result["السعر"] = val
            else:
                result["القيمة"] = val

        # Keep the first station-looking line only (see docstring).
        if not result["المحطة"] and station_re.search(line):
            result["المحطة"] = line

    return result