"""OCR via Apache Tika.""" import logging import httpx from app.config import TIKA_URL logger = logging.getLogger(__name__) async def extract_text(file_content: bytes, filename: str = "receipt.jpg") -> str: """Extrait le texte d'une image via Tika OCR.""" async with httpx.AsyncClient(timeout=30) as client: r = await client.put( f"{TIKA_URL}/tika", content=file_content, headers={"Content-Type": "application/octet-stream", "X-Tika-OCRskipOcr": "false"}, ) r.raise_for_status() text = r.text.strip() logger.info(f"OCR extrait {len(text)} caractères de {filename}") return text def parse_receipt_ocr(text: str) -> dict: """Parse le texte OCR d'un reçu AGILIS pour extraire les champs clés.""" import re result = { "رقم_الايصال": "", "التاريخ": "", "المحطة": "", "المنتج": "", "الكمية": 0.0, "السعر": 0.0, "القيمة": 0.0, } for line in text.split("\n"): line = line.strip() if not line: continue # Recherche du numéro de reçu (souvent formaté comme 123456-789) if m := re.search(r"(\d{4,6}[\s\-/]\d{3,6})", line): result["رقم_الايصال"] = m.group(1).replace(" ", "-") # Date marocaine (JJ/MM/AAAA) if m := re.search(r"(\d{2}/\d{2}/\d{4})", line): result["التاريخ"] = m.group(1) # Quantité en litres if m := re.search(r"(\d+[.,]\d+)\s*(?:L|ltrs?|litres?)", line): result["الكمية"] = float(m.group(1).replace(",", ".")) # Prix (Dhs ou DT) if m := re.search(r"(\d+[.,]\d{2})\s*(?:DHS?|DT|د\.م|DH)", line): val = float(m.group(1).replace(",", ".")) # On essaie de distinguer prix unitaire vs total if val < 30: result["السعر"] = val else: result["القيمة"] = val # Station if any(w in line.upper() for w in ["STATION", "AFRIQUIA", "TOTAL", "SHELL", "WIN", "OIL", "PETRO"]): result["المحطة"] = line return result