feat: offline OCR (Tesseract) + embedding category classifier (@xenova/transformers)

Tesseract OCR (PHP, server-side): - Dockerfile: adds tesseract-ocr + tesseract-ocr-ita + libgd-dev (gd extension) - api/index.php: new tesseractReadExpiry() — decodes base64 image, pre-processes with GD (2× upscale, greyscale, auto-contrast, sharpen), runs tesseract CLI with ita+eng PSM-6, extracts date with multi-pattern regex (DD/MM/YYYY, MM/YYYY, ISO, named-month), returns YYYY-MM-DD + confidence - geminiReadExpiry() now: (1) tries Tesseract first; (2) falls back to Gemini Vision if OCR returns null or no date found; (3) passes source ('ocr'|'gemini') in response @xenova/transformers embedding classifier (browser-side): - index.html: ES-module bootstrap that lazy-loads 'Xenova/all-MiniLM-L6-v2' quantized (~23 MB, cached in browser) via window._getCategoryPipeline(); pre-warms on first scan page visit - assets/js/app.js: classifyCategoryByEmbedding(name) — embeds product name + 16 category anchor descriptions, cosine similarity, threshold 0.30; results cached in _embeddingCache Map - autoDetectCategory(): after keyword map misses, fires classifyCategoryByEmbedding async and updates select when resolved (respects manuallySet flag) - createQuickProduct(): if regex returned 'altro', silently patches category with embedding result via a background api call
2026-05-03 13:17:14 +00:00
parent c814d99d1f
commit a6c2fb93cf
4 changed files with 363 additions and 13 deletions
@@ -1,11 +1,15 @@
 FROM php:8.2-apache

-# Install required PHP extensions
+# Install required PHP extensions + Tesseract OCR for offline expiry date reading (gd is configured --with-jpeg so JPEG photos can be decoded)
 RUN apt-get update && apt-get install -y \
    libsqlite3-dev \
    libcurl4-openssl-dev \
    libonig-dev \
-    && docker-php-ext-install pdo_sqlite curl mbstring \
+    libgd-dev \
+    tesseract-ocr \
+    tesseract-ocr-ita \
+    tesseract-ocr-eng \
+    && docker-php-ext-configure gd --with-jpeg && docker-php-ext-install pdo_sqlite curl mbstring gd \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

 # Enable Apache mod_rewrite and mod_headers
@@ -2243,21 +2243,194 @@ function getOpenedShelfLifeAction(): void {
    echo json_encode(['days' => $days]);
 }

-function geminiReadExpiry(): void {
-    $apiKey = env('GEMINI_API_KEY');
-    if (empty($apiKey)) {
-        echo json_encode(['success' => false, 'error' => 'no_api_key']);
-        return;
+// ===== TESSERACT OFFLINE OCR HELPER =====
+
+/**
+ * Try to extract an expiry date from a base64 image using Tesseract OCR (offline).
+ * Returns ['found'=>true,'date'=>'YYYY-MM-DD','raw_text'=>'...','raw_match'=>'...',
+ *          'confidence'=>float,'source'=>'tesseract'] or ['found'=>false,'raw_text'=>'...'].
+ *
+ * Strategy:
+ *  1. Decode base64 and load the image with GD
+ *  2. Pre-process with GD: 2× upscale, greyscale, contrast boost, sharpen → temp PNG
+ *  3. Run tesseract with Italian+English langs, PSM-6 (block of text)
+ *  4. Run date regexes (full dates first); accepted matches are masked against partial re-matches
+ *  5. Validate with checkdate(), normalise to YYYY-MM-DD, prefer today/future dates
+ *
+ * Returns null if tesseract or GD is unavailable, or the image cannot be decoded.
+ */
+function tesseractReadExpiry(string $imageBase64): ?array {
+    // Require both the binary and the GD extension
+    if (!function_exists('imagecreatefromstring')) return null;
+    $tesseract = trim(shell_exec('which tesseract 2>/dev/null') ?? '');
+    if (empty($tesseract)) return null;
+
+    // ── 1. Decode image ────────────────────────────────────────────────────
+    $imgData = base64_decode($imageBase64);
+    if ($imgData === false || strlen($imgData) < 100) return null;
+
+    $src = @imagecreatefromstring($imgData);
+    if (!$src) return null;
+
+    $w = imagesx($src);
+    $h = imagesy($src);
+
+    // ── 2. Pre-process ─────────────────────────────────────────────────────
+    // 2a. Upscale ×2 – Tesseract performs best on ≥300 DPI; packaging photos
+    //     are often low-res so doubling helps character recognition.
+    $w2 = $w * 2;
+    $h2 = $h * 2;
+    $dst = imagecreatetruecolor($w2, $h2);
+    imagecopyresampled($dst, $src, 0, 0, 0, 0, $w2, $h2, $w, $h);
+    imagedestroy($src);
+
+    // 2b. Greyscale + fixed contrast boost
+    imagefilter($dst, IMG_FILTER_GRAYSCALE);
+    imagefilter($dst, IMG_FILTER_CONTRAST, -40); // negative = increase contrast in GD
+
+    // 2c. Sharpen (convolution kernel)
+    $kernel = [[0,-1,0],[-1,5,-1],[0,-1,0]];
+    imageconvolution($dst, $kernel, 1, 0);
+
+    // ── 3. Write temp file & run Tesseract ────────────────────────────────
+    $tmpIn  = sys_get_temp_dir() . '/ocr_in_'  . uniqid() . '.png';
+    $tmpOut = sys_get_temp_dir() . '/ocr_out_' . uniqid();
+    imagepng($dst, $tmpIn);
+    imagedestroy($dst);
+
+    // PSM 6 = assume a single uniform block of text (good for cropped label areas)
+    $cmd = escapeshellcmd($tesseract)
+         . ' ' . escapeshellarg($tmpIn)
+         . ' ' . escapeshellarg($tmpOut)
+         . ' -l ita+eng --psm 6 --oem 1'
+         . ' quiet 2>/dev/null';
+    shell_exec($cmd);
+
+    $rawText = '';
+    if (file_exists($tmpOut . '.txt')) {
+        $rawText = trim(file_get_contents($tmpOut . '.txt'));
+        unlink($tmpOut . '.txt');
    }
-    
+    if (file_exists($tmpIn)) unlink($tmpIn);
+
+    if (empty($rawText)) return ['found' => false, 'raw_text' => ''];
+
+    // ── 4. Parse date patterns ─────────────────────────────────────────────
+    // Midnight today, so a date equal to today counts as "not yet past".
+    $today = new DateTime('today');
+
+    // Normalise confusable OCR chars: standalone O→0, I/l directly before a digit→1
+    $clean = preg_replace('/\bO\b/', '0', $rawText);
+    $clean = preg_replace('/[Il](?=\d)/', '1', $clean);
+
+    $patterns = [
+        // DD/MM/YYYY or DD-MM-YYYY or DD.MM.YYYY
+        '/\b(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{4})\b/',
+        // MM/YYYY or MM-YYYY (best-before month/year only)
+        '/\b(\d{1,2})[\/\-\.](\d{4})\b/',
+        // YYYY-MM-DD (ISO)
+        '/\b(\d{4})-(\d{2})-(\d{2})\b/',
+        // DD MMM YYYY  (e.g. 15 APR 2026)
+        '/\b(\d{1,2})\s+(gen|feb|mar|apr|mag|giu|lug|ago|set|ott|nov|dic|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\.?\s*(\d{4})\b/i',
+        // MMM YYYY  (e.g. APR 2026)
+        '/\b(gen|feb|mar|apr|mag|giu|lug|ago|set|ott|nov|dic|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\.?\s*(\d{4})\b/i',
+    ];
+
+    $monthMap = [
+        'gen'=>1,'jan'=>1,'feb'=>2,'mar'=>3,'apr'=>4,'mag'=>5,'may'=>5,
+        'giu'=>6,'jun'=>6,'lug'=>7,'jul'=>7,'ago'=>8,'aug'=>8,
+        'set'=>9,'sep'=>9,'ott'=>10,'oct'=>10,'nov'=>11,'dic'=>12,'dec'=>12,
+    ];
+
+    $candidates = [];
+    foreach ($patterns as $pat) {
+        if (!preg_match_all($pat, $clean, $m, PREG_SET_ORDER)) continue;
+        foreach ($m as $match) {
+            $full = $match[0];
+            // Determine Y/M/D from which pattern matched
+            if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $full)) {
+                // ISO
+                $y = (int)$match[1]; $mo = (int)$match[2]; $d = (int)$match[3];
+            } elseif (isset($monthMap[strtolower($match[2] ?? '')])) {
+                // DD MMM YYYY
+                $d  = (int)$match[1];
+                $mo = $monthMap[strtolower($match[2])];
+                $y  = (int)$match[3];
+            } elseif (isset($monthMap[strtolower($match[1] ?? '')])) {
+                // MMM YYYY
+                $d  = 1;
+                $mo = $monthMap[strtolower($match[1])];
+                $y  = (int)$match[2];
+            } elseif (count($match) === 3) {
+                // MM/YYYY
+                $mo = (int)$match[1]; $y = (int)$match[2]; $d = 1;
+            } else {
+                // DD/MM/YYYY
+                $d = (int)$match[1]; $mo = (int)$match[2]; $y = (int)$match[3];
+            }
+            // Sanity: plausible year and a real calendar date (rejects e.g. 31/02 or month 13)
+            if ($y < 2020 || $y > 2040) continue;
+            if (!checkdate($mo, $d, $y)) continue;
+            $dateStr = sprintf('%04d-%02d-%02d', $y, $mo, $d);
+            // Signed distance from today in days: >= 0 means today/future, < 0 means past
+            $dt   = new DateTime($dateStr);
+            $diff = (int)$today->diff($dt)->days * ($dt >= $today ? 1 : -1);
+            $candidates[] = ['date' => $dateStr, 'score' => $diff, 'raw' => $full];
+            $clean = str_replace($full, ' ', $clean); // mask: month/year-only patterns must not re-match inside this date
+        }
+    }
+
+    if (empty($candidates)) {
+        return ['found' => false, 'raw_text' => $rawText];
+    }
+
+    // Prefer today/future dates over past ones; within each group pick the one closest to today
+    usort($candidates, fn($a, $b) => [$a['score'] < 0, abs($a['score'])] <=> [$b['score'] < 0, abs($b['score'])]);
+    $best = $candidates[0];
+
+    return [
+        'found'      => true,
+        'date'       => $best['date'],
+        'raw_text'   => $rawText,
+        'raw_match'  => $best['raw'],
+        'confidence' => count($candidates) === 1 ? 0.9 : 0.75,
+        'source'     => 'tesseract',
+    ];
+}
+
+function geminiReadExpiry(): void {
    $input = json_decode(file_get_contents('php://input'), true);
    $imageBase64 = $input['image'] ?? '';
-    
+
    if (empty($imageBase64)) {
        echo json_encode(['success' => false, 'error' => 'No image provided']);
        return;
    }
-    
+
+    // ── Step 1: Try Tesseract offline OCR first ────────────────────────────
+    $ocrResult = tesseractReadExpiry($imageBase64);
+    if ($ocrResult !== null && !empty($ocrResult['found']) && !empty($ocrResult['date'])) {
+        echo json_encode([
+            'success'     => true,
+            'expiry_date' => $ocrResult['date'],
+            'raw_text'    => $ocrResult['raw_text'] ?? '',
+            'source'      => 'ocr',
+        ]);
+        return;
+    }
+
+    // ── Step 2: Fall back to Gemini Vision ────────────────────────────────
+    $apiKey = env('GEMINI_API_KEY');
+    if (empty($apiKey)) {
+        // No Gemini key and OCR failed/unavailable
+        echo json_encode([
+            'success'  => false,
+            'error'    => 'no_api_key',
+            'raw_text' => $ocrResult['raw_text'] ?? '',
+        ]);
+        return;
+    }
+
    // Call Gemini API
    $payload = [
        'contents' => [
@@ -2305,7 +2478,7 @@ function geminiReadExpiry(): void {
        // Validate date format
        $date = $parsed['date'];
        if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $date)) {
-            echo json_encode(['success' => true, 'expiry_date' => $date, 'raw_text' => $parsed['raw_text'] ?? '']);
+            echo json_encode(['success' => true, 'expiry_date' => $date, 'raw_text' => $parsed['raw_text'] ?? '', 'source' => 'gemini']);
            return;
        }
    }
@@ -1086,6 +1086,106 @@ function guessCategoryFromName(name) {
    return 'altro';
 }

+// ─────────────────────────────────────────────────────────────────────────────
+// Embedding-based category classifier (async, @xenova/transformers)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Keyword-style description per local category key; classifyCategoryByEmbedding() embeds each one as an anchor to compare the product name against.
+const _CATEGORY_DESCRIPTIONS = {
+    latticini:  'latte yogurt formaggio burro panna mozzarella latticini dairy',
+    carne:      'carne pollo manzo maiale vitello prosciutto salame bresaola meat',
+    pesce:      'pesce tonno salmone merluzzo gamberi seafood fish',
+    frutta:     'frutta mela banana arancia pera fragola uva kiwi fruit',
+    verdura:    'verdura insalata zucchina carota cipolla spinaci tomato vegetables',
+    pasta:      'pasta spaghetti penne fusilli riso risotto noodles rice',
+    pane:       'pane fette biscottate grissini cracker toast bread bakery',
+    surgelati:  'surgelati congelato frozen gelato ice cream',
+    bevande:    'acqua birra vino succo caffè tè bevande drinks beverages',
+    condimenti: 'olio aceto sale zucchero farina ketchup maionese senape spezie condiments',
+    snack:      'biscotti cioccolato patatine snack caramelle wafer merendine',
+    conserve:   'conserve pelati passata marmellata miele legumi ceci beans canned',
+    cereali:    'cereali muesli granola fiocchi d\'avena oat breakfast cereal',
+    igiene:     'sapone shampoo dentifricio deodorante igiene personale hygiene',
+    pulizia:    'detersivo detergente pulizia casa sgrassatore cleaning',
+    altro:      'prodotto generico varie altro miscellaneous',
+};
+
+// In-memory cache: productName → category (avoids re-embedding the same product)
+const _embeddingCache = new Map();
+
+/**
+ * Cosine similarity between two Float32Array vectors.
+ */
+function _cosineSim(a, b) {
+    let dot = 0, na = 0, nb = 0;
+    for (let i = 0; i < a.length; i++) {
+        dot += a[i] * b[i];
+        na  += a[i] * a[i];
+        nb  += b[i] * b[i];
+    }
+    return dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-9);
+}
+
+/**
+ * Mean-pool a [1, tokens, dims] tensor → Float32Array of length dims.
+ */
+function _meanPool(tensor) {
+    const [, tokens, dims] = tensor.dims;
+    const data = tensor.data;
+    const out  = new Float32Array(dims);
+    for (let t = 0; t < tokens; t++) {
+        for (let d = 0; d < dims; d++) {
+            out[d] += data[t * dims + d];
+        }
+    }
+    for (let d = 0; d < dims; d++) out[d] /= tokens;
+    return out;
+}
+
+/**
+ * Async: returns the best-matching category key for `productName`.
+ * Returns null if the model is unavailable or similarity is too low.
+ * THRESHOLD 0.30 — below this the regex fallback is more reliable.
+ */
+async function classifyCategoryByEmbedding(productName) {
+    if (!productName) return null;
+    const key = productName.toLowerCase().trim();
+    if (_embeddingCache.has(key)) return _embeddingCache.get(key);
+
+    if (typeof window._getCategoryPipeline !== 'function') return null;
+    const pipe = await window._getCategoryPipeline();
+    if (!pipe) return null;
+
+    try {
+        const labels = Object.keys(_CATEGORY_DESCRIPTIONS);
+        const texts  = [key, ...labels.map(l => _CATEGORY_DESCRIPTIONS[l])];
+
+        // Embed all texts in one batched call for efficiency
+        const output  = await pipe(texts, { pooling: 'mean', normalize: true });
+        const vectors = labels.map((_, i) => {
+            const t = output[i + 1];
+            // output[i] may be a Tensor or already a plain array-like
+            return t.dims ? _meanPool(t) : new Float32Array(t.data ?? t);
+        });
+        const queryVec = output[0].dims
+            ? _meanPool(output[0])
+            : new Float32Array(output[0].data ?? output[0]);
+
+        let bestLabel = null, bestSim = 0;
+        for (let i = 0; i < labels.length; i++) {
+            const sim = _cosineSim(queryVec, vectors[i]);
+            if (sim > bestSim) { bestSim = sim; bestLabel = labels[i]; }
+        }
+
+        const result = (bestSim >= 0.30 && bestLabel !== 'altro') ? bestLabel : null;
+        _embeddingCache.set(key, result);
+        return result;
+    } catch (e) {
+        console.warn('[EverShelf] Embedding classify error:', e);
+        return null;
+    }
+}
+
 // Determine safety level for expired products
 // Returns { level: 'danger'|'warning'|'ok', icon, label, tip }
 function getExpiredSafety(item, daysExpired) {
@@ -2024,7 +2124,12 @@ function showPage(pageId, param = null) {
            }
            loadInventory();
            break;
-        case 'scan': initScanner(); clearQuickNameResults(); updateSpesaBanner(); break;
+        case 'scan': initScanner(); clearQuickNameResults(); updateSpesaBanner();
+            // Pre-warm the embedding model the first time user visits scan page
+            if (typeof window._getCategoryPipeline === 'function' && !window._categoryPipelineReady) {
+                window._getCategoryPipeline(); // fire-and-forget
+            }
+            break;
        case 'products': loadAllProducts(); break;
        case 'shopping': loadShoppingList(); break;
        case 'recipe': loadRecipeArchive(); break;
@@ -4470,7 +4575,7 @@ function selectQuickProduct(product) {
 async function createQuickProduct(name) {
    showLoading(true);
    
-    // Auto-detect category from name
+    // Auto-detect category from name (sync regex first)
    const category = guessCategoryFromName(name);
    
    try {
@@ -4494,6 +4599,27 @@ async function createQuickProduct(name) {
            showLoading(false);
            clearQuickNameResults();
            showToast('Prodotto creato!', 'success');
+
+            // If regex gave 'altro', try embedding in background and silently update
+            if (category === 'altro' && typeof classifyCategoryByEmbedding === 'function') {
+                classifyCategoryByEmbedding(name).then(async embCat => {
+                    if (!embCat || !result.id) return;
+                    try {
+                        await api('product_save', {}, 'POST', {
+                            id: result.id,
+                            name: name,
+                            brand: '',
+                            category: embCat,
+                            unit: 'pz',
+                            default_quantity: 1,
+                        });
+                        if (currentProduct && currentProduct.id === result.id) {
+                            currentProduct.category = embCat;
+                        }
+                    } catch (_) { /* silent */ }
+                });
+            }
+
            showProductAction();
        } else {
            showLoading(false);
@@ -4614,6 +4740,20 @@ function autoDetectCategory() {
            return;
        }
    }
+
+    // ── Embedding fallback: async, only when keywords didn't match ──────────
+    // Kick off model load (no-op if already loaded/loading) and update the
+    // select once the result is ready.  Only runs when pipeline is available.
+    if (typeof classifyCategoryByEmbedding === 'function') {
+        classifyCategoryByEmbedding(document.getElementById('pf-name').value).then(embCat => {
+            if (!embCat) return;
+            // Re-check manuallySet — user might have picked something while awaiting
+            const sel = document.getElementById('pf-category');
+            if (!sel || sel.dataset.manuallySet === 'true') return;
+            sel.value = embCat;
+            onCategoryChange(true);
+        });
+    }
 }

 function onCategoryChange(fromAutoDetect = false) {
@@ -14,6 +14,39 @@
    <link rel="stylesheet" href="assets/css/style.css?v=20260421a">
    <!-- QuaggaJS for barcode scanning -->
    <script src="https://cdn.jsdelivr.net/npm/@ericblade/quagga2@1.8.4/dist/quagga.min.js"></script>
+    <!-- @xenova/transformers: ES-module bootstrap that exposes a lazy category-classifier loader as window._getCategoryPipeline() -->
+    <script type="module">
+        // Lazy-load the embedding pipeline only when first needed.
+        // Using a dynamic import so the ~2 MB WASM is not fetched on page load.
+        window._categoryPipelineReady = false;
+        window._categoryPipelinePromise = null;
+
+        window._getCategoryPipeline = async function() {
+            if (window._categoryPipelinePromise) return window._categoryPipelinePromise; // reuse the in-flight or finished load
+            window._categoryPipelinePromise = (async () => {
+                try {
+                    const { pipeline, env } = await import(
+                        'https://cdn.jsdelivr.net/npm/@xenova/transformers@2/src/transformers.min.js'
+                    );
+                    // Remote model loading stays enabled and the browser cache is turned on;
+                    // presumably so the model is downloaded once and reused — confirm against the library docs.
+                    env.allowRemoteModels = true;
+                    env.useBrowserCache   = true;
+                    const pipe = await pipeline(
+                        'feature-extraction',
+                        'Xenova/all-MiniLM-L6-v2',
+                        { quantized: true }
+                    );
+                    window._categoryPipelineReady = true;
+                    return pipe;
+                } catch (e) {
+                    console.warn('[EverShelf] Embedding model unavailable, regex fallback only:', e);
+                    return null; // the stored promise resolves to null, so later calls get null without retrying
+                }
+            })();
+            return window._categoryPipelinePromise;
+        };
+    </script>
 </head>
 <body>