diff --git a/Dockerfile b/Dockerfile
index 5fa13f2..68f7d39 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,15 @@
FROM php:8.2-apache
-# Install required PHP extensions
+# Install required PHP extensions (GD configured with JPEG support for camera photos) + Tesseract OCR for offline expiry date reading
 RUN apt-get update && apt-get install -y \
     libsqlite3-dev \
     libcurl4-openssl-dev \
     libonig-dev \
-    && docker-php-ext-install pdo_sqlite curl mbstring \
+    libgd-dev \
+    tesseract-ocr \
+    tesseract-ocr-ita \
+    tesseract-ocr-eng \
+    && docker-php-ext-configure gd --with-jpeg && docker-php-ext-install pdo_sqlite curl mbstring gd \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
# Enable Apache mod_rewrite and mod_headers
diff --git a/api/index.php b/api/index.php
index 315a05f..2f9a47e 100644
--- a/api/index.php
+++ b/api/index.php
@@ -2243,21 +2243,194 @@ function getOpenedShelfLifeAction(): void {
echo json_encode(['days' => $days]);
}
-function geminiReadExpiry(): void {
- $apiKey = env('GEMINI_API_KEY');
- if (empty($apiKey)) {
- echo json_encode(['success' => false, 'error' => 'no_api_key']);
- return;
+// ===== TESSERACT OFFLINE OCR HELPER =====
+
+/**
+ * Try to extract an expiry date from a base64 image using Tesseract OCR (offline).
+ * Returns ['found'=>true,'date'=>'YYYY-MM-DD','raw_text'=>'...','raw_match'=>'...','confidence'=>float,'source'=>'tesseract']
+ * or ['found'=>false,'raw_text'=>'...']
+ *
+ * Strategy:
+ *   1. Decode base64 → GD image
+ *   2. Pre-process with GD: upscale (×2, small images only), greyscale, contrast boost, sharpen
+ *   3. Write a temp PNG and run tesseract with Italian+English langs, PSM-6 (block of text)
+ *   4. Run date-format regexes (Italian & international patterns)
+ *   5. Normalise to YYYY-MM-DD, preferring the nearest future date over past dates
+ *
+ * Returns null if the tesseract binary or GD is unavailable, or if the image cannot be decoded.
+ */
+function tesseractReadExpiry(string $imageBase64): ?array {
+    // Require both the binary and the GD extension
+    if (!function_exists('imagecreatefromstring')) return null;
+    $tesseract = trim(shell_exec('which tesseract 2>/dev/null') ?? '');
+    if (empty($tesseract)) return null;
+
+    // ── 1. Decode image ────────────────────────────────────────────────────
+    $imgData = base64_decode($imageBase64);
+    if ($imgData === false || strlen($imgData) < 100) return null;
+
+    $src = @imagecreatefromstring($imgData);
+    if (!$src) return null;
+
+    $w = imagesx($src);
+    $h = imagesy($src);
+
+    // ── 2. Pre-process ─────────────────────────────────────────────────────
+    // 2a. Upscale ×2 (Tesseract performs best on ≥300 DPI) unless the image is already >4 MP, to bound memory use.
+    $scale = ($w * $h > 4000000) ? 1 : 2;
+    $w2 = $w * $scale;
+    $h2 = $h * $scale;
+    $dst = imagecreatetruecolor($w2, $h2);
+    imagecopyresampled($dst, $src, 0, 0, 0, 0, $w2, $h2, $w, $h);
+    imagedestroy($src);
+
+    // 2b. Greyscale + contrast boost
+    imagefilter($dst, IMG_FILTER_GRAYSCALE);
+    imagefilter($dst, IMG_FILTER_CONTRAST, -40); // negative = increase contrast in GD
+
+    // 2c. Sharpen (convolution kernel)
+    $kernel = [[0,-1,0],[-1,5,-1],[0,-1,0]];
+    imageconvolution($dst, $kernel, 1, 0);
+
+    // ── 3. Write temp file & run Tesseract (random names avoid clashes between concurrent requests) ──
+    $tmpIn = sys_get_temp_dir() . '/ocr_in_' . bin2hex(random_bytes(8)) . '.png';
+    $tmpOut = sys_get_temp_dir() . '/ocr_out_' . bin2hex(random_bytes(8));
+    imagepng($dst, $tmpIn);
+    imagedestroy($dst);
+
+    // PSM 6 = assume a single uniform block of text (good for cropped label areas)
+    $cmd = escapeshellcmd($tesseract)
+        . ' ' . escapeshellarg($tmpIn)
+        . ' ' . escapeshellarg($tmpOut)
+        . ' -l ita+eng --psm 6 --oem 1'
+        . ' quiet 2>/dev/null';
+    shell_exec($cmd);
+
+    $rawText = '';
+    if (file_exists($tmpOut . '.txt')) {
+        $rawText = trim(file_get_contents($tmpOut . '.txt'));
+        unlink($tmpOut . '.txt');
     }
-
+    if (file_exists($tmpIn)) unlink($tmpIn);
+
+    if (empty($rawText)) return ['found' => false, 'raw_text' => ''];
+
+    // ── 4. Parse date patterns ─────────────────────────────────────────────
+    $today = new DateTime();
+
+    // Normalise confusable OCR chars: a stand-alone O → 0, and I/l directly before a digit → 1
+    $clean = preg_replace('/\bO\b/', '0', $rawText);
+    $clean = preg_replace('/[Il](?=\d)/', '1', $clean);
+
+    $patterns = [
+        // DD/MM/YYYY or DD-MM-YYYY or DD.MM.YYYY
+        '/\b(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{4})\b/',
+        // MM/YYYY, MM-YYYY or MM.YYYY (best-before month/year only)
+        '/\b(\d{1,2})[\/\-\.](\d{4})\b/',
+        // YYYY-MM-DD (ISO)
+        '/\b(\d{4})-(\d{2})-(\d{2})\b/',
+        // DD MMM YYYY (e.g. 15 APR 2026)
+        '/\b(\d{1,2})\s+(gen|feb|mar|apr|mag|giu|lug|ago|set|ott|nov|dic|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\.?\s*(\d{4})\b/i',
+        // MMM YYYY (e.g. APR 2026)
+        '/\b(gen|feb|mar|apr|mag|giu|lug|ago|set|ott|nov|dic|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\.?\s*(\d{4})\b/i',
+    ];
+
+    $monthMap = [
+        'gen'=>1,'jan'=>1,'feb'=>2,'mar'=>3,'apr'=>4,'mag'=>5,'may'=>5,
+        'giu'=>6,'jun'=>6,'lug'=>7,'jul'=>7,'ago'=>8,'aug'=>8,
+        'set'=>9,'sep'=>9,'ott'=>10,'oct'=>10,'nov'=>11,'dic'=>12,'dec'=>12,
+    ];
+
+    $candidates = [];
+    foreach ($patterns as $pat) {
+        if (!preg_match_all($pat, $clean, $m, PREG_SET_ORDER)) continue;
+        foreach ($m as $match) {
+            $full = $match[0];
+            // Determine Y/M/D from which pattern matched
+            if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $full)) {
+                // ISO
+                $y = (int)$match[1]; $mo = (int)$match[2]; $d = (int)$match[3];
+            } elseif (isset($monthMap[strtolower($match[2] ?? '')])) {
+                // DD MMM YYYY
+                $d = (int)$match[1];
+                $mo = $monthMap[strtolower($match[2])];
+                $y = (int)$match[3];
+            } elseif (isset($monthMap[strtolower($match[1] ?? '')])) {
+                // MMM YYYY
+                $d = 1;
+                $mo = $monthMap[strtolower($match[1])];
+                $y = (int)$match[2];
+            } elseif (count($match) === 3) {
+                // MM/YYYY
+                $mo = (int)$match[1]; $y = (int)$match[2]; $d = 1;
+            } else {
+                // DD/MM/YYYY
+                $d = (int)$match[1]; $mo = (int)$match[2]; $y = (int)$match[3];
+            }
+            // Sanity: plausible year and a real calendar date (rejects e.g. 31/02 or month 13)
+            if ($y < 2020 || $y > 2040) continue;
+            if (!checkdate($mo, $d, $y)) continue;
+            $dateStr = sprintf('%04d-%02d-%02d', $y, $mo, $d);
+            // Score = signed distance from now in days (negative = already in the past)
+            $dt = new DateTime($dateStr);
+            $diff = (int)$today->diff($dt)->days * ($dt >= $today ? 1 : -1);
+            $candidates[] = ['date' => $dateStr, 'score' => $diff, 'raw' => $full];
+        }
+        // Blank out consumed text so a shorter pattern cannot re-match a tail ("04/2026" inside "15/04/2026")
+        $clean = preg_replace($pat, ' ', $clean);
+    }
+
+    if (empty($candidates)) {
+        return ['found' => false, 'raw_text' => $rawText];
+    }
+
+    // Future (or today) dates first, past dates after; within each group the one closest to today wins
+    usort($candidates, fn($a, $b) => [$a['score'] < 0, abs($a['score'])] <=> [$b['score'] < 0, abs($b['score'])]);
+    $best = $candidates[0];
+
+    return [
+        'found' => true,
+        'date' => $best['date'],
+        'raw_text' => $rawText,
+        'raw_match' => $best['raw'],
+        'confidence' => count($candidates) === 1 ? 0.9 : 0.75,
+        'source' => 'tesseract',
+    ];
+}
+
+function geminiReadExpiry(): void {
$input = json_decode(file_get_contents('php://input'), true);
$imageBase64 = $input['image'] ?? '';
-
+
if (empty($imageBase64)) {
echo json_encode(['success' => false, 'error' => 'No image provided']);
return;
}
-
+
+ // ── Step 1: Try Tesseract offline OCR first ────────────────────────────
+ $ocrResult = tesseractReadExpiry($imageBase64);
+ if ($ocrResult !== null && !empty($ocrResult['found']) && !empty($ocrResult['date'])) {
+ echo json_encode([
+ 'success' => true,
+ 'expiry_date' => $ocrResult['date'],
+ 'raw_text' => $ocrResult['raw_text'] ?? '',
+ 'source' => 'ocr',
+ ]);
+ return;
+ }
+
+ // ── Step 2: Fall back to Gemini Vision ────────────────────────────────
+ $apiKey = env('GEMINI_API_KEY');
+ if (empty($apiKey)) {
+ // No Gemini key and OCR failed/unavailable
+ echo json_encode([
+ 'success' => false,
+ 'error' => 'no_api_key',
+ 'raw_text' => $ocrResult['raw_text'] ?? '',
+ ]);
+ return;
+ }
+
// Call Gemini API
$payload = [
'contents' => [
@@ -2305,7 +2478,7 @@ function geminiReadExpiry(): void {
// Validate date format
$date = $parsed['date'];
if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $date)) {
- echo json_encode(['success' => true, 'expiry_date' => $date, 'raw_text' => $parsed['raw_text'] ?? '']);
+ echo json_encode(['success' => true, 'expiry_date' => $date, 'raw_text' => $parsed['raw_text'] ?? '', 'source' => 'gemini']);
return;
}
}
diff --git a/assets/js/app.js b/assets/js/app.js
index 2a840f4..8daeade 100644
--- a/assets/js/app.js
+++ b/assets/js/app.js
@@ -1086,6 +1086,106 @@ function guessCategoryFromName(name) {
return 'altro';
}
+// ─────────────────────────────────────────────────────────────────────────────
+// Embedding-based category classifier (async, @xenova/transformers)
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Anchor text for each local category key: Italian/English keywords that get embedded and compared with the product name.
+const _CATEGORY_DESCRIPTIONS = {
+  latticini: 'latte yogurt formaggio burro panna mozzarella latticini dairy',
+  carne: 'carne pollo manzo maiale vitello prosciutto salame bresaola meat',
+  pesce: 'pesce tonno salmone merluzzo gamberi seafood fish',
+  frutta: 'frutta mela banana arancia pera fragola uva kiwi fruit',
+  verdura: 'verdura insalata zucchina carota cipolla spinaci tomato vegetables',
+  pasta: 'pasta spaghetti penne fusilli riso risotto noodles rice',
+  pane: 'pane fette biscottate grissini cracker toast bread bakery',
+  surgelati: 'surgelati congelato frozen gelato ice cream',
+  bevande: 'acqua birra vino succo caffè tè bevande drinks beverages',
+  condimenti: 'olio aceto sale zucchero farina ketchup maionese senape spezie condiments',
+  snack: 'biscotti cioccolato patatine snack caramelle wafer merendine',
+  conserve: 'conserve pelati passata marmellata miele legumi ceci beans canned',
+  cereali: 'cereali muesli granola fiocchi d\'avena oat breakfast cereal',
+  igiene: 'sapone shampoo dentifricio deodorante igiene personale hygiene',
+  pulizia: 'detersivo detergente pulizia casa sgrassatore cleaning',
+  altro: 'prodotto generico varie altro miscellaneous',
+};
+
+// In-memory cache: lower-cased, trimmed product name → category key, or null when no confident match (avoids re-embedding)
+const _embeddingCache = new Map();
+
+/**
+ * Cosine similarity of two numeric vectors (e.g. Float32Array); iterates over the length of `a`.
+ */
+function _cosineSim(a, b) {
+  let inner = 0, sqA = 0, sqB = 0;
+  for (let k = 0; k < a.length; k += 1) {
+    const x = a[k], y = b[k];
+    inner += x * y; sqA += x * x; sqB += y * y;
+  }
+  // The 1e-9 term keeps the division finite when either vector is all zeros.
+  return inner / (Math.sqrt(sqA) * Math.sqrt(sqB) + 1e-9);
+}
+
+/**
+ * Mean-pool a tensor shaped [1, tokens, dims], [tokens, dims] or [dims] (already pooled) → Float32Array of length dims.
+ */
+function _meanPool(tensor) {
+  const shape = tensor.dims, data = tensor.data;
+  // Last axis = embedding size; the axis before it (when present) is the token axis that gets averaged.
+  const dims = shape[shape.length - 1];
+  const tokens = shape.length > 1 ? shape[shape.length - 2] : 1;
+  const out = new Float32Array(dims);
+  for (let t = 0; t < tokens; t++) {
+    for (let d = 0; d < dims; d++) out[d] += data[t * dims + d];
+  }
+  for (let d = 0; d < dims; d++) out[d] /= tokens;
+  return out;
+}
+
+/**
+ * Embedding-based category lookup for `productName` (async).
+ * Resolves to the closest category key, or to null when the pipeline is missing, the best
+ * cosine similarity is below 0.30, the winner is 'altro', or the embedding call throws.
+ */
+async function classifyCategoryByEmbedding(productName) {
+  if (!productName) return null;
+  const key = productName.toLowerCase().trim();
+  if (_embeddingCache.has(key)) return _embeddingCache.get(key);
+
+  if (typeof window._getCategoryPipeline !== 'function') return null;
+  const extractor = await window._getCategoryPipeline();
+  if (!extractor) return null;
+
+  // Entries exposing `dims` are treated as tensors and pooled; anything else is copied as-is.
+  const toVector = (entry) => (entry.dims ? _meanPool(entry) : new Float32Array(entry.data ?? entry));
+
+  try {
+    const categoryKeys = Object.keys(_CATEGORY_DESCRIPTIONS);
+    const anchorTexts = categoryKeys.map((cat) => _CATEGORY_DESCRIPTIONS[cat]);
+
+    // One batched call: index 0 is the query, indices 1..n are the category anchors
+    const embedded = await extractor([key, ...anchorTexts], { pooling: 'mean', normalize: true });
+    const anchorVecs = categoryKeys.map((_, idx) => toVector(embedded[idx + 1]));
+    const queryVec = toVector(embedded[0]);
+
+    let winner = null, winnerSim = 0;
+    categoryKeys.forEach((cat, idx) => {
+      const sim = _cosineSim(queryVec, anchorVecs[idx]);
+      if (sim > winnerSim) {
+        winnerSim = sim;
+        winner = cat;
+      }
+    });
+
+    // Low-similarity and 'altro' outcomes are cached as null; thrown errors are not cached.
+    const category = (winnerSim >= 0.30 && winner !== 'altro') ? winner : null;
+    _embeddingCache.set(key, category);
+    return category;
+  } catch (err) {
+    console.warn('[EverShelf] Embedding classify error:', err);
+    return null;
+  }
+}
+
// Determine safety level for expired products
// Returns { level: 'danger'|'warning'|'ok', icon, label, tip }
function getExpiredSafety(item, daysExpired) {
@@ -2024,7 +2124,12 @@ function showPage(pageId, param = null) {
}
loadInventory();
break;
- case 'scan': initScanner(); clearQuickNameResults(); updateSpesaBanner(); break;
+ case 'scan': initScanner(); clearQuickNameResults(); updateSpesaBanner();
+ // Pre-warm the embedding model the first time user visits scan page
+ if (typeof window._getCategoryPipeline === 'function' && !window._categoryPipelineReady) {
+ window._getCategoryPipeline(); // fire-and-forget
+ }
+ break;
case 'products': loadAllProducts(); break;
case 'shopping': loadShoppingList(); break;
case 'recipe': loadRecipeArchive(); break;
@@ -4470,7 +4575,7 @@ function selectQuickProduct(product) {
async function createQuickProduct(name) {
showLoading(true);
- // Auto-detect category from name
+ // Auto-detect category from name (sync regex first)
const category = guessCategoryFromName(name);
try {
@@ -4494,6 +4599,27 @@ async function createQuickProduct(name) {
showLoading(false);
clearQuickNameResults();
showToast('Prodotto creato!', 'success');
+
+ // If regex gave 'altro', try embedding in background and silently update
+ if (category === 'altro' && typeof classifyCategoryByEmbedding === 'function') {
+ classifyCategoryByEmbedding(name).then(async embCat => {
+ if (!embCat || !result.id) return;
+ try {
+ await api('product_save', {}, 'POST', {
+ id: result.id,
+ name: name,
+ brand: '',
+ category: embCat,
+ unit: 'pz',
+ default_quantity: 1,
+ });
+ if (currentProduct && currentProduct.id === result.id) {
+ currentProduct.category = embCat;
+ }
+ } catch (_) { /* silent */ }
+ });
+ }
+
showProductAction();
} else {
showLoading(false);
@@ -4614,6 +4740,20 @@ function autoDetectCategory() {
return;
}
}
+
+ // ── Embedding fallback: async, only when keywords didn't match ──────────
+ // Kick off model load (no-op if already loaded/loading) and update the
+ // select once the result is ready. Only runs when pipeline is available.
+ if (typeof classifyCategoryByEmbedding === 'function') {
+ classifyCategoryByEmbedding(document.getElementById('pf-name').value).then(embCat => {
+ if (!embCat) return;
+ // Re-check manuallySet — user might have picked something while awaiting
+ const sel = document.getElementById('pf-category');
+ if (!sel || sel.dataset.manuallySet === 'true') return;
+ sel.value = embCat;
+ onCategoryChange(true);
+ });
+ }
}
function onCategoryChange(fromAutoDetect = false) {
diff --git a/index.html b/index.html
index 7f44b00..b2e81af 100644
--- a/index.html
+++ b/index.html
@@ -14,6 +14,39 @@
+
+