feat: offline OCR (Tesseract) + embedding category classifier (@xenova/transformers)
Tesseract OCR (PHP, server-side):
- Dockerfile: adds tesseract-ocr + tesseract-ocr-ita + libgd-dev (gd extension)
- api/index.php: new tesseractReadExpiry() — decodes base64 image, pre-processes with GD (2× upscale, greyscale, auto-contrast, sharpen), runs tesseract CLI with ita+eng PSM-6, extracts date with multi-pattern regex (DD/MM/YYYY, MM/YYYY, ISO, named-month), returns YYYY-MM-DD + confidence
- geminiReadExpiry() now: (1) tries Tesseract first; (2) falls back to Gemini Vision if OCR returns null or no date found; (3) passes source ('ocr'|'gemini') in response
@xenova/transformers embedding classifier (browser-side):
- index.html: ES-module bootstrap that lazy-loads 'Xenova/all-MiniLM-L6-v2' quantized (~23 MB, cached in browser) via window._getCategoryPipeline(); pre-warms on first scan page visit
- assets/js/app.js: classifyCategoryByEmbedding(name) — embeds product name + 16 category anchor descriptions, cosine similarity, threshold 0.30; results cached in _embeddingCache Map
- autoDetectCategory(): after keyword map misses, fires classifyCategoryByEmbedding async and updates select when resolved (respects manuallySet flag)
- createQuickProduct(): if regex returned 'altro', silently patches category with embedding result via a background api call
This commit is contained in:
+33
@@ -14,6 +14,39 @@
|
||||
<link rel="stylesheet" href="assets/css/style.css?v=20260421a">
|
||||
<!-- QuaggaJS for barcode scanning -->
|
||||
<script src="https://cdn.jsdelivr.net/npm/@ericblade/quagga2@1.8.4/dist/quagga.min.js"></script>
|
||||
<!-- @xenova/transformers: ES-module bootstrap that exposes a lazy category classifier via window._getCategoryPipeline() (the shared load promise is cached in window._categoryPipelinePromise) -->
|
||||
<script type="module">
|
||||
// Lazy-load the embedding pipeline only when first needed.
// A dynamic import keeps the transformers.js library (and the model download
// started by pipeline()) off the initial page load.

// Flipped to true once the pipeline has been created successfully.
window._categoryPipelineReady = false;
// Memoised load promise; null until the first _getCategoryPipeline() call.
window._categoryPipelinePromise = null;

/**
 * Return the shared 'feature-extraction' pipeline, starting the load on the
 * first call and reusing the same in-flight/settled promise afterwards.
 *
 * The returned promise never rejects: if the library or the model cannot be
 * loaded, a warning is logged and the promise resolves to null so callers can
 * fall back to the regex classifier. A failed load stays cached for the
 * lifetime of the page (the stored promise is not reset).
 *
 * @returns {Promise<Function|null>} The pipeline created by
 *   pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'), or null when
 *   loading failed.
 */
window._getCategoryPipeline = async function() {
  if (window._categoryPipelinePromise) return window._categoryPipelinePromise;

  window._categoryPipelinePromise = (async () => {
    try {
      // Load the browser bundle from dist/. The src/ tree is unbundled source
      // whose bare specifiers (e.g. 'onnxruntime-web') cannot be resolved by
      // a browser without an import map, so importing it always fails.
      const { pipeline, env } = await import(
        'https://cdn.jsdelivr.net/npm/@xenova/transformers@2/dist/transformers.min.js'
      );

      // Fetch the model from the remote hub and keep the downloaded files in
      // the browser cache. Skip the local-model lookup so the library does not
      // probe the self-hosted instance for model files it does not serve.
      env.allowLocalModels = false;
      env.allowRemoteModels = true;
      env.useBrowserCache = true;

      const pipe = await pipeline(
        'feature-extraction',
        'Xenova/all-MiniLM-L6-v2',
        { quantized: true }
      );

      window._categoryPipelineReady = true;
      return pipe;
    } catch (e) {
      // Best-effort feature: degrade to the regex classifier instead of throwing.
      console.warn('[EverShelf] Embedding model unavailable, regex fallback only:', e);
      return null;
    }
  })();

  return window._categoryPipelinePromise;
};
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user