feat: improved local OCR with Strip & Match distillery detection

- Added comprehensive distillery database (200+ entries)
- Implemented Strip & Match heuristic for fuzzy matching
- Added contextual age detection from distillery lines
- Added whitespace normalization for OCR text
- Disabled local name extraction (too noisy, let Gemini handle it)
- Fixed confidence scale normalization in TastingEditor (0-1 vs 0-100)
- Improved extractName filter (60% letters required)
- Relaxed Fuse.js thresholds for partial matches
2025-12-25 13:14:08 +01:00
parent a1a91795d1
commit afe9197776
17 changed files with 3642 additions and 262 deletions
--- a/src/lib/ocr/local-engine.ts
+++ b/src/lib/ocr/local-engine.ts
@@ -0,0 +1,313 @@
+/**
+ * Local OCR Engine
+ * Client-side OCR using Tesseract.js with Fuse.js fuzzy matching
+ * 
+ * Optimized for whisky label scanning with:
+ * - Image preprocessing (edge crop, resize, grayscale, contrast boost)
+ * - PSM 6 (single block of text)
+ * - Character whitelisting
+ * - Line-by-line "Strip & Match" fuzzy matching of distillery names
+ */
+
+import Tesseract from 'tesseract.js';
+import Fuse from 'fuse.js';
+import { extractNumbers, ExtractedNumbers, preprocessImageForOCR } from './scanner-utils';
+import distilleries from '@/data/distilleries.json';
+
+/**
+ * Metadata extracted from a single label scan by `analyzeLocalOcr`.
+ * Nullable fields stay `null` when nothing usable was recognised.
+ */
+export interface LocalOcrResult {
+    /** Name of the matched entry from the distillery list, if any. */
+    distillery: string | null;
+    /** `region` of the matched distillery entry. */
+    distilleryRegion: string | null;
+    /** Bottle name; local extraction is currently disabled, so `analyzeLocalOcr` leaves this `null`. */
+    name: string | null;
+    /** Age statement in years. */
+    age: number | null;
+    /** Alcohol by volume in percent. */
+    abv: number | null;
+    /** Vintage / distillation year as a four-digit string. */
+    vintage: string | null;
+    /** Text exactly as returned by Tesseract. */
+    rawText: string;
+    /** OCR confidence scaled to the 0-1 range. */
+    confidence: number;
+}
+
+// Fuse.js configuration for fuzzy matching distillery names
+// Balanced matching to catch partial OCR errors while avoiding false positives.
+// findDistillery additionally requires a score below 0.4 and a plausible length ratio.
+const fuseOptions = {
+    keys: ['name'],         // Only the `name` field of each distillery entry is searched
+    threshold: 0.35,        // 0 = exact match, 0.35 = allow some fuzziness
+    distance: 50,           // Fuse.js `distance`: how far from the expected location a match may lie
+    includeScore: true,     // findDistillery reads `score` from each search result
+    minMatchCharLength: 4,  // Minimum chars to match
+};
+
+// Search index built once at module load from the bundled distillery list
+const distilleryFuse = new Fuse(distilleries, fuseOptions);
+
+// Tesseract worker singleton (reused across scans).
+// Created by getWorker() and reset to null by terminateOcrWorker().
+let tesseractWorker: Tesseract.Worker | null = null;
+
+// Character whitelist for whisky labels (no special symbols that cause noise).
+// Passed to Tesseract as `tessedit_char_whitelist` in getWorker().
+const CHAR_WHITELIST = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789%.,:\'"-/ ';
+
+// In-flight worker initialisation, shared by callers that arrive while the
+// first worker is still being created and configured.
+let workerInitPromise: Promise<Tesseract.Worker> | null = null;
+
+/**
+ * Initialize or get the Tesseract worker
+ * Uses local files from /public/tessdata for offline capability
+ *
+ * The worker is only published as the singleton after it has been fully
+ * configured, and concurrent callers share one initialisation instead of
+ * each creating their own worker.
+ *
+ * @returns The configured, reusable Tesseract worker
+ * @throws Whatever worker creation or configuration throws; nothing is cached
+ *   in that case, so the next call retries from scratch
+ */
+async function getWorker(): Promise<Tesseract.Worker> {
+    if (tesseractWorker) {
+        return tesseractWorker;
+    }
+    if (workerInitPromise) {
+        return workerInitPromise;
+    }
+
+    const init = (async (): Promise<Tesseract.Worker> => {
+        console.log('[LocalOCR] Initializing Tesseract worker with local files...');
+
+        // Use local files from /public/tessdata
+        const worker = await Tesseract.createWorker('eng', Tesseract.OEM.LSTM_ONLY, {
+            corePath: '/tessdata/',
+            langPath: '/tessdata/',
+            logger: (m) => {
+                if (m.status === 'recognizing text') {
+                    console.log(`[LocalOCR] Progress: ${Math.round(m.progress * 100)}%`);
+                } else {
+                    console.log(`[LocalOCR] ${m.status}`);
+                }
+            },
+        });
+
+        try {
+            // Configure Tesseract for whisky label OCR
+            await worker.setParameters({
+                tessedit_pageseg_mode: Tesseract.PSM.SINGLE_BLOCK, // PSM 6 - treat as single block of text
+                tessedit_char_whitelist: CHAR_WHITELIST,
+                preserve_interword_spaces: '1',  // Keep word spacing
+            });
+        } catch (error) {
+            // Do not leak (or cache) a worker that could not be configured
+            try {
+                await worker.terminate();
+            } catch (terminateError) {
+                console.warn('[LocalOCR] Failed to terminate unconfigured worker:', terminateError);
+            }
+            throw error;
+        }
+
+        tesseractWorker = worker;
+        console.log('[LocalOCR] Tesseract worker ready (PSM: SINGLE_BLOCK, Whitelist enabled)');
+        return worker;
+    })();
+
+    workerInitPromise = init;
+    try {
+        return await init;
+    } finally {
+        // Success is now served from tesseractWorker; failure must allow a retry
+        workerInitPromise = null;
+    }
+}
+
+/**
+ * Run OCR on an image and extract whisky metadata
+ *
+ * Never rejects: on any failure (including the timeout) the error is logged
+ * and the result collected so far is returned.
+ *
+ * @param imageSource - File or Blob (preprocessed before OCR), or a string that
+ *   is handed to Tesseract unchanged
+ * @param timeoutMs - Maximum time to wait for worker start-up plus recognition
+ *   (default 10000ms)
+ * @returns LocalOcrResult with extracted metadata
+ */
+export async function analyzeLocalOcr(
+    imageSource: File | Blob | string,
+    timeoutMs: number = 10000
+): Promise<LocalOcrResult> {
+    const result: LocalOcrResult = {
+        distillery: null,
+        distilleryRegion: null,
+        name: null,
+        age: null,
+        abv: null,
+        vintage: null,
+        rawText: '',
+        confidence: 0,
+    };
+
+    // Declared outside the try block so the finally clause can always cancel the timer
+    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
+
+    try {
+        // Step 1: Preprocess the image for better OCR
+        let processedImage: string;
+        if (typeof imageSource === 'string') {
+            // Strings cannot be preprocessed here; pass them to Tesseract as-is
+            processedImage = imageSource;
+            console.log('[LocalOCR] Using raw image (string input)');
+        } else {
+            console.log('[LocalOCR] Preprocessing image...');
+            // No options are passed, so preprocessImageForOCR applies its own defaults
+            processedImage = await preprocessImageForOCR(imageSource);
+        }
+
+        // Step 2: Race worker start-up + recognition against the timeout.
+        // Both promises are handed to Promise.race immediately, so neither can
+        // reject without a handler attached.
+        const timeoutPromise = new Promise<never>((_, reject) => {
+            timeoutHandle = setTimeout(() => reject(new Error('OCR timeout')), timeoutMs);
+        });
+        const recognize = async (image: string) => {
+            const worker = await getWorker();
+            return worker.recognize(image);
+        };
+        const ocrResult = await Promise.race([recognize(processedImage), timeoutPromise]);
+
+        result.rawText = ocrResult.data.text;
+        result.confidence = ocrResult.data.confidence / 100; // Normalize to 0-1
+
+        // Step 3: Extract numbers using regex (this works reliably)
+        const numbers = extractNumbers(result.rawText);
+        result.abv = numbers.abv;
+        result.age = numbers.age;
+        result.vintage = numbers.vintage;
+
+        // Step 4: Fuzzy match distillery ("Strip & Match" with sanity checks)
+        const distilleryMatch = findDistillery(result.rawText);
+        if (distilleryMatch) {
+            result.distillery = distilleryMatch.name;
+            result.distilleryRegion = distilleryMatch.region;
+
+            // Use contextual age if regex extraction failed
+            if (result.age === null && distilleryMatch.contextualAge !== undefined) {
+                result.age = distilleryMatch.contextualAge;
+                console.log(`[LocalOCR] Using contextual age: ${result.age}`);
+            }
+        }
+
+        // NOTE: Name extraction disabled - Tesseract too noisy for full bottle names.
+        // result.name intentionally stays null; Gemini Vision handles the name field.
+
+        // Detailed logging for debugging
+        const cleanedText = result.rawText
+            .split('\n')
+            .map(line => line.trim())
+            .filter(line => line.length > 0)
+            .join(' | ');
+
+        console.log('[LocalOCR] ========== OCR RESULTS ==========');
+        console.log('[LocalOCR] Raw Text:\n', result.rawText);
+        console.log('[LocalOCR] Cleaned Text:', cleanedText);
+        console.log('[LocalOCR] Confidence:', (result.confidence * 100).toFixed(1) + '%');
+        console.log('[LocalOCR] Extracted Data:', {
+            distillery: result.distillery,
+            distilleryRegion: result.distilleryRegion,
+            name: result.name,
+            age: result.age,
+            abv: result.abv,
+            vintage: result.vintage,
+        });
+        console.log('[LocalOCR] ===================================');
+
+        return result;
+
+    } catch (error) {
+        console.warn('[LocalOCR] Analysis failed:', error);
+        return result; // Return partial/empty result
+    } finally {
+        // Stop the timer so it does not fire after we are done
+        clearTimeout(timeoutHandle);
+    }
+}
+
+/**
+ * Find a distillery name in OCR text using fuzzy matching
+ *
+ * Strategy:
+ * 1. Normalize whitespace (fix Tesseract's formatting gaps)
+ * 2. Split OCR text into lines, filter garbage
+ * 3. "Strip & Match": Remove numbers before Fuse matching (helps with "N NEVIS 27")
+ * 4. Sanity check: match length must be reasonable
+ * 5. Contextual age: if distillery found, look for age in original line
+ *
+ * @param text - Raw OCR text, lines separated by "\n"
+ * @returns The first accepted match (with `contextualAge` when the matching
+ *   line also carries a plausible age), or null when no line matches
+ */
+function findDistillery(text: string): { name: string; region: string; contextualAge?: number } | null {
+    // Split into lines, normalize whitespace, and filter garbage
+    const lines = text
+        .split('\n')
+        .map(line => line.trim().replace(/\s+/g, ' ')) // Normalize whitespace
+        .filter(line => {
+            // Minimum 4 characters
+            if (line.length < 4) return false;
+            // Must have at least 40% letters (lowered to allow lines with numbers)
+            const letters = line.replace(/[^a-zA-Z]/g, '');
+            return letters.length >= line.length * 0.4;
+        });
+
+    console.log('[LocalOCR] Lines for distillery matching:', lines.length);
+
+    // Try to match each line
+    for (const originalLine of lines) {
+        // STRIP & MATCH: Remove numbers for cleaner Fuse matching
+        // "Bad N NEVIS 27" → "Bad N NEVIS"
+        const textOnlyLine = originalLine.replace(/[0-9]/g, '').replace(/\s+/g, ' ').trim();
+
+        if (textOnlyLine.length < 4) continue;
+
+        const [best] = distilleryFuse.search(textOnlyLine);
+        if (!best || best.score === undefined || best.score >= 0.4) continue;
+
+        const match = best.item;
+        const matchScore = best.score;
+
+        // SANITY CHECK: The text-only part should be similar length to distillery name
+        // Max 60% deviation allowed (relaxed for partial matches)
+        const lengthRatio = textOnlyLine.length / match.name.length;
+        const lengthDeviation = Math.abs(1 - lengthRatio);
+
+        if (lengthDeviation > 0.6) {
+            console.log(`[LocalOCR] Match rejected (length): "${textOnlyLine}" → ${match.name} (ratio: ${lengthRatio.toFixed(2)}, deviation: ${(lengthDeviation * 100).toFixed(0)}%)`);
+            continue;
+        }
+
+        // CONTEXTUAL AGE DETECTION: Look for a plausible age (3-60) in the ORIGINAL line
+        const contextualAge = findContextualAge(originalLine);
+        if (contextualAge !== undefined) {
+            console.log(`[LocalOCR] Contextual age detected: ${contextualAge} years`);
+        }
+
+        console.log(`[LocalOCR] Distillery match: "${textOnlyLine}" → ${match.name} (score: ${matchScore.toFixed(3)}, original: "${originalLine}")`);
+        return {
+            name: match.name,
+            region: match.region,
+            contextualAge,
+        };
+    }
+
+    return null;
+}
+
+/**
+ * Pick the first standalone 1-2 digit number between 3 and 60 from a label line.
+ * Percentages ("46%") and decimals ("43.5") are removed first so an ABV on the
+ * same line is not mistaken for an age statement.
+ */
+function findContextualAge(line: string): number | undefined {
+    const withoutAbvLikeNumbers = line
+        .replace(/\d+(?:[.,]\d+)?\s*%/g, ' ')
+        .replace(/\d+[.,]\d+/g, ' ');
+
+    const candidates = withoutAbvLikeNumbers.match(/\b\d{1,2}\b/g) ?? [];
+    for (const candidate of candidates) {
+        const age = parseInt(candidate, 10);
+        if (age >= 3 && age <= 60) {
+            return age;
+        }
+    }
+    return undefined;
+}
+
+/**
+ * Guess the bottle name from OCR text.
+ *
+ * Returns the first trimmed line that looks like readable text, is not merely
+ * the distillery name, does not start with a number, and is not a lone generic
+ * whisky word. Returns null when no line qualifies.
+ */
+function extractName(text: string, distillery: string | null): string | null {
+    const genericWord = /^(single|malt|scotch|whisky|whiskey|aged|years?|proof|edition|distilled|distillery)$/i;
+    const leadingNumber = /^\d+[\s%]+/;
+    const punctuationOnly = /^[\d\s.,\-'"]+$/;
+    const distilleryKey = distillery ? distillery.toLowerCase() : null;
+
+    // Readable = at least 5 chars, at least 60% letters, not only digits/punctuation
+    const isReadable = (line: string): boolean => {
+        if (line.length < 5) return false;
+        const letterCount = line.replace(/[^a-zA-Z]/g, '').length;
+        return letterCount >= line.length * 0.6 && !punctuationOnly.test(line);
+    };
+
+    // A line mentioning the distillery only counts when it carries clearly more text
+    const addsToDistillery = (line: string): boolean => {
+        if (distillery === null || distilleryKey === null) return true;
+        if (!line.toLowerCase().includes(distilleryKey)) return true;
+        return line.length > distillery.length + 5;
+    };
+
+    const winner = text
+        .split('\n')
+        .map(raw => raw.trim())
+        .filter(isReadable)
+        .filter(addsToDistillery)
+        .find(line => !leadingNumber.test(line) && !genericWord.test(line));
+
+    return winner === undefined ? null : winner;
+}
+
+/**
+ * Terminate the Tesseract worker (call on cleanup)
+ *
+ * The singleton is cleared before termination is awaited, so a rejected
+ * `terminate()` can never leave a dead worker cached for later scans.
+ * Does nothing when no worker exists.
+ */
+export async function terminateOcrWorker(): Promise<void> {
+    const worker = tesseractWorker;
+    if (!worker) {
+        return;
+    }
+
+    tesseractWorker = null;
+    await worker.terminate();
+}
--- a/src/lib/ocr/scanner-utils.ts
+++ b/src/lib/ocr/scanner-utils.ts
@@ -0,0 +1,312 @@
+/**
+ * Scanner Utilities
+ * Cache checking and helper functions for client-side OCR
+ */
+
+/**
+ * Report whether client-side OCR can run right now.
+ *
+ * - No `window` (e.g. server-side): false
+ * - Browser reports online: true
+ * - Offline: true only when both the Tesseract core wasm and the English
+ *   language data are found via the Cache API
+ *
+ * @returns Promise<boolean> - true if OCR can run
+ */
+export async function isTesseractReady(): Promise<boolean> {
+    const inBrowser = typeof window !== 'undefined';
+    if (!inBrowser) {
+        return false;
+    }
+
+    // Online: nothing else is checked
+    if (navigator.onLine) {
+        console.log('[Scanner] Online - tesseract will use CDN');
+        return true;
+    }
+
+    // Offline: the Cache API is the only way to find the required files
+    const hasCacheApi = 'caches' in window;
+    if (!hasCacheApi) {
+        console.log('[Scanner] Offline + no cache API - tesseract not ready');
+        return false;
+    }
+
+    try {
+        // File names match those served from /public/tessdata
+        const cachedCore = await window.caches.match('/tessdata/tesseract-core-simd.wasm');
+        const cachedLanguage = await window.caches.match('/tessdata/eng.traineddata');
+
+        const coreFound = !!cachedCore;
+        const languageFound = !!cachedLanguage;
+        const ready = coreFound && languageFound;
+        console.log('[Scanner] Offline cache check:', { wasmMatch: coreFound, langMatch: languageFound, ready });
+        return ready;
+    } catch (error) {
+        console.warn('[Scanner] Cache check failed:', error);
+        return false;
+    }
+}
+
+/**
+ * Numeric values that `extractNumbers` pulls out of OCR text.
+ * Each field is `null` when no plausible value was found.
+ */
+export interface ExtractedNumbers {
+    /** Alcohol by volume in percent (accepted range 35-75). */
+    abv: number | null;
+    /** Age statement in years (accepted range 3-60). */
+    age: number | null;
+    /** Vintage year as matched in the text (1970 up to the current year). */
+    vintage: string | null;
+}
+
+/**
+ * Extract ABV, age statement and vintage year from OCR text using regex patterns.
+ *
+ * For each field the patterns are tried in priority order, and every match of
+ * a pattern is considered until one falls inside the plausible range, so an
+ * early out-of-range hit (e.g. "100% Islay") no longer hides a later valid
+ * value ("46%").
+ *
+ * @param text - Raw OCR text; empty input yields all-null fields
+ * @returns Extracted values; a field is null when nothing plausible was found
+ */
+export function extractNumbers(text: string): ExtractedNumbers {
+    const result: ExtractedNumbers = {
+        abv: null,
+        age: null,
+        vintage: null
+    };
+
+    if (!text) return result;
+
+    // Repair common OCR digit confusions for ABV parsing (O -> 0, l/I -> 1), then lowercase.
+    // This rewrites keywords too: "vol" becomes "v01", which the patterns below account for.
+    const normalizedText = text
+        .replace(/[oO]/g, '0')
+        .replace(/[lI]/g, '1')
+        .toLowerCase();
+
+    // ABV patterns: "43%", "43.5%", "43,5 %", "ABV 43", "vol. 43"
+    const abvPatterns = [
+        /(\d{2}[.,]\d{1,2})\s*%/,           // 43.5% or 43,5 %
+        /(\d{2})\s*%/,                        // 43%
+        /abv[:\s]*(\d{2}[.,]?\d{0,2})/,       // ABV: 43 or ABV 43.5
+        /v01[.\s]*(\d{2}[.,]?\d{0,2})/,       // vol. 43 ("vol" after normalization)
+        /(\d{2}[.,]\d{1,2})\s*v01/,           // 43.5 vol ("vol" after normalization)
+    ];
+    const abv = firstCaptureInRange(
+        normalizedText,
+        abvPatterns,
+        raw => parseFloat(raw.replace(',', '.')),
+        35,  // Reasonable whisky ABV range
+        75
+    );
+    if (abv) {
+        result.abv = abv.value;
+    }
+
+    // Age patterns: "12 years", "12 year old", "12 YO", "aged 12"
+    const agePatterns = [
+        /(\d{1,2})\s*(?:years?|yrs?|y\.?o\.?|jahre?)/i,
+        /aged\s*(\d{1,2})/i,
+        /(\d{1,2})\s*year\s*old/i,
+    ];
+    const age = firstCaptureInRange(text, agePatterns, raw => parseInt(raw, 10), 3, 60);
+    if (age) {
+        result.age = age.value;
+    }
+
+    // Vintage patterns: "1990", "Vintage 1990", "Distilled 1990"
+    const vintagePatterns = [
+        /(?:vintage|distilled|dist\.?)\s*(19\d{2}|20[0-2]\d)/i,
+        /\b(19[789]\d|20[0-2]\d)\b/,  // Years 1970-2029
+    ];
+    const currentYear = new Date().getFullYear();
+    const vintage = firstCaptureInRange(text, vintagePatterns, raw => parseInt(raw, 10), 1970, currentYear);
+    if (vintage) {
+        result.vintage = vintage.raw;
+    }
+
+    return result;
+}
+
+/**
+ * Return the first capture group (patterns in order, matches left to right)
+ * whose parsed value lies within [min, max], or null when there is none.
+ */
+function firstCaptureInRange(
+    source: string,
+    patterns: readonly RegExp[],
+    parse: (raw: string) => number,
+    min: number,
+    max: number
+): { raw: string; value: number } | null {
+    for (const pattern of patterns) {
+        // Scan with a global copy so every occurrence is visited, not just the first
+        const flags = pattern.flags.includes('g') ? pattern.flags : pattern.flags + 'g';
+        const scanner = new RegExp(pattern.source, flags);
+
+        let match = scanner.exec(source);
+        while (match !== null) {
+            const raw = match[1];
+            if (raw !== undefined) {
+                const value = parse(raw);
+                if (value >= min && value <= max) {
+                    return { raw, value };
+                }
+            }
+            match = scanner.exec(source);
+        }
+    }
+    return null;
+}
+
+/**
+ * Convert an image blob to a base64 data URL string
+ *
+ * @param blob - Image data to encode
+ * @returns The blob's content as a data URL (via `FileReader.readAsDataURL`)
+ * @throws Rejects with an Error (or the reader's DOMException) when reading
+ *   fails, is aborted, or does not produce a string
+ */
+export function imageToBase64(blob: Blob): Promise<string> {
+    return new Promise((resolve, reject) => {
+        const reader = new FileReader();
+        reader.onload = () => {
+            if (typeof reader.result === 'string') {
+                resolve(reader.result);
+            } else {
+                reject(new Error('Failed to convert image to base64'));
+            }
+        };
+        // Reject with a real error object instead of the raw ProgressEvent
+        reader.onerror = () => {
+            reject(reader.error ?? new Error('Failed to read image'));
+        };
+        // Without this the promise would never settle if the read is aborted
+        reader.onabort = () => {
+            reject(new Error('Image read was aborted'));
+        };
+        reader.readAsDataURL(blob);
+    });
+}
+
+/**
+ * Report the browser's online state.
+ * Returns false when no `navigator` object exists.
+ */
+export function isOnline(): boolean {
+    if (typeof navigator === 'undefined') {
+        return false;
+    }
+    return navigator.onLine;
+}
+
+/**
+ * Options for image preprocessing
+ * Every field is optional; `preprocessImageForOCR` fills in the defaults noted below.
+ */
+export interface PreprocessOptions {
+    /** Fraction cropped from the left and right edges (0-0.25) to remove bottle curves. Default: 0.05 */
+    edgeCrop?: number;
+    /** Target height in pixels for resizing (width follows the aspect ratio). Default: 1200 */
+    targetHeight?: number;
+    /** Apply binarization (hard black/white, threshold 128). Default: false */
+    binarize?: boolean;
+    /** Contrast boost factor (1.0 = no change). Default: 1.3 */
+    contrastBoost?: number;
+    /** Apply 3x3 sharpening. Default: false (creates noise on photos) */
+    sharpen?: boolean;
+}
+
+/**
+ * Preprocess an image for better OCR results
+ *
+ * Steps, in order:
+ * 1. Crop the left/right edges (removes curved bottle edges)
+ * 2. Resize to `targetHeight`, keeping the aspect ratio
+ * 3. Grayscale conversion
+ * 4. Optional sharpening (off by default)
+ * 5. Contrast enhancement
+ * 6. Optional binarization (off by default)
+ *
+ * @param imageSource - File, Blob, or HTMLImageElement
+ * @param options - Preprocessing options
+ * @returns Promise<string> - Preprocessed image as a PNG data URL
+ * @throws Error when the image has no pixels, `targetHeight` is not a positive
+ *   number, or no 2D canvas context can be obtained
+ */
+export async function preprocessImageForOCR(
+    imageSource: File | Blob | HTMLImageElement,
+    options: PreprocessOptions = {}
+): Promise<string> {
+    const {
+        edgeCrop = 0.05,        // Remove 5% from each edge (minimal)
+        targetHeight = 1200,    // High resolution
+        binarize = false,       // Don't binarize by default
+        contrastBoost = 1.3,    // 30% contrast boost
+        sharpen = false,        // Disabled - creates noise on photos
+    } = options;
+
+    // Load image into an HTMLImageElement if not already
+    const img = imageSource instanceof HTMLImageElement
+        ? imageSource
+        : await loadImageFromBlob(imageSource);
+
+    // drawImage() source rectangles are measured in the image's intrinsic pixels,
+    // which `width`/`height` do not reflect when the element is sized by attributes or CSS.
+    const sourceWidth = img.naturalWidth || img.width;
+    const sourceHeight = img.naturalHeight || img.height;
+    if (sourceWidth <= 0 || sourceHeight <= 0) {
+        throw new Error('Cannot preprocess an image without pixel data');
+    }
+    if (!Number.isFinite(targetHeight) || targetHeight < 1) {
+        throw new Error('targetHeight must be a positive number');
+    }
+
+    // Create canvas
+    const canvas = document.createElement('canvas');
+    const ctx = canvas.getContext('2d');
+    if (!ctx) {
+        throw new Error('2D canvas context is not available');
+    }
+
+    // Calculate crop dimensions (remove edges to focus on center).
+    // Clamp to the documented 0-0.25 range so the crop can never consume the whole image.
+    const safeEdgeCrop = Number.isFinite(edgeCrop) ? Math.min(0.25, Math.max(0, edgeCrop)) : 0.05;
+    const cropX = Math.floor(sourceWidth * safeEdgeCrop);
+    const cropWidth = sourceWidth - (cropX * 2);
+    const cropHeight = sourceHeight;
+
+    // Calculate resize dimensions (maintain aspect ratio); whole pixels, at least 1 wide
+    const newHeight = Math.floor(targetHeight);
+    const scale = newHeight / cropHeight;
+    const newWidth = Math.max(1, Math.floor(cropWidth * scale));
+
+    canvas.width = newWidth;
+    canvas.height = newHeight;
+
+    // Draw cropped & resized image
+    ctx.drawImage(
+        img,
+        cropX, 0, cropWidth, cropHeight,  // Source: center crop
+        0, 0, newWidth, newHeight          // Destination: full canvas
+    );
+
+    // Get pixel data for processing (RGBA, 4 bytes per pixel)
+    const imageData = ctx.getImageData(0, 0, newWidth, newHeight);
+    const data = imageData.data;
+
+    // First pass: Convert to grayscale (Rec. 709 luma weights)
+    for (let i = 0; i < data.length; i += 4) {
+        const r = data[i];
+        const g = data[i + 1];
+        const b = data[i + 2];
+        const gray = 0.2126 * r + 0.7152 * g + 0.0722 * b;
+        data[i] = data[i + 1] = data[i + 2] = gray;
+    }
+
+    // Apply sharpening using a 3x3 kernel
+    if (sharpen) {
+        // Read from an untouched copy so already-sharpened pixels do not feed back in
+        const tempData = new Uint8ClampedArray(data);
+        // Sharpen kernel: enhances edges
+        // [ 0, -1,  0]
+        // [-1,  5, -1]
+        // [ 0, -1,  0]
+        const kernel = [0, -1, 0, -1, 5, -1, 0, -1, 0];
+
+        // The outermost 1px border is skipped because the kernel needs all 8 neighbours
+        for (let y = 1; y < newHeight - 1; y++) {
+            for (let x = 1; x < newWidth - 1; x++) {
+                let sum = 0;
+                for (let ky = -1; ky <= 1; ky++) {
+                    for (let kx = -1; kx <= 1; kx++) {
+                        const idx = ((y + ky) * newWidth + (x + kx)) * 4;
+                        const ki = (ky + 1) * 3 + (kx + 1);
+                        sum += tempData[idx] * kernel[ki];
+                    }
+                }
+                const idx = (y * newWidth + x) * 4;
+                const clamped = Math.min(255, Math.max(0, sum));
+                data[idx] = data[idx + 1] = data[idx + 2] = clamped;
+            }
+        }
+    }
+
+    // Second pass: Apply contrast enhancement around mid-gray (128)
+    for (let i = 0; i < data.length; i += 4) {
+        let gray = data[i];
+        gray = ((gray - 128) * contrastBoost) + 128;
+        gray = Math.min(255, Math.max(0, gray));
+
+        if (binarize) {
+            gray = gray >= 128 ? 255 : 0;
+        }
+
+        data[i] = data[i + 1] = data[i + 2] = gray;
+    }
+
+    // Put processed data back
+    ctx.putImageData(imageData, 0, 0);
+
+    console.log('[PreprocessOCR] Image preprocessed:', {
+        original: `${sourceWidth}x${sourceHeight}`,
+        cropped: `${cropWidth}x${cropHeight} (${(safeEdgeCrop * 100).toFixed(0)}% edge crop)`,
+        final: `${newWidth}x${newHeight}`,
+        sharpen,
+        contrastBoost,
+        mode: binarize ? 'binarized' : 'grayscale',
+    });
+
+    return canvas.toDataURL('image/png');
+}
+
+/**
+ * Decode a Blob/File into an HTMLImageElement via a temporary object URL.
+ * The object URL is revoked as soon as loading succeeds or fails.
+ */
+function loadImageFromBlob(blob: Blob): Promise<HTMLImageElement> {
+    return new Promise<HTMLImageElement>((resolve, reject) => {
+        const image = new Image();
+        const objectUrl = URL.createObjectURL(blob);
+        const release = (): void => {
+            URL.revokeObjectURL(objectUrl);
+        };
+
+        image.onload = () => {
+            release();
+            resolve(image);
+        };
+        image.onerror = () => {
+            release();
+            reject(new Error('Failed to load image'));
+        };
+
+        // Assigning src starts the load
+        image.src = objectUrl;
+    });
+}