Refactor: Centralized Supabase Auth and implemented Auth Guards to prevent 401 errors
This commit is contained in:
222
scripts/scrape-distillery-tags.ts
Normal file
222
scripts/scrape-distillery-tags.ts
Normal file
@@ -0,0 +1,222 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { createClient } from '@supabase/supabase-js';
|
||||
|
||||
// --- CONFIGURATION ---

/** OpenRouter model identifier sent with every chat-completion request. */
const OPENROUTER_MODEL = 'xiaomi/mimo-v2-flash:free';

/** File the scraped per-distillery tag results are written to and resumed from. */
const TAGS_OUTPUT_FILE = 'distillery_tags_results.json';

// NOTE(review): the four constants below are declared but not referenced by any
// code visible in this file (the runner processes one distillery at a time,
// saves after every success and uses its own fixed delay). They are kept
// unchanged in case other code relies on them — confirm before removing.

/** Name reserved for a checkpoint file. */
const CHECKPOINT_FILE = 'distillery_tags_checkpoint.json';

/** Intended number of distilleries to process between checkpoint saves. */
const BATCH_SIZE = 5;

/** Intended upper bound on concurrent OpenRouter requests. */
const CONCURRENCY = 2;

/** Intended pause between batches, in milliseconds, to stay under rate limits. */
const DELAY_MS = 1000;
|
||||
|
||||
// --- ENVIRONMENT SETUP ---
|
||||
/**
 * Loads `KEY=VALUE` pairs from a dotenv-style file into `process.env`.
 *
 * Parsing rules:
 * - blank lines and lines starting with `#` are ignored;
 * - the key is everything before the first `=`, the value everything after it
 *   (so values may themselves contain `=`); both are trimmed;
 * - one pair of matching surrounding quotes (`"…"` or `'…'`) is removed from the value;
 * - lines without `=` or with an empty key are ignored.
 *
 * Values found in the file overwrite variables already present in `process.env`.
 * If the file does not exist, an error is printed and the process exits with code 1.
 *
 * @param envPath Path of the env file to read. Defaults to `.env.local`.
 */
function loadEnv(envPath = '.env.local'): void {
  if (!fs.existsSync(envPath)) {
    console.error(`❌ ${envPath} not found`);
    process.exit(1);
  }

  const content = fs.readFileSync(envPath, 'utf8');

  // Accept both LF and CRLF line endings.
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line === '' || line.startsWith('#')) continue;

    const separator = line.indexOf('=');
    // -1: no '=' on the line; 0: the key would be empty.
    if (separator <= 0) continue;

    const key = line.slice(0, separator).trim();
    if (key === '') continue;

    let value = line.slice(separator + 1).trim();

    // Drop one pair of matching surrounding quotes so KEY="abc" yields abc.
    const first = value[0];
    if (value.length >= 2 && (first === '"' || first === "'") && value.endsWith(first)) {
      value = value.slice(1, -1);
    }

    process.env[key] = value;
  }
}
|
||||
// Populate process.env from .env.local before any variable is read.
loadEnv();

/**
 * Returns the value of a required environment variable.
 * Prints an error and exits with code 1 when the variable is missing or empty,
 * so every required setting fails the same way instead of surfacing later as
 * an opaque client or HTTP error.
 */
function requireEnv(name: string): string {
  const value = process.env[name];
  if (!value) {
    console.error(`❌ ${name} not found in .env.local`);
    process.exit(1);
  }
  return value;
}

const supabase = createClient(
  requireEnv('NEXT_PUBLIC_SUPABASE_URL'),
  requireEnv('SUPABASE_SERVICE_ROLE_KEY') // Use Service Role to bypass RLS for bulk import
);

const OPENROUTER_API_KEY = requireEnv('OPENROUTER_API_KEY');

console.log('🎬 Script file loaded. Starting execution...');
|
||||
|
||||
// --- UTILS ---
|
||||
/**
 * Returns a promise that settles once `ms` milliseconds have elapsed.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 */
async function sleep(ms: number) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Calls `fetchTagsForDistillery` up to `retries` times, pausing between
 * attempts with a delay that doubles after every failure.
 *
 * @param name    Distillery name forwarded to the fetcher.
 * @param region  Distillery region forwarded to the fetcher.
 * @param retries Maximum number of attempts.
 * @param backoff Delay before the second attempt, in milliseconds.
 * @returns The first truthy result from the fetcher, or `null` when every attempt failed.
 */
async function fetchTagsWithRetry(name: string, region: string, retries = 3, backoff = 2000): Promise<any> {
  let waitMs = backoff;

  for (let attempt = 1; attempt <= retries; attempt += 1) {
    const tags = await fetchTagsForDistillery(name, region);
    if (tags) {
      return tags;
    }

    // No pause after the final attempt — the loop simply ends.
    const isLastAttempt = attempt >= retries;
    if (isLastAttempt) {
      continue;
    }

    console.log(`⏳ Rate limited or error for ${name}. Retrying in ${waitMs / 1000}s (Attempt ${attempt}/${retries})...`);
    await sleep(waitMs);
    waitMs *= 2; // Exponential backoff
  }

  return null;
}
|
||||
|
||||
/**
 * Removes a single surrounding Markdown code fence (``` or ```json) from model
 * output, returning the trimmed inner text. Text without a fence is returned trimmed.
 */
function stripCodeFence(text: string): string {
  const fenced = /^\s*```(?:json)?\s*\n?([\s\S]*?)\n?\s*```\s*$/i.exec(text);
  return (fenced?.[1] ?? text).trim();
}

/**
 * Asks the configured OpenRouter model for characteristic tasting tags of one
 * distillery and returns the parsed JSON object.
 *
 * Every failure path (rate limit, HTTP error, missing content, unparsable or
 * non-object JSON, network exception) is logged where useful and reported as
 * `null`, which the retry wrapper treats as "try again".
 *
 * @param name   Distillery name, interpolated into the prompt.
 * @param region Distillery region, interpolated into the prompt.
 * @returns The JSON object produced by the model, or `null` on any failure.
 */
async function fetchTagsForDistillery(name: string, region: string): Promise<Record<string, unknown> | null> {
  console.log(`🔍 Processing: ${name} (${region})...`);

  const prompt = [
    `Analyze the whisky distillery "${name}" from the "${region}" region.`,
    "Provide a comprehensive list of characteristic tasting tags (aroma and flavor notes) that are typical for this distillery's core range.",
    "Break them down into these four categories: 'nose', 'taste', 'finish', and 'texture'.",
    "Be extremely detailed and specific to this distillery's DNA.",
    'Aim for at least 8-10 tags per category.',
    '',
    'Output ONLY a valid JSON object in this format:',
    '{',
    '  "nose": ["tag1", "tag2", ...],',
    '  "taste": ["tag1", "tag2", ...],',
    '  "finish": ["tag1", "tag2", ...],',
    '  "texture": ["tag1", "tag2", ...]',
    '}',
  ].join('\n');

  try {
    const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${OPENROUTER_API_KEY}`,
        'HTTP-Referer': 'https://whiskyvault.app',
        'X-Title': 'Whisky Vault Scraper'
      },
      body: JSON.stringify({
        model: OPENROUTER_MODEL,
        messages: [{ role: 'user', content: prompt }],
        response_format: { type: 'json_object' }
      })
    });

    // Read the body as text first: an error response may not be JSON, and
    // parsing it blindly would replace the HTTP status with a SyntaxError.
    const rawBody = await response.text();
    let data: any = null;
    try {
      data = JSON.parse(rawBody);
    } catch {
      data = null;
    }

    if (!response.ok) {
      // Check for rate limit specifically
      if (response.status === 429 || data?.error?.code === 429) {
        return null; // Trigger retry
      }
      console.error(`❌ API Error for ${name}: ${response.status}`, data?.error || data || rawBody.slice(0, 500));
      return null;
    }

    const content: unknown = data?.choices?.[0]?.message?.content;

    if (typeof content !== 'string' || !content) {
      // A 2xx response can still carry an error object instead of choices.
      if (data?.error) {
        console.error(`⚠️ OpenRouter Error for ${name}:`, data.error.message);
        return null;
      }
      console.error(
        `⚠️ No content returned for ${name}. Full response:`,
        data === null ? rawBody.slice(0, 500) : JSON.stringify(data, null, 2)
      );
      return null;
    }

    // Models sometimes wrap the JSON in a Markdown code fence; unwrap before parsing.
    const parsed: unknown = JSON.parse(stripCodeFence(content));

    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      console.error(`⚠️ Unexpected JSON shape for ${name}:`, content.slice(0, 200));
      return null;
    }

    return parsed as Record<string, unknown>;
  } catch (error) {
    console.error(`❌ Fetch Exception for ${name}:`, error);
    return null;
  }
}
|
||||
|
||||
// --- MAIN RUNNER ---
|
||||
/**
 * Entry point: scrapes tasting tags for every distillery listed in
 * `src/data/distilleries.json`, persisting results to `TAGS_OUTPUT_FILE`
 * after each success, then hands everything to `consolidateAndPush`.
 *
 * Resuming: results already present in `TAGS_OUTPUT_FILE` are loaded and any
 * distillery that already has an entry is skipped individually, so a failure
 * in the middle of a previous run does not cause later, already-saved
 * distilleries to be fetched again.
 */
async function main(): Promise<void> {
  const distilleriesPath = path.join(process.cwd(), 'src/data/distilleries.json');
  const distilleries: Array<{ name: string; region: string }> = JSON.parse(
    fs.readFileSync(distilleriesPath, 'utf8')
  );

  let processedResults: Record<string, unknown> = {};

  // Load progress
  if (fs.existsSync(TAGS_OUTPUT_FILE)) {
    processedResults = JSON.parse(fs.readFileSync(TAGS_OUTPUT_FILE, 'utf8'));
    let firstPending = distilleries.findIndex(d => !processedResults[d.name]);
    if (firstPending === -1) firstPending = distilleries.length;
    console.log(`🔄 Resuming from index ${firstPending} (${distilleries[firstPending]?.name || 'Finished'})...`);
  }

  const total = distilleries.length;
  console.log(`🚀 Starting scraper for ${total} distilleries using ${OPENROUTER_MODEL}`);

  const skipped: string[] = [];
  const tmpOutputFile = `${TAGS_OUTPUT_FILE}.tmp`;

  // Process sequentially for free models to avoid heavy rate limits
  for (const [i, d] of distilleries.entries()) {
    // Already scraped in a previous run: no request, no delay.
    if (processedResults[d.name]) continue;

    const result = await fetchTagsWithRetry(d.name, d.region);

    if (result) {
      processedResults[d.name] = result;
      // Save every success to be safe. Write to a temp file and rename so an
      // interrupted write cannot leave a truncated results file behind.
      fs.writeFileSync(tmpOutputFile, JSON.stringify(processedResults, null, 2));
      fs.renameSync(tmpOutputFile, TAGS_OUTPUT_FILE);
      console.log(`✅ [${i + 1}/${total}] Saved: ${d.name}`);
    } else {
      skipped.push(d.name);
      console.error(`⏭️ Skipping ${d.name} after failed retries.`);
    }

    // Small cooling delay between requests
    await sleep(500);
  }

  if (skipped.length > 0) {
    console.error(`⚠️ ${skipped.length} distilleries could not be scraped and will be retried on the next run: ${skipped.join(', ')}`);
  }

  console.log('🎉 Scraping complete! Consolidating and writing to database...');
  await consolidateAndPush(processedResults);
}
|
||||
|
||||
/**
 * Flattens the per-distillery tag lists into unique (name, category) pairs
 * and upserts them into the `tags` table in chunks.
 *
 * Only the categories `nose`, `taste`, `finish` and `texture` are read. A tag
 * is kept when, after trimming, it is non-empty and at most two words long.
 * Entries that are not shaped as expected (a category that is not an array, a
 * tag that is not a string) are ignored rather than aborting the whole run.
 *
 * A failed chunk is logged and the remaining chunks are still attempted.
 *
 * @param allData Map of distillery name to the JSON object returned by the model.
 */
async function consolidateAndPush(allData: Record<string, any>): Promise<void> {
  const categoryNames = ['nose', 'taste', 'finish', 'texture'] as const;

  // Keyed by "category:name". The category comes first because it is drawn
  // from a fixed list without ':' — the name itself may contain ':' and is
  // stored in the value, so it never has to be recovered by splitting the key.
  const uniqueTags = new Map<string, { name: string; category: string }>();

  for (const categories of Object.values(allData)) {
    if (categories === null || typeof categories !== 'object') continue;

    for (const category of categoryNames) {
      const tags: unknown = categories[category];
      if (!Array.isArray(tags)) continue;

      for (const tag of tags) {
        if (typeof tag !== 'string') continue;

        const name = tag.trim();
        // Filter: Only allow tags that are 1 or 2 words long
        const wordCount = name.split(/\s+/).filter(w => w.length > 0).length;
        if (name && wordCount <= 2) {
          // In our schema, a tag is unique per category
          uniqueTags.set(`${category}:${name}`, { name, category });
        }
      }
    }
  }

  console.log(`📊 Found ${uniqueTags.size} unique (Tag, Category) pairs.`);

  const tagsToInsert = Array.from(uniqueTags.values(), ({ name, category }) => ({
    name,
    category,
    is_system_default: true,
    popularity_score: 3
  }));

  // Chunk database inserts
  const DB_BATCH_SIZE = 100;
  let failedRows = 0;

  for (let i = 0; i < tagsToInsert.length; i += DB_BATCH_SIZE) {
    const chunk = tagsToInsert.slice(i, i + DB_BATCH_SIZE);
    const { error } = await supabase
      .from('tags')
      .upsert(chunk, { onConflict: 'name,category' });

    if (error) {
      failedRows += chunk.length;
      console.error('❌ Database error:', error);
    } else {
      console.log(`📤 Pushed ${i + chunk.length}/${tagsToInsert.length} tags to database.`);
    }
  }

  if (failedRows > 0) {
    console.error(`⚠️ ${failedRows}/${tagsToInsert.length} tags were not written because of database errors.`);
  }

  console.log('🏁 All done!');
}
|
||||
|
||||
// Run the script. Any rejection escaping main() is logged and reflected in the
// process exit status so shells and CI can detect the failure.
main().catch(err => {
  console.error('💨 Script crashed:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user