([^<]+)<\/title>/i); const title = titleMatch ? titleMatch[1].trim() : url; const text = extractTextFromHtml(html); if (!text || text.length < 50) { throw new Error("Could not extract meaningful text from this URL. It might be a single-page app or protected."); } return { text, title }; } catch (error: any) { console.error(`[Scraper] Failed to scrape ${url}:`, error.message); throw new Error(`Failed to scrape URL: ${error.message}`); } }

import axios from 'axios';

/**
 * Replacement text for the small set of HTML entities this module decodes.
 * Keys are the full entity tokens as they appear in markup.
 */
const BASIC_ENTITIES: Readonly<Record<string, string>> = {
  '&nbsp;': ' ',
  '&lt;': '<',
  '&gt;': '>',
  '&amp;': '&',
  '&quot;': '"',
  '&#39;': "'",
};

/**
 * Basic HTML to text extractor using regex.
 * While not perfect, it avoids adding heavy dependencies like cheerio or jsdom.
 *
 * Steps, in order:
 * 1. drop `<script>` and `<style>` elements together with their contents;
 * 2. turn the opening tags of common block elements into newlines;
 * 3. replace every remaining tag with a space;
 * 4. decode a handful of basic entities (see `BASIC_ENTITIES`);
 * 5. collapse blank lines and runs of spaces/tabs, then trim.
 *
 * @param html - Raw HTML markup.
 * @returns The extracted plain text.
 */
function extractTextFromHtml(html: string): string {
  // The opening-tag part of each pattern is required: without it the match
  // would start at the first ">" in the document and swallow real content.
  const withoutCode = html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, ' ');

  const withoutTags = withoutCode
    // Replace common block tags with newlines so paragraphs stay separated.
    .replace(/<(p|div|br|h1|h2|h3|h4|h5|h6|li|tr)[^>]*>/gi, '\n')
    // Strip all other HTML tags.
    .replace(/<[^>]+>/g, ' ');

  // Decode in a single pass so that already-escaped text such as "&amp;lt;"
  // becomes "&lt;" and is not decoded a second time into "<".
  const decoded = withoutTags.replace(
    /&(?:nbsp|lt|gt|amp|quot|#39);/g,
    (entity) => BASIC_ENTITIES[entity] ?? entity,
  );

  return decoded
    .replace(/\n\s*\n/g, '\n') // Remove multiple newlines
    .replace(/[ \t]+/g, ' ') // Remove multiple spaces
    .trim();
}

/**
 * Scrapes a URL and returns the text content and page title.
 *
 * The page is fetched with a browser-like User-Agent and a 10 second timeout.
 * The title is the trimmed content of the first `<title>` element; when there
 * is none, or it is blank, the URL itself is used as the title.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted plain text and the page title.
 * @throws Error with a message starting with "Failed to scrape URL:" when the
 *   request fails, the response body is not a string, or fewer than 50
 *   characters of text could be extracted.
 */
export async function scrapeUrl(url: string): Promise<{ text: string; title: string }> {
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      },
      timeout: 10000, // 10s timeout
    });

    const html: unknown = response.data;
    if (typeof html !== 'string') {
      throw new Error('Invalid response from URL (not text/html)');
    }

    const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
    // `||` is deliberate: a whitespace-only title trims to "" and should fall
    // back to the URL just like a missing title does.
    const title = titleMatch?.[1]?.trim() || url;

    const text = extractTextFromHtml(html);
    if (!text || text.length < 50) {
      throw new Error('Could not extract meaningful text from this URL. It might be a single-page app or protected.');
    }

    return { text, title };
  } catch (error: unknown) {
    // Anything can be thrown in JavaScript; only Error instances have a
    // reliable `message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[Scraper] Failed to scrape ${url}:`, message);
    throw new Error(`Failed to scrape URL: ${message}`);
  }
}