Best Practices, Error Handling, and Optimization
In this final part, we’ll cover essential best practices for building robust, scalable, and maintainable web scraping applications. You’ll learn advanced error handling techniques, performance optimization strategies, security considerations, and monitoring approaches.
Comprehensive Error Handling
Error Classification System
/**
 * Categories a scraping failure can be filed under.
 *
 * This is a string enum: every member's runtime value is the same text as
 * its name, so values survive JSON serialization unchanged.
 */
export enum ErrorType {
  NETWORK = 'NETWORK',
  AUTHENTICATION = 'AUTHENTICATION',
  RATE_LIMIT = 'RATE_LIMIT',
  PARSING = 'PARSING',
  VALIDATION = 'VALIDATION',
  TIMEOUT = 'TIMEOUT',
  QUOTA_EXCEEDED = 'QUOTA_EXCEEDED',
  SITE_BLOCKED = 'SITE_BLOCKED',
}
/**
 * Error that carries scraping context alongside the usual message/stack.
 *
 * Besides the inherited `message`, each instance records the failure
 * category, the URL involved, an optional HTTP status, whether a retry is
 * worthwhile, and an optional retry hint.
 */
export class ScrapingError extends Error {
  public type: ErrorType;
  public url: string;
  public statusCode?: number;
  public retryable: boolean;
  public retryAfter?: number;

  /**
   * @param message    Human-readable description, forwarded to `Error`.
   * @param type       Failure category.
   * @param url        URL that was being processed.
   * @param statusCode HTTP status, when one is known.
   * @param retryable  Whether retrying may help; defaults to `true`.
   * @param retryAfter Optional retry hint supplied by the caller.
   */
  constructor(
    message: string,
    type: ErrorType,
    url: string,
    statusCode?: number,
    retryable: boolean = true,
    retryAfter?: number
  ) {
    super(message);
    this.type = type;
    this.url = url;
    this.statusCode = statusCode;
    this.retryable = retryable;
    this.retryAfter = retryAfter;
    this.name = 'ScrapingError';
  }

  /** Returns a plain object with every field, including `stack`, for logging or serialization. */
  toJSON() {
    const { name, message, type, url, statusCode, retryable, retryAfter, stack } = this;
    return { name, message, type, url, statusCode, retryable, retryAfter, stack };
  }
}
/**
 * Converts raw errors thrown while scraping into classified `ScrapingError`s.
 */
export class ErrorHandler {
  /**
   * Classifies `error` by HTTP status (`error.response.status`) and by
   * Node-style error code (`error.code`).
   *
   * @param error Whatever was thrown; may be null/undefined or lack `response`.
   * @param url   URL that was being scraped when the error occurred.
   * @returns A `ScrapingError`; only the 401 case is marked non-retryable.
   */
  static handleFireCrawlError(error: any, url: string): ScrapingError {
    const status = error?.response?.status;
    const code = error?.code;

    if (status === 429) {
      return new ScrapingError(
        'Rate limit exceeded',
        ErrorType.RATE_LIMIT,
        url,
        429,
        true,
        // Header values arrive as strings; convert to seconds so consumers
        // can do arithmetic on `retryAfter` safely.
        ErrorHandler.parseRetryAfter(error.response.headers?.['retry-after'])
      );
    }

    if (status === 401) {
      return new ScrapingError('Authentication failed', ErrorType.AUTHENTICATION, url, 401, false);
    }

    if (code === 'ENOTFOUND' || code === 'ECONNREFUSED') {
      return new ScrapingError('Network connection failed', ErrorType.NETWORK, url, undefined, true);
    }

    if (code === 'ETIMEDOUT') {
      return new ScrapingError('Request timeout', ErrorType.TIMEOUT, url, undefined, true);
    }

    // Anything unrecognised falls back to a retryable PARSING error.
    return new ScrapingError(
      error?.message || 'Unknown scraping error',
      ErrorType.PARSING,
      url,
      status,
      true
    );
  }

  /**
   * Parses a `Retry-After` value into whole or fractional seconds.
   *
   * Accepts a number, a numeric string (delay in seconds) or an HTTP-date
   * string (converted to seconds from now, never negative). Returns
   * `undefined` for anything else.
   */
  private static parseRetryAfter(value: unknown): number | undefined {
    if (typeof value === 'number') {
      return Number.isFinite(value) && value >= 0 ? value : undefined;
    }
    if (typeof value !== 'string' || value.trim() === '') {
      return undefined;
    }

    const seconds = Number(value);
    if (Number.isFinite(seconds)) {
      return seconds >= 0 ? seconds : undefined;
    }

    const retryAt = Date.parse(value);
    if (Number.isNaN(retryAt)) {
      return undefined;
    }
    return Math.max(0, Math.ceil((retryAt - Date.now()) / 1000));
  }
}

// Retry Strategy Implementation
/** Tuning knobs for `RetryStrategy`. */
interface RetryConfig {
  /** Upper bound on how many times the operation is invoked. */
  maxAttempts: number;
  /** Delay before the first retry, in milliseconds. */
  baseDelay: number;
  /** Ceiling applied to the computed backoff delay, in milliseconds. */
  maxDelay: number;
  /** Factor the delay is multiplied by for each further attempt. */
  backoffMultiplier: number;
  /** When true, the computed delay is randomly scaled down to 50–100% of its value. */
  jitter: boolean;
}
/**
 * Runs an async operation with exponential backoff between failed attempts.
 *
 * Errors are inspected structurally: any thrown object with
 * `retryable === false` aborts immediately, and a positive numeric
 * `retryAfter` (seconds) overrides the computed backoff. `ScrapingError`
 * instances carry both fields.
 */
export class RetryStrategy {
  private config: RetryConfig;

  /**
   * @param config Partial overrides. Keys that are absent or explicitly
   *               `undefined` keep their default value.
   */
  constructor(config: Partial<RetryConfig> = {}) {
    const defaults: RetryConfig = {
      maxAttempts: 3,
      baseDelay: 1000,
      maxDelay: 30000,
      backoffMultiplier: 2,
      jitter: true,
    };
    // Drop undefined overrides so `{ maxAttempts: undefined }` cannot erase a default.
    const overrides = Object.fromEntries(
      Object.entries(config).filter(([, value]) => value !== undefined)
    ) as Partial<RetryConfig>;

    this.config = { ...defaults, ...overrides };
  }

  /**
   * Invokes `operation` until it resolves, the error is non-retryable,
   * `shouldRetry` returns false, or the attempt limit is reached.
   *
   * @param operation   Work to perform; always invoked at least once.
   * @param shouldRetry Predicate consulted after each failure; defaults to always retrying.
   * @returns The first successful result.
   * @throws The error from the final attempt, unchanged.
   */
  async execute<T>(
    operation: () => Promise<T>,
    shouldRetry: (error: Error) => boolean = () => true
  ): Promise<T> {
    const maxAttempts = this.attemptLimit();

    for (let attempt = 1; ; attempt++) {
      try {
        return await operation();
      } catch (error) {
        if (RetryStrategy.isMarkedNonRetryable(error)) {
          throw error;
        }

        if (!shouldRetry(error as Error) || attempt >= maxAttempts) {
          throw error;
        }

        const delay = this.calculateDelay(attempt, error);
        console.log(`⏳ Attempt ${attempt} failed, retrying in ${delay}ms...`);
        await this.sleep(delay);
      }
    }
  }

  /** Configured attempt limit, floored and clamped to at least 1 (NaN also becomes 1). */
  private attemptLimit(): number {
    const limit = Math.floor(this.config.maxAttempts);
    return limit >= 1 ? limit : 1;
  }

  /** True when the thrown value is an object explicitly flagged `retryable: false`. */
  private static isMarkedNonRetryable(error: unknown): boolean {
    return (
      typeof error === 'object' &&
      error !== null &&
      (error as { retryable?: unknown }).retryable === false
    );
  }

  /**
   * Extracts a usable retry hint in seconds from the thrown value.
   * Numeric strings are accepted; non-finite or non-positive values are ignored.
   */
  private static retryAfterSeconds(error: unknown): number | undefined {
    if (typeof error !== 'object' || error === null) {
      return undefined;
    }
    const raw = (error as { retryAfter?: unknown }).retryAfter;
    const seconds = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
    return typeof seconds === 'number' && Number.isFinite(seconds) && seconds > 0
      ? seconds
      : undefined;
  }

  /**
   * Delay in milliseconds before the next attempt.
   *
   * A valid `retryAfter` hint on the error wins and is not capped by
   * `maxDelay`. Otherwise the delay is
   * `baseDelay * backoffMultiplier^(attempt - 1)`, capped at `maxDelay`
   * and, with jitter enabled, scaled by a random factor in [0.5, 1).
   */
  private calculateDelay(attempt: number, error?: unknown): number {
    const hinted = RetryStrategy.retryAfterSeconds(error);
    if (hinted !== undefined) {
      return Math.round(hinted * 1000);
    }

    const { baseDelay, backoffMultiplier, maxDelay, jitter } = this.config;
    let delay = Math.min(baseDelay * Math.pow(backoffMultiplier, attempt - 1), maxDelay);

    if (jitter) {
      delay = delay * (0.5 + Math.random() * 0.5);
    }

    return Math.round(delay);
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Usage example
/**
 * Shared retry strategy: up to 3 attempts, 2 s base delay doubling per
 * attempt, capped at 60 s, with jitter enabled.
 */
export const defaultRetryStrategy = new RetryStrategy({
  maxAttempts: 3,
  baseDelay: 2000,
  maxDelay: 60000,
  backoffMultiplier: 2,
  jitter: true
});

// Performance Optimization
Intelligent Caching System
/** Settings that govern how `SmartCache` expires and evicts entries. */
interface CacheStrategy {
  /** Default lifetime of an entry in milliseconds (added to `Date.now()` on insert). */
  ttl: number;
  /** Number of entries at which an insert triggers eviction. */
  maxSize: number;
  /**
   * Which entry is removed when the cache is full:
   * `'LRU'` – smallest `lastAccessed`; `'LFU'` – lowest access count;
   * `'TTL'` – smallest `createdAt`.
   */
  evictionPolicy: 'LRU' | 'LFU' | 'TTL';
}
/**
 * In-memory cache with per-entry expiry and a configurable eviction policy
 * (LRU, LFU, or oldest-first for `'TTL'`).
 */
export class SmartCache {
  private cache = new Map<string, CacheEntry>();
  private accessCount = new Map<string, number>();
  private strategy: CacheStrategy;
  // Lookup outcomes, tracked separately so getStats() can report a real hit rate.
  private hits = 0;
  private misses = 0;

  constructor(strategy: CacheStrategy) {
    this.strategy = strategy;
  }

  /**
   * Looks up `key`.
   *
   * @returns The cached value, or `null` when the key is absent or expired
   *          (an expired entry is removed as a side effect).
   */
  async get<T>(key: string): Promise<T | null> {
    const entry = this.cache.get(key);

    if (!entry) {
      this.misses++;
      return null;
    }

    if (this.isExpired(entry)) {
      this.delete(key);
      this.misses++;
      return null;
    }

    // Update access statistics
    this.accessCount.set(key, (this.accessCount.get(key) ?? 0) + 1);
    entry.lastAccessed = Date.now();
    this.hits++;

    return entry.data as T;
  }

  /**
   * Stores `data` under `key`, replacing any existing entry.
   *
   * @param customTtl Lifetime in milliseconds for this entry only; `0` is
   *                  honoured (the entry expires on the next clock tick).
   */
  async set<T>(key: string, data: T, customTtl?: number): Promise<void> {
    const ttl = customTtl ?? this.strategy.ttl;
    const now = Date.now();
    const entry: CacheEntry = {
      data,
      createdAt: now,
      lastAccessed: now,
      expiresAt: now + ttl
    };

    // Evict if cache is full. Overwriting an existing key does not grow the
    // cache, and expired entries are dropped before any live one is evicted.
    if (!this.cache.has(key) && this.cache.size >= this.strategy.maxSize) {
      this.purgeExpired();
      if (this.cache.size >= this.strategy.maxSize) {
        this.evict();
      }
    }

    this.cache.set(key, entry);
    this.accessCount.set(key, 1);
  }

  /** Removes one entry chosen by the configured eviction policy, if any exists. */
  private evict(): void {
    const victim = this.selectVictim();
    if (victim !== undefined) {
      this.delete(victim);
    }
  }

  /** Maps the configured policy to its key finder; unknown policies evict nothing. */
  private selectVictim(): string | undefined {
    switch (this.strategy.evictionPolicy) {
      case 'LRU':
        return this.findLRUKey();
      case 'LFU':
        return this.findLFUKey();
      case 'TTL':
        return this.findOldestKey();
      default:
        return undefined;
    }
  }

  /** Removes every entry whose expiry time has passed. */
  private purgeExpired(): void {
    for (const [key, entry] of this.cache) {
      if (this.isExpired(entry)) {
        this.delete(key);
      }
    }
  }

  /**
   * Key whose score is smallest; ties go to the earliest-inserted key.
   * Starting from Infinity guarantees a result whenever the cache is non-empty,
   * even if every entry was written in the current millisecond.
   */
  private keyWithLowest(score: (key: string, entry: CacheEntry) => number): string | undefined {
    let bestKey: string | undefined;
    let bestScore = Infinity;

    for (const [key, entry] of this.cache) {
      const current = score(key, entry);
      if (bestKey === undefined || current < bestScore) {
        bestKey = key;
        bestScore = current;
      }
    }

    return bestKey;
  }

  /** Key with the oldest `lastAccessed` timestamp. */
  private findLRUKey(): string | undefined {
    return this.keyWithLowest((_key, entry) => entry.lastAccessed);
  }

  /** Key with the fewest recorded accesses. */
  private findLFUKey(): string | undefined {
    return this.keyWithLowest(key => this.accessCount.get(key) ?? 0);
  }

  /** Key with the oldest `createdAt` timestamp. */
  private findOldestKey(): string | undefined {
    return this.keyWithLowest((_key, entry) => entry.createdAt);
  }

  private isExpired(entry: CacheEntry): boolean {
    return Date.now() > entry.expiresAt;
  }

  /** Drops the entry and its access counter together so the two maps stay in sync. */
  private delete(key: string): void {
    this.cache.delete(key);
    this.accessCount.delete(key);
  }

  /** Snapshot of current size, capacity, hit rate (percent) and policy. */
  getStats() {
    return {
      size: this.cache.size,
      maxSize: this.strategy.maxSize,
      hitRate: this.calculateHitRate(),
      evictionPolicy: this.strategy.evictionPolicy
    };
  }

  /** Percentage of `get` calls that returned a live entry; 0 before any lookup. */
  private calculateHitRate(): number {
    const lookups = this.hits + this.misses;
    return lookups > 0 ? (this.hits / lookups) * 100 : 0;
  }
}
/** A cached value together with the timestamps (ms since epoch) used for expiry and eviction. */
interface CacheEntry {
  /** The stored value; callers narrow or cast it on read. */
  data: unknown;
  /** When the entry was written. */
  createdAt: number;
  /** When the entry was last written or read. */
  lastAccessed: number;
  /** Instant after which the entry counts as expired. */
  expiresAt: number;
}

// Request Batching and Pooling
/**
 * Priority queue of keyed async requests with a concurrency cap, an optional
 * pause between successive starts, and de-duplication by key.
 */
export class RequestPool {
  /** Keys of requests currently executing. */
  private activeRequests = new Set<string>();
  /** Caller-facing promise for every request that is queued or executing, by key. */
  private inFlight = new Map<string, Promise<unknown>>();
  private requestQueue: QueuedRequest[] = [];
  private processing = false;
  private maxConcurrent: number;
  private requestDelay: number;

  /**
   * @param maxConcurrent Maximum simultaneous requests; values below 1 are raised to 1
   *                      so the queue can always drain.
   * @param requestDelay  Pause in milliseconds between starting consecutive queued requests.
   */
  constructor(maxConcurrent: number = 5, requestDelay: number = 1000) {
    this.maxConcurrent = maxConcurrent >= 1 ? maxConcurrent : 1;
    this.requestDelay = requestDelay;
  }

  /**
   * Schedules `requestFn` under `key`.
   *
   * If a request with the same key is already queued or running, its promise
   * is returned and `requestFn` is not invoked. Higher `priority` runs first;
   * equal priorities run in insertion order.
   *
   * @returns A promise settling with the outcome of the request.
   */
  async addRequest<T>(
    key: string,
    requestFn: () => Promise<T>,
    priority: number = 0
  ): Promise<T> {
    // Check if request is already queued or in progress
    const existingRequest = this.inFlight.get(key);
    if (existingRequest) {
      return existingRequest as Promise<T>;
    }

    const pending = new Promise<T>((resolve, reject) => {
      this.requestQueue.push({ key, requestFn, priority, resolve, reject });
      this.requestQueue.sort((a, b) => b.priority - a.priority);
    });
    this.inFlight.set(key, pending);

    void this.processQueue();
    return pending;
  }

  /**
   * Starts queued requests while capacity allows. Only one drain loop runs at
   * a time; each completed request re-triggers it, so no polling is needed.
   */
  private async processQueue(): Promise<void> {
    if (this.processing) {
      return;
    }

    this.processing = true;
    try {
      while (this.requestQueue.length > 0 && this.activeRequests.size < this.maxConcurrent) {
        const request = this.requestQueue.shift();
        if (!request) {
          break;
        }
        this.executeRequest(request);

        // Add delay between requests
        if (this.requestQueue.length > 0 && this.requestDelay > 0) {
          await new Promise<void>(resolve => setTimeout(resolve, this.requestDelay));
        }
      }
    } finally {
      this.processing = false;
    }
  }

  /**
   * Starts one request and forwards its outcome to the queued caller.
   *
   * Bookkeeping is cleared before the caller's promise settles so that
   * `getStats()` is accurate by the time the caller resumes. The internal
   * chain never rejects, so a failed request cannot surface as an unhandled
   * rejection.
   */
  private executeRequest(request: QueuedRequest): void {
    const { key, requestFn, resolve, reject } = request;

    const release = (): void => {
      this.activeRequests.delete(key);
      this.inFlight.delete(key);
    };

    this.activeRequests.add(key);

    let started: Promise<unknown>;
    try {
      started = Promise.resolve(requestFn());
    } catch (error) {
      // A synchronous throw is treated like a rejected request.
      started = Promise.reject(error);
    }

    void started
      .then(
        result => {
          release();
          resolve(result);
        },
        error => {
          release();
          reject(error);
        }
      )
      .then(() => this.processQueue());
  }

  /** Current number of executing and queued requests, plus the concurrency cap. */
  getStats() {
    return {
      activeRequests: this.activeRequests.size,
      queuedRequests: this.requestQueue.length,
      maxConcurrent: this.maxConcurrent
    };
  }
}
/** One pending item in the `RequestPool` queue. */
interface QueuedRequest {
  /** Identifier used to de-duplicate and track the request. */
  key: string;
  /** Starts the actual work when the pool picks this item. */
  requestFn: () => Promise<any>;
  /** Sort weight; the queue is ordered by descending priority. */
  priority: number;
  /** Settles the caller's promise with the request's result. */
  resolve: (value: any) => void;
  /** Settles the caller's promise with the request's error. */
  reject: (error: any) => void;
}
/** Shared pool: at most 10 concurrent requests, 500 ms between successive starts. */
export const globalRequestPool = new RequestPool(10, 500);

// Security Best Practices
Input Validation and Sanitization
import validator from 'validator';
/**
 * Input checks for URLs, extraction prompts, API keys and rate-limit keys.
 */
export class SecurityValidator {
  /** Patterns stripped from extraction prompts; `[\s\S]` lets tag bodies span lines. */
  private static readonly PROMPT_STRIP_PATTERNS: readonly RegExp[] = [
    /<script[^>]*>[\s\S]*?<\/script>/gi,
    /<iframe[^>]*>[\s\S]*?<\/iframe>/gi,
    /javascript:/gi,
    // \b keeps ordinary words such as "configuration=" intact.
    /\bon\w+\s*=/gi
  ];

  /**
   * Validates a user-supplied URL for scraping.
   *
   * The URL must be a non-empty string, pass `validator.isURL` with an
   * http/https scheme, and must not point at a loopback, private or
   * link-local host. The host check inspects the parsed hostname only, so
   * digits or words in the path or query cannot cause a rejection.
   *
   * NOTE: this validates the literal host only; a public name that resolves
   * to an internal address is not detected here.
   */
  static validateUrl(url: string): { valid: boolean; error?: string } {
    if (!url || typeof url !== 'string') {
      return { valid: false, error: 'URL is required and must be a string' };
    }

    if (!validator.isURL(url, { protocols: ['http', 'https'], require_protocol: true })) {
      return { valid: false, error: 'Invalid URL format' };
    }

    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      return { valid: false, error: 'Invalid URL format' };
    }

    // Check for suspicious targets
    const schemeAllowed = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    if (!schemeAllowed || SecurityValidator.isInternalHost(parsed.hostname)) {
      return { valid: false, error: 'URL not allowed for security reasons' };
    }

    return { valid: true };
  }

  /**
   * True for `localhost` names and for IP literals in loopback, private,
   * link-local or unspecified ranges (IPv4, IPv6 and IPv4-mapped IPv6).
   */
  private static isInternalHost(hostname: string): boolean {
    const host = hostname
      .toLowerCase()
      .replace(/^\[|\]$/g, '') // URL.hostname wraps IPv6 literals in brackets
      .replace(/\.$/, '');

    if (host === 'localhost' || host.endsWith('.localhost')) {
      return true;
    }

    const ipv4 = SecurityValidator.parseIPv4(host);
    if (ipv4) {
      return SecurityValidator.isInternalIPv4(ipv4);
    }

    if (!host.includes(':')) {
      return false;
    }

    if (host === '::' || host === '::1') {
      return true;
    }

    // IPv4-mapped IPv6, hex form (::ffff:7f00:1) or dotted form (::ffff:127.0.0.1).
    const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
    if (mappedHex) {
      const [, highHex = '0', lowHex = '0'] = mappedHex;
      const high = parseInt(highHex, 16);
      const low = parseInt(lowHex, 16);
      return SecurityValidator.isInternalIPv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
    }
    const mappedDotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/.exec(host);
    if (mappedDotted) {
      const embedded = SecurityValidator.parseIPv4(mappedDotted[1] ?? '');
      return embedded ? SecurityValidator.isInternalIPv4(embedded) : true;
    }

    // fc00::/7 (unique local) and fe80::/10 (link-local)
    return /^f[cd]/.test(host) || /^fe[89ab]/.test(host);
  }

  /** Parses a dotted-quad IPv4 literal into its four octets, or returns undefined. */
  private static parseIPv4(host: string): number[] | undefined {
    const match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
    if (!match) {
      return undefined;
    }
    const octets = match.slice(1).map(Number);
    return octets.every(octet => octet <= 255) ? octets : undefined;
  }

  /** Range test for 0/8, 10/8, 127/8, 169.254/16, 172.16/12 and 192.168/16. */
  private static isInternalIPv4(octets: readonly number[]): boolean {
    const [a = -1, b = -1] = octets;
    return (
      a === 0 ||
      a === 10 ||
      a === 127 ||
      (a === 169 && b === 254) ||
      (a === 172 && b >= 16 && b <= 31) ||
      (a === 192 && b === 168)
    );
  }

  /**
   * Strips script/iframe blocks, `javascript:` and inline `on…=` handlers
   * from a prompt, then trims and truncates it to 2000 characters.
   *
   * Stripping repeats until the text stops changing, so fragments that only
   * form a blocked pattern after an earlier removal are caught as well.
   */
  static sanitizeExtractionPrompt(prompt: string): string {
    if (!prompt || typeof prompt !== 'string') {
      return '';
    }

    // Remove potentially dangerous content
    let cleaned = prompt;
    let previous: string;
    do {
      previous = cleaned;
      for (const pattern of SecurityValidator.PROMPT_STRIP_PATTERNS) {
        cleaned = cleaned.replace(pattern, '');
      }
    } while (cleaned !== previous);

    // Limit length
    return cleaned.trim().substring(0, 2000);
  }

  /** Basic format check: at least 20 characters drawn from letters, digits, `_` and `-`. */
  static validateApiKey(apiKey: string): boolean {
    if (!apiKey || typeof apiKey !== 'string') {
      return false;
    }

    // Basic format validation
    return /^[A-Za-z0-9_-]{20,}$/.test(apiKey);
  }

  /**
   * Builds a rate-limit bucket key of the form `ip:userAgent[0..50]:apiKey[0..10]`.
   *
   * Missing request fields degrade to `'unknown'` (ip) or `''` instead of throwing.
   *
   * NOTE(review): the key embeds the first 10 characters of the bearer token;
   * consider hashing it if these keys are logged or stored.
   */
  static rateLimitKey(req: any): string {
    // Use multiple identifiers for rate limiting
    const ip =
      req?.ip || req?.socket?.remoteAddress || req?.connection?.remoteAddress || 'unknown';
    const rawUserAgent = req?.headers?.['user-agent'];
    const userAgent = typeof rawUserAgent === 'string' ? rawUserAgent : '';
    const rawAuth = req?.headers?.authorization;
    const apiKey = typeof rawAuth === 'string' ? rawAuth.replace(/^Bearer\s+/i, '') : '';

    return `${ip}:${userAgent.substring(0, 50)}:${apiKey.substring(0, 10)}`;
  }
}

// Content Security and Filtering
/**
 * Domain blocklisting plus keyword redaction and simple personal-data detection
 * for scraped content.
 */
export class ContentFilter {
  private static readonly BLOCKED_DOMAINS = new Set([
    'facebook.com',
    'instagram.com',
    'twitter.com',
    'linkedin.com',
    'tiktok.com'
  ]);

  // Global flag so every occurrence is redacted, not just the first.
  private static readonly SENSITIVE_PATTERNS = [
    /password/gi,
    /credit.?card/gi,
    /ssn|social.?security/gi,
    /api.?key/gi,
    /token/gi,
    /secret/gi
  ];

  /**
   * Returns false when the URL cannot be parsed, or when its host is a
   * blocked domain or any subdomain of one (e.g. `www.facebook.com`).
   */
  static isAllowedDomain(url: string): boolean {
    let host: string;
    try {
      host = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
    } catch {
      return false;
    }

    for (const blocked of this.BLOCKED_DOMAINS) {
      if (host === blocked || host.endsWith(`.${blocked}`)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Replaces every occurrence of a sensitive keyword with `[REDACTED]`.
   * Only the keyword itself is replaced; surrounding text is left as is.
   */
  static filterSensitiveContent(content: string): string {
    return this.SENSITIVE_PATTERNS.reduce(
      (text, pattern) => text.replace(pattern, '[REDACTED]'),
      content
    );
  }

  /**
   * Reports which kinds of personal data appear in `content`.
   *
   * @returns Any of `'email_addresses'`, `'phone_numbers'` and
   *          `'potential_credit_cards'`, in that order.
   */
  static detectPersonalInfo(content: string): string[] {
    const findings: string[] = [];

    // Email detection
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
    if (emailPattern.test(content)) {
      findings.push('email_addresses');
    }

    // Phone number detection
    const phonePattern = /(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/;
    if (phonePattern.test(content)) {
      findings.push('phone_numbers');
    }

    // Credit card detection (basic)
    const ccPattern = /\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b/;
    if (ccPattern.test(content)) {
      findings.push('potential_credit_cards');
    }

    return findings;
  }
}

// Monitoring and Observability
Comprehensive Metrics Collection
/** Flat summary of scraping activity, as produced by `MetricsCollector.getScrapingMetrics()`. */
export interface ScrapingMetrics {
  /** Value of the `requests` counter. */
  requestCount: number;
  /** Value of the `successes` counter. */
  successCount: number;
  /** Value of the `errors` counter. */
  errorCount: number;
  /** Mean of the recorded `response_time_duration` values. */
  averageResponseTime: number;
  /** Total recorded under `credits_used`. */
  totalCreditsUsed: number;
  /** Value of the `active_jobs` metric. */
  activeJobs: number;
  /** Value of the `queue_size` metric. */
  queueSize: number;
}
/** Shape of one stored metric; counters only ever populate `count` and `timestamp`. */
interface MetricEntry {
  count: number;
  sum?: number;
  min?: number;
  max?: number;
  avg?: number;
  last?: number;
  timestamp?: number;
}

/**
 * In-memory store for counters, value statistics and timers.
 */
export class MetricsCollector {
  private metrics: Map<string, MetricEntry> = new Map();
  private timers: Map<string, number> = new Map();

  /** Records the current time as the start of timer `key`. */
  startTimer(key: string): void {
    this.timers.set(key, Date.now());
  }

  /**
   * Stops timer `key` and records the elapsed time under `${key}_duration`.
   *
   * @returns Elapsed milliseconds, or 0 when the timer was never started.
   */
  endTimer(key: string): number {
    const startTime = this.timers.get(key);
    // Compare with undefined so a start time of 0 still counts as started.
    if (startTime === undefined) return 0;

    const duration = Date.now() - startTime;
    this.timers.delete(key);

    this.recordMetric(`${key}_duration`, duration);
    return duration;
  }

  /**
   * Adds one observation to `key`, updating count, sum, min, max, avg and last.
   * Works on a key previously used as a plain counter without producing NaN.
   */
  recordMetric(key: string, value: number): void {
    const previous = this.metrics.get(key) ?? { count: 0 };
    const count = previous.count + 1;
    const sum = (previous.sum ?? 0) + value;

    this.metrics.set(key, {
      ...previous,
      count,
      sum,
      min: Math.min(previous.min ?? Infinity, value),
      max: Math.max(previous.max ?? -Infinity, value),
      avg: sum / count,
      last: value,
      timestamp: Date.now()
    });
  }

  /** Adds `value` (default 1) to the `count` of `key`. */
  incrementCounter(key: string, value: number = 1): void {
    const previous = this.metrics.get(key) ?? { count: 0 };
    this.metrics.set(key, {
      ...previous,
      count: previous.count + value,
      timestamp: Date.now()
    });
  }

  /** Returns a copy of the stored entry for `key`, or `undefined` if none exists. */
  getMetric(key: string): any {
    const entry = this.metrics.get(key);
    return entry ? { ...entry } : undefined;
  }

  /** Returns a snapshot of every metric; mutating it does not affect the collector. */
  getAllMetrics(): Record<string, any> {
    const result: Record<string, any> = {};
    for (const [key, value] of this.metrics.entries()) {
      result[key] = { ...value };
    }
    return result;
  }

  /**
   * Summarises the well-known scraping metrics. Every field is a number:
   * missing metrics read as 0, and `credits_used` is taken from `sum` when it
   * was recorded via `recordMetric`, otherwise from the counter's `count`.
   */
  getScrapingMetrics(): ScrapingMetrics {
    const countOf = (key: string): number => this.metrics.get(key)?.count ?? 0;
    const credits = this.metrics.get('credits_used');

    return {
      requestCount: countOf('requests'),
      successCount: countOf('successes'),
      errorCount: countOf('errors'),
      averageResponseTime: this.metrics.get('response_time_duration')?.avg ?? 0,
      totalCreditsUsed: credits?.sum ?? credits?.count ?? 0,
      activeJobs: countOf('active_jobs'),
      queueSize: countOf('queue_size')
    };
  }

  /** Discards all metrics and running timers. */
  reset(): void {
    this.metrics.clear();
    this.timers.clear();
  }
}
/** Process-wide metrics collector shared by the application. */
export const globalMetrics = new MetricsCollector();

// Health Check System
/** Aggregate health report returned by `HealthChecker.checkHealth()`. */
export interface HealthStatus {
  /** Overall verdict derived from the individual service results. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** ISO-8601 time at which the report was assembled. */
  timestamp: string;
  /** Result of each registered service check, keyed by service name. */
  services: Record<string, ServiceHealth>;
  /** Process-level figures captured alongside the service results. */
  metrics: {
    /** `process.uptime()` value. */
    uptime: number;
    /** Heap in use, in megabytes. */
    memoryUsage: number;
    /** User CPU time consumed by the process, in seconds. */
    cpuUsage: number;
  };
}
/** Outcome of a single service health check. */
export interface ServiceHealth {
  /** Availability verdict for the service. */
  status: 'up' | 'down' | 'degraded';
  /** How long the check took, in milliseconds, when measured. */
  responseTime?: number;
  /** Failure description, when the check did not succeed. */
  error?: string;
  /** ISO-8601 time at which the check ran. */
  lastCheck: string;
}
/**
 * Registry of named service checks that can be run together to produce an
 * aggregate `HealthStatus`.
 */
export class HealthChecker {
  private services: Map<string, () => Promise<ServiceHealth>> = new Map();

  /** Registers (or replaces) the check for service `name`. */
  registerService(name: string, checker: () => Promise<ServiceHealth>): void {
    this.services.set(name, checker);
  }

  /**
   * Runs every registered check and summarises the results.
   *
   * Checks are started together and awaited in parallel, so the total time
   * is bounded by the slowest check rather than the sum of all of them.
   * A check that throws or rejects is reported as `'down'`.
   *
   * Overall status: `'unhealthy'` if any service is down, otherwise
   * `'degraded'` if any is degraded, otherwise `'healthy'`.
   */
  async checkHealth(): Promise<HealthStatus> {
    // Check all registered services
    const outcomes = await Promise.all(
      Array.from(
        this.services,
        async ([name, checker]): Promise<[string, ServiceHealth]> => {
          try {
            return [name, await checker()];
          } catch (error) {
            return [
              name,
              {
                status: 'down',
                error: error instanceof Error ? error.message : 'Unknown error',
                lastCheck: new Date().toISOString()
              }
            ];
          }
        }
      )
    );

    const serviceResults: Record<string, ServiceHealth> = {};
    let overallStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';

    for (const [name, health] of outcomes) {
      serviceResults[name] = health;

      if (health.status === 'down') {
        overallStatus = 'unhealthy';
      } else if (health.status === 'degraded' && overallStatus === 'healthy') {
        overallStatus = 'degraded';
      }
    }

    return {
      status: overallStatus,
      timestamp: new Date().toISOString(),
      services: serviceResults,
      metrics: {
        uptime: process.uptime(),
        memoryUsage: process.memoryUsage().heapUsed / 1024 / 1024, // MB
        cpuUsage: process.cpuUsage().user / 1000000 // Convert microseconds to seconds
      }
    };
  }
}
// Health check implementations
/** Shared health checker that the service checks below register with. */
export const healthChecker = new HealthChecker();
// Register FireCrawl service check
// Reports 'up' when the scrape succeeds, 'degraded' when it completes without
// success, and 'down' (with the error message) when the call throws.
// NOTE(review): `firecrawlApp` is not defined in this snippet; presumably it is
// the FireCrawl client created elsewhere in the application — confirm.
healthChecker.registerService('firecrawl', async (): Promise<ServiceHealth> => {
  const startTime = Date.now();

  try {
    // Simple ping to FireCrawl API
    const result = await firecrawlApp.scrapeUrl('https://httpbin.org/status/200', {
      formats: ['markdown'],
      timeout: 5000
    });

    return {
      status: result.success ? 'up' : 'degraded',
      responseTime: Date.now() - startTime,
      lastCheck: new Date().toISOString()
    };
  } catch (error) {
    return {
      status: 'down',
      error: error instanceof Error ? error.message : 'Unknown error',
      responseTime: Date.now() - startTime,
      lastCheck: new Date().toISOString()
    };
  }
});
// Register database check
// The query below is left commented out as a placeholder, so as written this
// check always reports 'up'; enable a real query to make it meaningful.
healthChecker.registerService('database', async (): Promise<ServiceHealth> => {
  const startTime = Date.now();

  try {
    // Simple database query
    // await db.query('SELECT 1');

    return {
      status: 'up',
      responseTime: Date.now() - startTime,
      lastCheck: new Date().toISOString()
    };
  } catch (error) {
    return {
      status: 'down',
      error: error instanceof Error ? error.message : 'Database connection failed',
      responseTime: Date.now() - startTime,
      lastCheck: new Date().toISOString()
    };
  }
});

// Production Deployment Checklist
Essential Configuration
/**
 * Baseline production settings, grouped by concern.
 * All durations are in milliseconds.
 */
export const PRODUCTION_CONFIG = {
  // API Configuration
  api: {
    timeout: 30000,
    retryAttempts: 3,
    rateLimitPerHour: 1000,
    maxConcurrentRequests: 10
  },

  // Cache Configuration
  cache: {
    ttl: 3600000, // 1 hour
    maxSize: 1000,
    evictionPolicy: 'LRU' as const
  },

  // Security Configuration
  security: {
    maxUrlLength: 2048,
    maxPromptLength: 2000,
    // Typed explicitly: a bare [] would be inferred as never[] and reject any entry.
    allowedDomains: [] as string[], // Empty = all allowed
    blockedDomains: ['localhost', '127.0.0.1'],
    requireHttps: true
  },

  // Monitoring Configuration
  monitoring: {
    metricsRetention: 86400000, // 24 hours
    healthCheckInterval: 30000, // 30 seconds
    alertThresholds: {
      errorRate: 0.05, // 5%
      responseTime: 10000, // 10 seconds
      queueSize: 100
    }
  }
};

// Key Takeaways
- Implement comprehensive error handling with proper classification and retry strategies
- Use intelligent caching and request pooling for optimal performance
- Apply security best practices including input validation and content filtering
- Monitor your application with detailed metrics and health checks
- Follow production deployment best practices for reliability and scalability
Series Conclusion
Congratulations! You’ve completed the comprehensive “Web Scraping with FireCrawl and Vercel SDK” series. You now have the knowledge and tools to build production-ready web scraping applications that are:
- Scalable: Handle high volumes with proper architecture
- Reliable: Robust error handling and monitoring
- Secure: Protected against common vulnerabilities
- Maintainable: Well-structured and documented code
- Cost-effective: Optimized for performance and resource usage
Continue building amazing scraping solutions and remember to always respect website terms of service and robots.txt files!