Building a Production Web Scraping Application
In this comprehensive part, we’ll build a complete production-ready web scraping application called “ScrapeMaster” that combines all the techniques we’ve learned. This will be a full-stack application with user management, job queuing, real-time updates, and enterprise features.
Application Architecture Overview
System Components
ScrapeMaster Architecture:

┌─────────────────┐      ┌─────────────────┐      ┌─────────────────┐
│    Frontend     │      │   API Gateway   │      │    Job Queue    │
│    (Next.js)    │◄──► │    (Vercel)     │◄──► │     (Redis)     │
└─────────────────┘      └─────────────────┘      └─────────────────┘
                                │                         │
                                ▼                         ▼
┌─────────────────┐      ┌─────────────────┐      ┌─────────────────┐
│    Database     │◄──► │  Auth Service   │      │     Workers     │
│  (PostgreSQL)   │      │   (NextAuth)    │      │    (Vercel)     │
└─────────────────┘      └─────────────────┘      └─────────────────┘
                                │
                                ▼
                         ┌─────────────────┐
                         │   Monitoring    │
                         │    (Vercel)     │
                         └─────────────────┘

Core Features
- User Authentication & Management
- Job Queue System with Real-time Updates
- Subscription & Billing Management
- Advanced Analytics Dashboard
- API Rate Limiting & Usage Tracking
- Webhook Support for Job Completion
- Data Export & Visualization
Database Schema Design
export interface User { id: string; email: string; name: string; avatar?: string; plan: 'free' | 'pro' | 'enterprise'; apiKey: string; createdAt: Date; updatedAt: Date; subscription?: Subscription;}
export interface Subscription { id: string; userId: string; plan: string; status: 'active' | 'canceled' | 'past_due'; currentPeriodStart: Date; currentPeriodEnd: Date; cancelAtPeriodEnd: boolean; stripeCustomerId: string; stripeSubscriptionId: string;}
export interface ScrapingJob { id: string; userId: string; name: string; url: string; status: 'pending' | 'running' | 'completed' | 'failed' | 'canceled'; jobType: 'single' | 'crawl' | 'monitor'; config: ScrapingConfig; results?: ScrapingResult[]; error?: string; progress: number; creditsUsed: number; webhook?: string; scheduledFor?: Date; createdAt: Date; updatedAt: Date; completedAt?: Date;}
export interface ScrapingConfig { extractionPrompt?: string; maxPages?: number; includePatterns?: string[]; excludePatterns?: string[]; formats: ('markdown' | 'html' | 'structured')[]; options: { onlyMainContent: boolean; includeTags?: string[]; excludeTags?: string[]; timeout: number; };}
export interface ScrapingResult { id: string; jobId: string; url: string; title: string; content: string; metadata: Record<string, any>; extractedData?: Record<string, any>; scrapedAt: Date;}
export interface UsageLog { id: string; userId: string; jobId: string; action: 'scrape' | 'crawl' | 'extract'; creditsUsed: number; responseTime: number; success: boolean; timestamp: Date;}Job Queue System
Redis-based Queue Implementation
import Redis from 'ioredis';import { ScrapingJob, ScrapingConfig } from '../database/schema';
export interface QueueJob { id: string; type: 'scrape' | 'crawl' | 'monitor'; data: { jobId: string; userId: string; url: string; config: ScrapingConfig; webhook?: string; }; priority: number; attempts: number; maxAttempts: number; delay?: number;}
export class JobQueue { private redis: Redis; private queueName = 'scraping-jobs';
constructor() { this.redis = new Redis(process.env.REDIS_URL!); }
async addJob(job: Omit<QueueJob, 'id' | 'attempts'>): Promise<string> { const jobId = `job_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`; const queueJob: QueueJob = { ...job, id: jobId, attempts: 0 };
const score = job.priority || Date.now(); await this.redis.zadd(this.queueName, score, JSON.stringify(queueJob));
console.log(`📋 Job added to queue: ${jobId}`); return jobId; }
async getNextJob(): Promise<QueueJob | null> { const result = await this.redis.zpopmin(this.queueName); if (!result || result.length === 0) return null;
try { const job: QueueJob = JSON.parse(result[1]); return job; } catch (error) { console.error('Failed to parse job from queue:', error); return null; } }
async requeueJob(job: QueueJob, delay: number = 0): Promise<void> { job.attempts++; const score = Date.now() + delay; await this.redis.zadd(this.queueName, score, JSON.stringify(job)); }
async getQueueSize(): Promise<number> { return await this.redis.zcard(this.queueName); }
async getJobsByUser(userId: string): Promise<QueueJob[]> { const jobs = await this.redis.zrange(this.queueName, 0, -1); return jobs .map(job => { try { return JSON.parse(job) as QueueJob; } catch { return null; } }) .filter((job): job is QueueJob => job !== null && job.data.userId === userId); }}
export const jobQueue = new JobQueue();Job Worker Implementation
import { VercelRequest, VercelResponse } from '@vercel/node';import { jobQueue, QueueJob } from '../../lib/queue/job-queue';import { firecrawlApp } from '../../lib/firecrawl';import { updateJobStatus, saveJobResults } from '../../lib/database/jobs';import { sendWebhook } from '../../lib/webhooks';
export default async function handler( req: VercelRequest, res: VercelResponse): Promise<VercelResponse> { if (req.method !== 'POST') { return res.status(405).json({ error: 'Method not allowed' }); }
try { const job = await jobQueue.getNextJob(); if (!job) { return res.status(200).json({ message: 'No jobs in queue' }); }
console.log(`🔄 Processing job: ${job.id}`); await updateJobStatus(job.data.jobId, 'running', { progress: 0 });
const result = await processJob(job);
if (result.success) { await updateJobStatus(job.data.jobId, 'completed', { progress: 100, results: result.data });
if (job.data.webhook) { await sendWebhook(job.data.webhook, { jobId: job.data.jobId, status: 'completed', results: result.data }); } } else { if (job.attempts < job.maxAttempts) { await jobQueue.requeueJob(job, 5000); // Retry after 5 seconds await updateJobStatus(job.data.jobId, 'pending', { progress: 0, error: result.error }); } else { await updateJobStatus(job.data.jobId, 'failed', { error: result.error }); } }
return res.status(200).json({ jobId: job.id, status: result.success ? 'completed' : 'failed' });
} catch (error) { console.error('Worker error:', error); return res.status(500).json({ error: 'Worker failed' }); }}
async function processJob(job: QueueJob): Promise<{ success: boolean; data?: any; error?: string }> { try { const { url, config } = job.data;
if (job.type === 'scrape') { const result = await firecrawlApp.scrapeUrl(url, { formats: config.formats, onlyMainContent: config.options.onlyMainContent, includeTags: config.options.includeTags, excludeTags: config.options.excludeTags, timeout: config.options.timeout, ...(config.extractionPrompt && { extractorOptions: { mode: 'llm-extraction' as const, extractionPrompt: config.extractionPrompt } }) });
if (result.success) { await saveJobResults(job.data.jobId, [{ url, title: result.data.metadata?.title || 'No title', content: result.data.markdown || result.data.html || '', metadata: result.data.metadata || {}, extractedData: result.data.extract, scrapedAt: new Date() }]);
return { success: true, data: result.data }; } else { return { success: false, error: result.error }; }
} else if (job.type === 'crawl') { const result = await firecrawlApp.crawlUrl(url, { limit: config.maxPages || 10, scrapeOptions: { formats: config.formats, onlyMainContent: config.options.onlyMainContent, ...(config.extractionPrompt && { extractorOptions: { mode: 'llm-extraction' as const, extractionPrompt: config.extractionPrompt } }) }, includePaths: config.includePatterns, excludePaths: config.excludePatterns });
if (result.success) { const results = result.data.map((page: any) => ({ url: page.metadata?.sourceURL || url, title: page.metadata?.title || 'No title', content: page.markdown || page.html || '', metadata: page.metadata || {}, extractedData: page.extract, scrapedAt: new Date() }));
await saveJobResults(job.data.jobId, results); return { success: true, data: results }; } else { return { success: false, error: result.error }; } }
return { success: false, error: 'Unknown job type' };
} catch (error) { return { success: false, error: error instanceof Error ? error.message : 'Unknown error' }; }}Real-time Updates with WebSockets
WebSocket Server Implementation
import { Server as SocketIOServer } from 'socket.io';import { validateApiKey } from '../auth';
export class WebSocketManager { private io: SocketIOServer; private userSockets = new Map<string, string[]>(); // userId -> socketIds
constructor(server: any) { this.io = new SocketIOServer(server, { cors: { origin: process.env.FRONTEND_URL, methods: ['GET', 'POST'] } });
this.setupEventHandlers(); }
private setupEventHandlers() { this.io.on('connection', async (socket) => { console.log(`🔌 Client connected: ${socket.id}`);
// Authenticate user const token = socket.handshake.auth.token; const user = await validateApiKey(token);
if (!user) { socket.emit('error', { message: 'Authentication failed' }); socket.disconnect(); return; }
// Track user socket const userSockets = this.userSockets.get(user.id) || []; userSockets.push(socket.id); this.userSockets.set(user.id, userSockets);
// Join user-specific room socket.join(`user:${user.id}`);
socket.on('disconnect', () => { console.log(`🔌 Client disconnected: ${socket.id}`); this.removeUserSocket(user.id, socket.id); });
socket.on('subscribe:job', (jobId: string) => { socket.join(`job:${jobId}`); });
socket.on('unsubscribe:job', (jobId: string) => { socket.leave(`job:${jobId}`); }); }); }
emitJobUpdate(jobId: string, update: any) { this.io.to(`job:${jobId}`).emit('job:update', { jobId, ...update }); }
emitUserNotification(userId: string, notification: any) { this.io.to(`user:${userId}`).emit('notification', notification); }
private removeUserSocket(userId: string, socketId: string) { const userSockets = this.userSockets.get(userId) || []; const filtered = userSockets.filter(id => id !== socketId);
if (filtered.length === 0) { this.userSockets.delete(userId); } else { this.userSockets.set(userId, filtered); } }
getConnectedUsers(): number { return this.userSockets.size; }}React Hook for Real-time Updates
import { useEffect, useState } from 'react';import { io, Socket } from 'socket.io-client';
interface JobUpdate { status: string; progress: number; results?: any[]; error?: string;}
export function useRealtimeJob(jobId: string, apiKey: string) { const [socket, setSocket] = useState<Socket | null>(null); const [jobUpdate, setJobUpdate] = useState<JobUpdate | null>(null); const [connected, setConnected] = useState(false);
useEffect(() => { const newSocket = io(process.env.NEXT_PUBLIC_WS_URL!, { auth: { token: apiKey } });
newSocket.on('connect', () => { setConnected(true); newSocket.emit('subscribe:job', jobId); });
newSocket.on('disconnect', () => { setConnected(false); });
newSocket.on('job:update', (update: JobUpdate) => { setJobUpdate(update); });
newSocket.on('error', (error) => { console.error('WebSocket error:', error); });
setSocket(newSocket);
return () => { newSocket.emit('unsubscribe:job', jobId); newSocket.disconnect(); }; }, [jobId, apiKey]);
return { jobUpdate, connected };}Advanced Dashboard Components
Job Management Interface
import { useState, useEffect } from 'react';import { useRealtimeJob } from '../hooks/useRealtimeJob';
interface Job { id: string; name: string; url: string; status: string; progress: number; createdAt: string; results?: any[];}
export function JobManager({ apiKey }: { apiKey: string }) { const [jobs, setJobs] = useState<Job[]>([]); const [selectedJob, setSelectedJob] = useState<string | null>(null);
useEffect(() => { fetchJobs(); }, []);
const fetchJobs = async () => { try { const response = await fetch('/api/jobs', { headers: { 'Authorization': `Bearer ${apiKey}` } }); const data = await response.json(); setJobs(data.jobs); } catch (error) { console.error('Failed to fetch jobs:', error); } };
const createJob = async (jobData: any) => { try { const response = await fetch('/api/jobs', { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}` }, body: JSON.stringify(jobData) });
if (response.ok) { fetchJobs(); // Refresh job list } } catch (error) { console.error('Failed to create job:', error); } };
return ( <div className="grid grid-cols-1 lg:grid-cols-3 gap-6"> {/* Job List */} <div className="lg:col-span-2"> <div className="bg-white rounded-lg shadow"> <div className="p-6 border-b"> <h2 className="text-lg font-semibold">Scraping Jobs</h2> <button onClick={() => setSelectedJob('new')} className="mt-2 bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700" > Create New Job </button> </div>
<div className="divide-y"> {jobs.map((job) => ( <JobItem key={job.id} job={job} apiKey={apiKey} onSelect={() => setSelectedJob(job.id)} selected={selectedJob === job.id} /> ))} </div> </div> </div>
{/* Job Details/Creation */} <div className="lg:col-span-1"> {selectedJob === 'new' ? ( <JobCreationForm onSubmit={createJob} onCancel={() => setSelectedJob(null)} /> ) : selectedJob ? ( <JobDetails jobId={selectedJob} apiKey={apiKey} /> ) : ( <div className="bg-gray-50 rounded-lg p-6 text-center text-gray-500"> Select a job to view details </div> )} </div> </div> );}
function JobItem({ job, apiKey, onSelect, selected }: { job: Job; apiKey: string; onSelect: () => void; selected: boolean;}) { const { jobUpdate } = useRealtimeJob(job.id, apiKey); const currentStatus = jobUpdate?.status || job.status; const currentProgress = jobUpdate?.progress || job.progress;
return ( <div className={`p-4 cursor-pointer hover:bg-gray-50 ${selected ? 'bg-blue-50 border-l-4 border-blue-500' : ''}`} onClick={onSelect} > <div className="flex items-center justify-between"> <div> <h3 className="font-medium">{job.name}</h3> <p className="text-sm text-gray-500">{job.url}</p> <p className="text-xs text-gray-400"> {new Date(job.createdAt).toLocaleString()} </p> </div> <div className="text-right"> <StatusBadge status={currentStatus} /> {currentStatus === 'running' && ( <div className="mt-2"> <div className="w-20 bg-gray-200 rounded-full h-2"> <div className="bg-blue-600 h-2 rounded-full transition-all" style={{ width: `${currentProgress}%` }} /> </div> <span className="text-xs text-gray-500">{currentProgress}%</span> </div> )} </div> </div> </div> );}
function StatusBadge({ status }: { status: string }) { const colors = { pending: 'bg-yellow-100 text-yellow-800', running: 'bg-blue-100 text-blue-800', completed: 'bg-green-100 text-green-800', failed: 'bg-red-100 text-red-800', canceled: 'bg-gray-100 text-gray-800' };
return ( <span className={`px-2 py-1 text-xs font-medium rounded-full ${colors[status as keyof typeof colors] || colors.pending}`}> {status} </span> );}Subscription & Billing Integration
Stripe Integration
import Stripe from 'stripe';
export const stripe = new Stripe(process.env.STRIPE_SECRET_KEY!, { apiVersion: '2023-10-16'});
export const PLANS = { free: { name: 'Free', price: 0, credits: 500, features: ['Basic scraping', 'Email support'] }, pro: { name: 'Pro', price: 29, credits: 10000, features: ['Advanced scraping', 'Priority support', 'Webhooks', 'API access'] }, enterprise: { name: 'Enterprise', price: 99, credits: 100000, features: ['Unlimited scraping', '24/7 support', 'Custom integrations', 'SLA'] }};
export async function createCheckoutSession( userId: string, plan: keyof typeof PLANS, successUrl: string, cancelUrl: string) { const session = await stripe.checkout.sessions.create({ customer_email: undefined, // Will be filled from user data payment_method_types: ['card'], line_items: [ { price_data: { currency: 'usd', product_data: { name: `ScrapeMaster ${PLANS[plan].name}`, description: `${PLANS[plan].credits} credits per month` }, unit_amount: PLANS[plan].price * 100, recurring: { interval: 'month' } }, quantity: 1 } ], mode: 'subscription', success_url: successUrl, cancel_url: cancelUrl, metadata: { userId, plan } });
return session;}Key Takeaways
- Production applications require robust architecture with proper separation of concerns
- Job queues enable scalable processing of scraping tasks
- Real-time updates improve user experience significantly
- Proper monitoring and error handling are essential for reliability
- Subscription management adds business value to scraping services
Next Steps
In Part 6, we’ll cover best practices, advanced error handling, and optimization techniques to ensure your application runs smoothly at scale.