Advanced Scraping Techniques and Data Processing
In this part, we’ll tackle complex scraping scenarios that go beyond basic page extraction. You’ll learn to handle dynamic content, pagination, and infinite scroll, and to build sophisticated data processing pipelines.
Handling Dynamic Content and JavaScript-Heavy Sites
Understanding Modern Web Applications
Modern websites often load content dynamically using JavaScript frameworks like React, Vue, or Angular. FireCrawl excels at handling these scenarios by fully rendering pages before extraction.
```typescript
import { firecrawlApp } from '../config/firecrawl';

export class DynamicContentScraper {
  async scrapeSPA(url: string, waitConditions?: {
    waitFor?: number;
    waitForSelector?: string;
    waitForFunction?: string;
  }) {
    const options = {
      formats: ['markdown'] as const,
      onlyMainContent: true,
      timeout: 60000, // Increased timeout for heavy JS sites
      ...(waitConditions && {
        waitFor: waitConditions.waitFor || 5000,
      }),
    };

    const result = await firecrawlApp.scrapeUrl(url, options);

    if (result.success) {
      return {
        content: result.data.markdown,
        title: result.data.metadata?.title,
        loadTime: result.data.metadata?.loadTime,
        screenshots: result.data.screenshot,
      };
    }

    return null;
  }

  async scrapeInfiniteScroll(url: string, maxScrolls: number = 5) {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract all content items from this page that loads content on scroll.
          Look for repeated patterns like posts, products, articles, etc.
          Return as JSON array with: title, description, url, imageUrl for each item.`,
      },
      // FireCrawl handles infinite scroll automatically
      timeout: 90000,
    });

    return result.success ? result.data.extract : [];
  }
}
```
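Here’s a minimal usage sketch of the scraper above. The dashboard URL, the import path, and the longer `waitFor` value are placeholder assumptions for illustration; adjust them to your own project layout and target site.

```typescript
import { DynamicContentScraper } from './scrapers/dynamic-content-scraper';

async function main() {
  const scraper = new DynamicContentScraper();

  // Hypothetical client-side app that needs extra time to hydrate before extraction
  const page = await scraper.scrapeSPA('https://example.com/dashboard', {
    waitFor: 8000,
  });

  if (page) {
    console.log(`Scraped "${page.title}" (${page.content?.length ?? 0} chars of markdown)`);
  } else {
    console.log('Scrape failed');
  }
}

main().catch(console.error);
```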
Advanced Pagination Handling

Multi-Page Data Collection
```typescript
import { firecrawlApp } from '../config/firecrawl';
import { delay } from '../utils/helpers';

export interface PaginatedResult<T> {
  data: T[];
  totalPages: number;
  currentPage: number;
  hasNextPage: boolean;
}

export class PaginationScraper {
  async scrapeAllPages<T>(
    baseUrl: string,
    extractionPrompt: string,
    options: {
      maxPages?: number;
      pageParam?: string;
      startPage?: number;
      delayBetweenPages?: number;
    } = {}
  ): Promise<T[]> {
    const {
      maxPages = 10,
      pageParam = 'page',
      startPage = 1,
      delayBetweenPages = 2000,
    } = options;

    const allData: T[] = [];
    let currentPage = startPage;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
      const pageUrl = this.buildPageUrl(baseUrl, pageParam, currentPage);
      console.log(`📄 Scraping page ${currentPage}: ${pageUrl}`);

      try {
        const result = await firecrawlApp.scrapeUrl(pageUrl, {
          formats: ['markdown'],
          extractorOptions: {
            mode: 'llm-extraction',
            extractionPrompt:
              extractionPrompt +
              ` Also determine if there's a next page available.
                Return: { items: [...], hasNextPage: boolean }`,
          },
        });

        if (result.success && result.data.extract) {
          const pageData = result.data.extract;

          if (Array.isArray(pageData.items)) {
            allData.push(...pageData.items);
          }

          hasNextPage = pageData.hasNextPage !== false;

          if (pageData.items?.length === 0) {
            hasNextPage = false;
          }
        } else {
          hasNextPage = false;
        }

        currentPage++;

        if (hasNextPage && delayBetweenPages > 0) {
          await delay(delayBetweenPages);
        }
      } catch (error) {
        console.error(`❌ Error scraping page ${currentPage}:`, error);
        hasNextPage = false;
      }
    }

    console.log(`✅ Scraped ${allData.length} items from ${currentPage - 1} pages`);
    return allData;
  }

  private buildPageUrl(baseUrl: string, pageParam: string, pageNumber: number): string {
    const url = new URL(baseUrl);
    url.searchParams.set(pageParam, pageNumber.toString());
    return url.toString();
  }

  async scrapeEcommerceCatalog(baseUrl: string, maxPages: number = 20) {
    return this.scrapeAllPages(
      baseUrl,
      `Extract all products from this e-commerce page.
       For each product: name, price, imageUrl, productUrl, rating, availability`,
      { maxPages }
    );
  }

  async scrapeJobListings(baseUrl: string, maxPages: number = 15) {
    return this.scrapeAllPages(
      baseUrl,
      `Extract all job listings from this page.
       For each job: title, company, location, salary, jobUrl, postedDate, jobType`,
      { maxPages }
    );
  }
}
```
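If you want to try one of the convenience methods, a quick sketch might look like the following. The catalog URL is a placeholder, and it assumes the target site paginates with a `?page=N` query parameter (the scraper's default).

```typescript
import { PaginationScraper } from './scrapers/pagination-scraper';

async function run() {
  const scraper = new PaginationScraper();

  // Placeholder catalog URL; keep maxPages small while testing to limit API usage
  const products = await scraper.scrapeEcommerceCatalog(
    'https://example-shop.com/categories/laptops',
    5
  );

  console.log(`Collected ${products.length} products`);
}

run().catch(console.error);
```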
Sophisticated Data Processing

Advanced Data Transformation Pipeline
```typescript
import { saveToFile } from '../utils/helpers';

export interface ProcessingStep<T, R> {
  name: string;
  process: (data: T[]) => Promise<R[]>;
}

export class DataPipeline<T> {
  private steps: ProcessingStep<any, any>[] = [];

  addStep<R>(step: ProcessingStep<T, R>): DataPipeline<R> {
    this.steps.push(step);
    return this as any;
  }

  async execute(initialData: T[]): Promise<any[]> {
    let currentData = initialData;

    for (const step of this.steps) {
      console.log(`🔄 Executing step: ${step.name}`);
      currentData = await step.process(currentData);
      console.log(`✅ Step completed. Items: ${currentData.length}`);
    }

    return currentData;
  }
}

// Data processing utilities
export class DataProcessor {
  static deduplication<T>(keyExtractor: (item: T) => string): ProcessingStep<T, T> {
    return {
      name: 'Deduplication',
      process: async (data: T[]) => {
        const seen = new Set<string>();
        return data.filter(item => {
          const key = keyExtractor(item);
          if (seen.has(key)) return false;
          seen.add(key);
          return true;
        });
      },
    };
  }

  static validation<T>(validator: (item: T) => boolean): ProcessingStep<T, T> {
    return {
      name: 'Validation',
      process: async (data: T[]) => {
        return data.filter(validator);
      },
    };
  }

  static enrichment<T, R>(enricher: (item: T) => Promise<R>): ProcessingStep<T, R> {
    return {
      name: 'Data Enrichment',
      process: async (data: T[]) => {
        const enriched: R[] = [];
        for (const item of data) {
          try {
            const enrichedItem = await enricher(item);
            enriched.push(enrichedItem);
          } catch (error) {
            console.error('Enrichment failed for item:', error);
          }
        }
        return enriched;
      },
    };
  }

  static transformation<T, R>(transformer: (item: T) => R): ProcessingStep<T, R> {
    return {
      name: 'Data Transformation',
      process: async (data: T[]) => {
        return data.map(transformer);
      },
    };
  }

  static aggregation<T, R>(aggregator: (data: T[]) => R[]): ProcessingStep<T, R> {
    return {
      name: 'Data Aggregation',
      process: async (data: T[]) => {
        return aggregator(data);
      },
    };
  }
}
```
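Before wiring the pipeline to real scraped data, it helps to verify the steps on a tiny in-memory dataset. The `Article` shape and the sample items below are invented purely for illustration.

```typescript
import { DataPipeline, DataProcessor } from './processors/data-pipeline';

interface Article {
  title: string;
  url: string;
}

async function demoPipeline() {
  // Tiny in-memory dataset so each step's effect is easy to see
  const scraped: Article[] = [
    { title: 'Intro to FireCrawl ', url: 'https://example.com/a' },
    { title: 'Intro to FireCrawl', url: 'https://example.com/a' }, // duplicate URL, removed by deduplication
    { title: '', url: 'https://example.com/b' },                   // empty title, removed by validation
  ];

  const pipeline = new DataPipeline<Article>()
    .addStep(DataProcessor.deduplication<Article>(a => a.url))
    .addStep(DataProcessor.validation<Article>(a => a.title.length > 0))
    .addStep(DataProcessor.transformation<Article, Article>(a => ({ ...a, title: a.title.trim() })));

  const clean = await pipeline.execute(scraped);
  console.log(clean); // [{ title: 'Intro to FireCrawl', url: 'https://example.com/a' }]
}

demoPipeline().catch(console.error);
```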
Real-World Advanced Examples

E-commerce Intelligence System
```typescript
import { PaginationScraper } from '../scrapers/pagination-scraper';
import { DataPipeline, DataProcessor } from '../processors/data-pipeline';
import { firecrawlApp } from '../config/firecrawl';

export interface Product {
  name: string;
  price: number;
  originalPrice?: number;
  discount?: number;
  rating: number;
  reviewCount: number;
  availability: string;
  category: string;
  brand: string;
  imageUrl: string;
  productUrl: string;
  features: string[];
  scrapedAt: string;
}

export interface EnrichedProduct extends Product {
  priceHistory?: number[];
  competitorPrices?: { site: string; price: number }[];
  sentiment?: 'positive' | 'negative' | 'neutral';
  popularityScore?: number;
}

export class EcommerceIntelligence {
  private paginationScraper = new PaginationScraper();

  async analyzeProductCategory(categoryUrl: string): Promise<EnrichedProduct[]> {
    // Step 1: Scrape all products from the category
    const rawProducts = await this.paginationScraper.scrapeAllPages<Product>(
      categoryUrl,
      `Extract all products with: name, price, rating, reviewCount, availability,
       brand, imageUrl, productUrl, features array`,
      { maxPages: 50 }
    );

    // Step 2: Process data through the pipeline
    // (arrow wrappers keep `this` bound when the methods run inside pipeline steps)
    const pipeline = new DataPipeline<Product>()
      .addStep(DataProcessor.deduplication<Product>(p => p.productUrl))
      .addStep(DataProcessor.validation<Product>(p => this.isValidProduct(p)))
      .addStep(DataProcessor.transformation<Product, Product>(p => this.normalizeProduct(p)))
      .addStep(DataProcessor.enrichment<Product, EnrichedProduct>(p => this.enrichProduct(p)));

    const processedProducts = await pipeline.execute(rawProducts);

    // Step 3: Generate insights
    const insights = this.generateCategoryInsights(processedProducts);
    console.log('📊 Category Analysis:', insights);

    return processedProducts;
  }

  private isValidProduct(product: Product): boolean {
    return !!(product.name && product.price && product.productUrl);
  }

  private normalizeProduct(product: Product): Product {
    // LLM extraction may return the price as a string like "$1,299.00"
    const rawPrice = product.price as unknown;
    const price =
      typeof rawPrice === 'string'
        ? parseFloat(rawPrice.replace(/[^0-9.]/g, ''))
        : product.price;

    return {
      ...product,
      name: product.name.trim(),
      price,
      rating: Math.min(5, Math.max(0, product.rating || 0)),
      scrapedAt: new Date().toISOString(),
    };
  }

  private async enrichProduct(product: Product): Promise<EnrichedProduct> {
    const enriched: EnrichedProduct = { ...product };

    // Add competitor price checking
    enriched.competitorPrices = await this.getCompetitorPrices(product.name);

    // Calculate popularity score
    enriched.popularityScore = this.calculatePopularityScore(product);

    // Analyze sentiment from reviews
    enriched.sentiment = await this.analyzeSentiment(product.productUrl);

    return enriched;
  }

  private async getCompetitorPrices(productName: string): Promise<{ site: string; price: number }[]> {
    const competitors = [
      'https://amazon.com/s?k=',
      'https://ebay.com/sch/',
    ];

    const prices: { site: string; price: number }[] = [];
    for (const baseUrl of competitors) {
      try {
        const searchUrl = baseUrl + encodeURIComponent(productName);
        const result = await firecrawlApp.scrapeUrl(searchUrl, {
          formats: ['markdown'],
          extractorOptions: {
            mode: 'llm-extraction',
            extractionPrompt: `Find the lowest price for "${productName}" on this page. Return just the numeric price.`,
          },
        });

        if (result.success && result.data.extract) {
          const price = parseFloat(result.data.extract.toString());
          if (!isNaN(price)) {
            prices.push({ site: new URL(baseUrl).hostname, price });
          }
        }
      } catch (error) {
        console.error(`Failed to get competitor price from ${baseUrl}:`, error);
      }
    }

    return prices;
  }

  private calculatePopularityScore(product: Product): number {
    const ratingWeight = 0.4;
    const reviewCountWeight = 0.6;

    const normalizedRating = product.rating / 5;
    const normalizedReviews = Math.min(product.reviewCount / 1000, 1);

    return (normalizedRating * ratingWeight + normalizedReviews * reviewCountWeight) * 100;
  }

  private async analyzeSentiment(productUrl: string): Promise<'positive' | 'negative' | 'neutral'> {
    try {
      const result = await firecrawlApp.scrapeUrl(productUrl, {
        formats: ['markdown'],
        extractorOptions: {
          mode: 'llm-extraction',
          extractionPrompt: `Analyze the overall sentiment of customer reviews on this product page.
            Consider ratings, review text, and overall tone.
            Return one word: "positive", "negative", or "neutral"`,
        },
      });

      const sentiment = result.data?.extract?.toString().toLowerCase();
      if (sentiment && ['positive', 'negative', 'neutral'].includes(sentiment)) {
        return sentiment as 'positive' | 'negative' | 'neutral';
      }
    } catch (error) {
      console.error('Sentiment analysis failed:', error);
    }

    return 'neutral';
  }

  private generateCategoryInsights(products: EnrichedProduct[]) {
    return {
      totalProducts: products.length,
      averagePrice: products.reduce((sum, p) => sum + p.price, 0) / products.length,
      averageRating: products.reduce((sum, p) => sum + p.rating, 0) / products.length,
      topBrands: this.getTopBrands(products),
      priceRanges: this.getPriceDistribution(products),
      availabilityStats: this.getAvailabilityStats(products),
    };
  }

  private getTopBrands(products: EnrichedProduct[]) {
    const brandCounts = products.reduce((acc, p) => {
      acc[p.brand] = (acc[p.brand] || 0) + 1;
      return acc;
    }, {} as Record<string, number>);

    return Object.entries(brandCounts)
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([brand, count]) => ({ brand, count }));
  }

  private getPriceDistribution(products: EnrichedProduct[]) {
    const ranges = [
      { min: 0, max: 50, label: '$0-50' },
      { min: 50, max: 100, label: '$50-100' },
      { min: 100, max: 200, label: '$100-200' },
      { min: 200, max: 500, label: '$200-500' },
      { min: 500, max: Infinity, label: '$500+' },
    ];

    return ranges.map(range => ({
      ...range,
      count: products.filter(p => p.price >= range.min && p.price < range.max).length,
    }));
  }

  private getAvailabilityStats(products: EnrichedProduct[]) {
    const stats = products.reduce((acc, p) => {
      acc[p.availability] = (acc[p.availability] || 0) + 1;
      return acc;
    }, {} as Record<string, number>);

    return Object.entries(stats).map(([status, count]) => ({ status, count }));
  }
}
```
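A possible entry point for the intelligence system, assuming a placeholder category URL. Keep in mind that a 50-page crawl with per-product enrichment is slow and consumes API credits, so start with a narrow category while experimenting.

```typescript
import { EcommerceIntelligence } from './analyzers/ecommerce-intelligence';

async function analyzeCategory() {
  const intel = new EcommerceIntelligence();

  // Placeholder category URL; the full scrape → clean → enrich pipeline runs behind this call
  const products = await intel.analyzeProductCategory(
    'https://example-shop.com/categories/headphones'
  );

  // Surface the five strongest products by the computed popularity score
  const top = [...products]
    .sort((a, b) => (b.popularityScore ?? 0) - (a.popularityScore ?? 0))
    .slice(0, 5);

  console.table(top.map(p => ({ name: p.name, price: p.price, score: p.popularityScore })));
}

analyzeCategory().catch(console.error);
```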
Advanced Content Analysis

Content Intelligence Engine
```typescript
import { firecrawlApp } from '../config/firecrawl';

export interface ContentAnalysis {
  readabilityScore: number;
  keywordDensity: Record<string, number>;
  topicCategories: string[];
  sentiment: 'positive' | 'negative' | 'neutral';
  contentStructure: {
    headingCount: number;
    paragraphCount: number;
    linkCount: number;
    imageCount: number;
  };
  seoMetrics: {
    titleLength: number;
    metaDescriptionLength: number;
    h1Count: number;
    altTextCoverage: number;
  };
}

export class ContentAnalyzer {
  async analyzeContent(url: string): Promise<ContentAnalysis | null> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown', 'html'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Analyze this content comprehensively:
          1. Readability score (1-100, higher is more readable)
          2. Top 10 keywords with frequency
          3. Main topic categories
          4. Overall sentiment
          5. Content structure (headings, paragraphs, links, images count)
          6. SEO metrics (title length, meta description length, H1 count)

          Return as structured JSON with all metrics.`,
      },
    });

    if (!result.success) return null;

    const analysis = result.data.extract;
    const markdown = result.data.markdown || '';
    const html = result.data.html || '';

    return {
      readabilityScore: analysis?.readabilityScore || this.calculateReadability(markdown),
      keywordDensity: analysis?.keywordDensity || this.extractKeywords(markdown),
      topicCategories: analysis?.topicCategories || [],
      sentiment: analysis?.sentiment || 'neutral',
      contentStructure: this.analyzeStructure(markdown),
      seoMetrics: this.analyzeSEO(html, result.data.metadata),
    };
  }

  private calculateReadability(text: string): number {
    const sentences = text.split(/[.!?]+/).length;
    const words = text.split(/\s+/).length;
    const syllables = this.countSyllables(text);

    // Simplified Flesch Reading Ease formula
    const score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words);
    return Math.max(0, Math.min(100, score));
  }

  private countSyllables(text: string): number {
    // Rough approximation: each run of consecutive vowels counts as one syllable
    const vowelGroups = text.toLowerCase().match(/[aeiouy]+/g);
    return vowelGroups ? vowelGroups.length : 1;
  }

  private extractKeywords(text: string): Record<string, number> {
    const words = text.toLowerCase()
      .replace(/[^\w\s]/g, '')
      .split(/\s+/)
      .filter(word => word.length > 3);

    const frequency: Record<string, number> = {};
    words.forEach(word => {
      frequency[word] = (frequency[word] || 0) + 1;
    });

    return Object.fromEntries(
      Object.entries(frequency)
        .sort(([, a], [, b]) => b - a)
        .slice(0, 10)
    );
  }

  private analyzeStructure(markdown: string) {
    return {
      headingCount: (markdown.match(/^#+\s/gm) || []).length,
      paragraphCount: markdown.split('\n\n').length,
      linkCount: (markdown.match(/\[.*?\]\(.*?\)/g) || []).length,
      imageCount: (markdown.match(/!\[.*?\]\(.*?\)/g) || []).length,
    };
  }

  private analyzeSEO(html: string, metadata: any) {
    return {
      titleLength: metadata?.title?.length || 0,
      metaDescriptionLength: metadata?.description?.length || 0,
      h1Count: (html.match(/<h1[^>]*>/gi) || []).length,
      altTextCoverage: this.calculateAltTextCoverage(html),
    };
  }

  private calculateAltTextCoverage(html: string): number {
    const images = html.match(/<img[^>]*>/gi) || [];
    const imagesWithAlt = images.filter(img => /alt\s*=\s*["'][^"']*["']/i.test(img));
    return images.length > 0 ? (imagesWithAlt.length / images.length) * 100 : 0;
  }
}
```
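To run an audit against a single page, something like this minimal sketch works. The blog URL and the import path are assumptions for illustration.

```typescript
import { ContentAnalyzer } from './analyzers/content-analyzer';

async function audit() {
  const analyzer = new ContentAnalyzer();

  // Placeholder URL for a page you want to audit
  const report = await analyzer.analyzeContent('https://example.com/blog/scraping-guide');

  if (report) {
    console.log(`Readability: ${report.readabilityScore.toFixed(1)}/100`);
    console.log(`H1 count: ${report.seoMetrics.h1Count}`);
    console.log(`Alt text coverage: ${report.seoMetrics.altTextCoverage.toFixed(0)}%`);
  }
}

audit().catch(console.error);
```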
Performance Optimization Strategies

Intelligent Caching System
```typescript
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import { firecrawlApp } from '../config/firecrawl';

export interface CacheEntry<T> {
  data: T;
  timestamp: number;
  ttl: number;
  url: string;
}

export class CacheManager {
  private cacheDir: string;

  constructor(cacheDir: string = './cache') {
    this.cacheDir = cacheDir;
  }

  async get<T>(key: string): Promise<T | null> {
    try {
      const filePath = this.getCacheFilePath(key);
      const content = await fs.readFile(filePath, 'utf-8');
      const entry: CacheEntry<T> = JSON.parse(content);

      if (Date.now() - entry.timestamp > entry.ttl) {
        await this.delete(key);
        return null;
      }

      return entry.data;
    } catch {
      return null;
    }
  }

  async set<T>(key: string, data: T, ttl: number = 3600000, url?: string): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });

    const entry: CacheEntry<T> = {
      data,
      timestamp: Date.now(),
      ttl,
      url: url || '',
    };

    const filePath = this.getCacheFilePath(key);
    await fs.writeFile(filePath, JSON.stringify(entry, null, 2));
  }

  async delete(key: string): Promise<void> {
    try {
      const filePath = this.getCacheFilePath(key);
      await fs.unlink(filePath);
    } catch {
      // File doesn't exist, ignore
    }
  }

  generateKey(url: string, options?: any): string {
    const combined = url + JSON.stringify(options || {});
    return crypto.createHash('md5').update(combined).digest('hex');
  }

  private getCacheFilePath(key: string): string {
    return path.join(this.cacheDir, `${key}.json`);
  }
}

// Enhanced scraper with caching
export class CachedScraper {
  private cache = new CacheManager();

  async scrapeWithCache(url: string, options: any = {}, ttl: number = 3600000) {
    const cacheKey = this.cache.generateKey(url, options);

    // Try the cache first
    const cached = await this.cache.get(cacheKey);
    if (cached) {
      console.log(`📦 Cache hit for: ${url}`);
      return cached;
    }

    // Scrape and cache
    console.log(`🔍 Cache miss, scraping: ${url}`);
    const result = await firecrawlApp.scrapeUrl(url, options);

    if (result.success) {
      await this.cache.set(cacheKey, result.data, ttl, url);
    }

    return result.success ? result.data : null;
  }
}
```
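A short sketch of the cache in action, using a placeholder URL. The first call misses the cache and scrapes; the second call with the same URL and options is served from the JSON file on disk.

```typescript
import { CachedScraper } from './scrapers/cached-scraper';

async function cachedDemo() {
  const scraper = new CachedScraper();
  const url = 'https://example.com/pricing';

  // First call hits the API and writes ./cache/<hash>.json with a 30-minute TTL
  await scraper.scrapeWithCache(url, { formats: ['markdown'] }, 30 * 60 * 1000);

  // Second call with identical options is answered from the cache
  const cached = await scraper.scrapeWithCache(url, { formats: ['markdown'] }, 30 * 60 * 1000);
  console.log(cached ? 'Got cached data' : 'Scrape failed');
}

cachedDemo().catch(console.error);
```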
Key Takeaways

- FireCrawl handles JavaScript-heavy sites and dynamic content automatically
- Use structured extraction prompts for complex data parsing
- Implement robust data processing pipelines for production systems
- Cache results to improve performance and reduce API costs
- Build comprehensive analysis tools for content intelligence
- Always validate and clean scraped data before processing
Next Steps
In Part 4, we’ll integrate our scraping capabilities with the Vercel SDK to build deployable applications:
- Setting up Vercel serverless functions
- Creating API endpoints for scraping services
- Building web interfaces for scraping tools
- Implementing authentication and rate limiting
- Deploying production-ready scraping applications
You now have advanced scraping capabilities that can handle complex real-world scenarios!