Advanced Scraping Techniques and Data Processing
In this part, we’ll tackle complex scraping scenarios that go beyond basic page extraction. You’ll learn to handle dynamic content, pagination, and infinite scroll, and to build sophisticated data processing pipelines.
Handling Dynamic Content and JavaScript-Heavy Sites
Understanding Modern Web Applications
Modern websites often load content dynamically using JavaScript frameworks like React, Vue, or Angular. FireCrawl excels at handling these scenarios by fully rendering pages before extraction.
```typescript
import { firecrawlApp } from '../config/firecrawl';

export class DynamicContentScraper {
  async scrapeSPA(url: string, waitConditions?: {
    waitFor?: number;
    waitForSelector?: string;
    waitForFunction?: string;
  }) {
    const options = {
      formats: ['markdown'] as const,
      onlyMainContent: true,
      timeout: 60000, // Increased timeout for heavy JS sites
      ...(waitConditions && {
        waitFor: waitConditions.waitFor || 5000,
      }),
    };

    const result = await firecrawlApp.scrapeUrl(url, options);

    if (result.success) {
      return {
        content: result.data.markdown,
        title: result.data.metadata?.title,
        loadTime: result.data.metadata?.loadTime,
        screenshots: result.data.screenshot,
      };
    }

    return null;
  }

  async scrapeInfiniteScroll(url: string, maxScrolls: number = 5) {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Extract all content items from this page that loads content on scroll.
          Look for repeated patterns like posts, products, articles, etc.
          Return as JSON array with: title, description, url, imageUrl for each item.`,
      },
      // FireCrawl handles infinite scroll automatically
      timeout: 90000,
    });

    return result.success ? result.data.extract : [];
  }
}
```
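Here’s a minimal usage sketch of the scraper above. The dashboard URL, the import path, and the longer `waitFor` value are placeholder assumptions for illustration; adjust them to your own project layout and target site.

```typescript
import { DynamicContentScraper } from './scrapers/dynamic-content-scraper';

async function main() {
  const scraper = new DynamicContentScraper();

  // Hypothetical client-side app that needs extra time to hydrate before extraction
  const page = await scraper.scrapeSPA('https://example.com/dashboard', {
    waitFor: 8000,
  });

  if (page) {
    console.log(`Scraped "${page.title}" (${page.content?.length ?? 0} chars of markdown)`);
  } else {
    console.log('Scrape failed');
  }
}

main().catch(console.error);
```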
Advanced Pagination Handling

Multi-Page Data Collection
```typescript
import { firecrawlApp } from '../config/firecrawl';
import { delay } from '../utils/helpers';

export interface PaginatedResult<T> {
  data: T[];
  totalPages: number;
  currentPage: number;
  hasNextPage: boolean;
}

export class PaginationScraper {
  async scrapeAllPages<T>(
    baseUrl: string,
    extractionPrompt: string,
    options: {
      maxPages?: number;
      pageParam?: string;
      startPage?: number;
      delayBetweenPages?: number;
    } = {}
  ): Promise<T[]> {
    const {
      maxPages = 10,
      pageParam = 'page',
      startPage = 1,
      delayBetweenPages = 2000,
    } = options;

    const allData: T[] = [];
    let currentPage = startPage;
    let hasNextPage = true;

    while (hasNextPage && currentPage <= maxPages) {
      const pageUrl = this.buildPageUrl(baseUrl, pageParam, currentPage);
      console.log(`📄 Scraping page ${currentPage}: ${pageUrl}`);

      try {
        const result = await firecrawlApp.scrapeUrl(pageUrl, {
          formats: ['markdown'],
          extractorOptions: {
            mode: 'llm-extraction',
            extractionPrompt:
              extractionPrompt +
              ` Also determine if there's a next page available.
                Return: { items: [...], hasNextPage: boolean }`,
          },
        });

        if (result.success && result.data.extract) {
          const pageData = result.data.extract;

          if (Array.isArray(pageData.items)) {
            allData.push(...pageData.items);
          }

          hasNextPage = pageData.hasNextPage !== false;

          if (pageData.items?.length === 0) {
            hasNextPage = false;
          }
        } else {
          hasNextPage = false;
        }

        currentPage++;

        if (hasNextPage && delayBetweenPages > 0) {
          await delay(delayBetweenPages);
        }
      } catch (error) {
        console.error(`❌ Error scraping page ${currentPage}:`, error);
        hasNextPage = false;
      }
    }

    console.log(`✅ Scraped ${allData.length} items from ${currentPage - 1} pages`);
    return allData;
  }

  private buildPageUrl(baseUrl: string, pageParam: string, pageNumber: number): string {
    const url = new URL(baseUrl);
    url.searchParams.set(pageParam, pageNumber.toString());
    return url.toString();
  }

  async scrapeEcommerceCatalog(baseUrl: string, maxPages: number = 20) {
    return this.scrapeAllPages(
      baseUrl,
      `Extract all products from this e-commerce page.
       For each product: name, price, imageUrl, productUrl, rating, availability`,
      { maxPages }
    );
  }

  async scrapeJobListings(baseUrl: string, maxPages: number = 15) {
    return this.scrapeAllPages(
      baseUrl,
      `Extract all job listings from this page.
       For each job: title, company, location, salary, jobUrl, postedDate, jobType`,
      { maxPages }
    );
  }
}
```
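If you want to try one of the convenience methods, a quick sketch might look like the following. The catalog URL is a placeholder, and it assumes the target site paginates with a `?page=N` query parameter (the scraper's default).

```typescript
import { PaginationScraper } from './scrapers/pagination-scraper';

async function run() {
  const scraper = new PaginationScraper();

  // Placeholder catalog URL; keep maxPages small while testing to limit API usage
  const products = await scraper.scrapeEcommerceCatalog(
    'https://example-shop.com/categories/laptops',
    5
  );

  console.log(`Collected ${products.length} products`);
}

run().catch(console.error);
```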
Sophisticated Data Processing

Advanced Data Transformation Pipeline
```typescript
import { saveToFile } from '../utils/helpers';

export interface ProcessingStep<T, R> {
  name: string;
  process: (data: T[]) => Promise<R[]>;
}

export class DataPipeline<T> {
  private steps: ProcessingStep<any, any>[] = [];

  addStep<R>(step: ProcessingStep<T, R>): DataPipeline<R> {
    this.steps.push(step);
    return this as any;
  }

  async execute(initialData: T[]): Promise<any[]> {
    let currentData = initialData;

    for (const step of this.steps) {
      console.log(`🔄 Executing step: ${step.name}`);
      currentData = await step.process(currentData);
      console.log(`✅ Step completed. Items: ${currentData.length}`);
    }

    return currentData;
  }
}

// Data processing utilities
export class DataProcessor {
  static deduplication<T>(keyExtractor: (item: T) => string): ProcessingStep<T, T> {
    return {
      name: 'Deduplication',
      process: async (data: T[]) => {
        const seen = new Set<string>();
        return data.filter(item => {
          const key = keyExtractor(item);
          if (seen.has(key)) return false;
          seen.add(key);
          return true;
        });
      },
    };
  }

  static validation<T>(validator: (item: T) => boolean): ProcessingStep<T, T> {
    return {
      name: 'Validation',
      process: async (data: T[]) => {
        return data.filter(validator);
      },
    };
  }

  static enrichment<T, R>(enricher: (item: T) => Promise<R>): ProcessingStep<T, R> {
    return {
      name: 'Data Enrichment',
      process: async (data: T[]) => {
        const enriched: R[] = [];
        for (const item of data) {
          try {
            const enrichedItem = await enricher(item);
            enriched.push(enrichedItem);
          } catch (error) {
            console.error('Enrichment failed for item:', error);
          }
        }
        return enriched;
      },
    };
  }

  static transformation<T, R>(transformer: (item: T) => R): ProcessingStep<T, R> {
    return {
      name: 'Data Transformation',
      process: async (data: T[]) => {
        return data.map(transformer);
      },
    };
  }

  static aggregation<T, R>(aggregator: (data: T[]) => R[]): ProcessingStep<T, R> {
    return {
      name: 'Data Aggregation',
      process: async (data: T[]) => {
        return aggregator(data);
      },
    };
  }
}
```
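Before wiring the pipeline to real scraped data, it helps to verify the steps on a tiny in-memory dataset. The `Article` shape and the sample items below are invented purely for illustration.

```typescript
import { DataPipeline, DataProcessor } from './processors/data-pipeline';

interface Article {
  title: string;
  url: string;
}

async function demoPipeline() {
  // Tiny in-memory dataset so each step's effect is easy to see
  const scraped: Article[] = [
    { title: 'Intro to FireCrawl ', url: 'https://example.com/a' },
    { title: 'Intro to FireCrawl', url: 'https://example.com/a' }, // duplicate URL, removed by deduplication
    { title: '', url: 'https://example.com/b' },                   // empty title, removed by validation
  ];

  const pipeline = new DataPipeline<Article>()
    .addStep(DataProcessor.deduplication<Article>(a => a.url))
    .addStep(DataProcessor.validation<Article>(a => a.title.length > 0))
    .addStep(DataProcessor.transformation<Article, Article>(a => ({ ...a, title: a.title.trim() })));

  const clean = await pipeline.execute(scraped);
  console.log(clean); // [{ title: 'Intro to FireCrawl', url: 'https://example.com/a' }]
}

demoPipeline().catch(console.error);
```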
Real-World Advanced Examples

E-commerce Intelligence System
```typescript
import { PaginationScraper } from '../scrapers/pagination-scraper';
import { DataPipeline, DataProcessor } from '../processors/data-pipeline';
import { firecrawlApp } from '../config/firecrawl';

export interface Product {
  name: string;
  price: number;
  originalPrice?: number;
  discount?: number;
  rating: number;
  reviewCount: number;
  availability: string;
  category: string;
  brand: string;
  imageUrl: string;
  productUrl: string;
  features: string[];
  scrapedAt: string;
}

export interface EnrichedProduct extends Product {
  priceHistory?: number[];
  competitorPrices?: { site: string; price: number }[];
  sentiment?: 'positive' | 'negative' | 'neutral';
  popularityScore?: number;
}

export class EcommerceIntelligence {
  private paginationScraper = new PaginationScraper();

  async analyzeProductCategory(categoryUrl: string): Promise<EnrichedProduct[]> {
    // Step 1: Scrape all products from the category
    const rawProducts = await this.paginationScraper.scrapeAllPages<Product>(
      categoryUrl,
      `Extract all products with: name, price, rating, reviewCount, availability,
       brand, imageUrl, productUrl, features array`,
      { maxPages: 50 }
    );

    // Step 2: Process data through the pipeline
    // (arrow wrappers keep `this` bound when the methods run inside pipeline steps)
    const pipeline = new DataPipeline<Product>()
      .addStep(DataProcessor.deduplication<Product>(p => p.productUrl))
      .addStep(DataProcessor.validation<Product>(p => this.isValidProduct(p)))
      .addStep(DataProcessor.transformation<Product, Product>(p => this.normalizeProduct(p)))
      .addStep(DataProcessor.enrichment<Product, EnrichedProduct>(p => this.enrichProduct(p)));

    const processedProducts = await pipeline.execute(rawProducts);

    // Step 3: Generate insights
    const insights = this.generateCategoryInsights(processedProducts);
    console.log('📊 Category Analysis:', insights);

    return processedProducts;
  }

  private isValidProduct(product: Product): boolean {
    return !!(product.name && product.price && product.productUrl);
  }

  private normalizeProduct(product: Product): Product {
    // LLM extraction may return the price as a string like "$1,299.00"
    const rawPrice = product.price as unknown;
    const price =
      typeof rawPrice === 'string'
        ? parseFloat(rawPrice.replace(/[^0-9.]/g, ''))
        : product.price;

    return {
      ...product,
      name: product.name.trim(),
      price,
      rating: Math.min(5, Math.max(0, product.rating || 0)),
      scrapedAt: new Date().toISOString(),
    };
  }

  private async enrichProduct(product: Product): Promise<EnrichedProduct> {
    const enriched: EnrichedProduct = { ...product };

    // Add competitor price checking
    enriched.competitorPrices = await this.getCompetitorPrices(product.name);

    // Calculate popularity score
    enriched.popularityScore = this.calculatePopularityScore(product);

    // Analyze sentiment from reviews
    enriched.sentiment = await this.analyzeSentiment(product.productUrl);

    return enriched;
  }

  private async getCompetitorPrices(productName: string): Promise<{ site: string; price: number }[]> {
    const competitors = [
      'https://amazon.com/s?k=',
      'https://ebay.com/sch/',
    ];

    const prices: { site: string; price: number }[] = [];
    for (const baseUrl of competitors) {
      try {
        const searchUrl = baseUrl + encodeURIComponent(productName);
        const result = await firecrawlApp.scrapeUrl(searchUrl, {
          formats: ['markdown'],
          extractorOptions: {
            mode: 'llm-extraction',
            extractionPrompt: `Find the lowest price for "${productName}" on this page. Return just the numeric price.`,
          },
        });

        if (result.success && result.data.extract) {
          const price = parseFloat(result.data.extract.toString());
          if (!isNaN(price)) {
            prices.push({ site: new URL(baseUrl).hostname, price });
          }
        }
      } catch (error) {
        console.error(`Failed to get competitor price from ${baseUrl}:`, error);
      }
    }

    return prices;
  }

  private calculatePopularityScore(product: Product): number {
    const ratingWeight = 0.4;
    const reviewCountWeight = 0.6;

    const normalizedRating = product.rating / 5;
    const normalizedReviews = Math.min(product.reviewCount / 1000, 1);

    return (normalizedRating * ratingWeight + normalizedReviews * reviewCountWeight) * 100;
  }

  private async analyzeSentiment(productUrl: string): Promise<'positive' | 'negative' | 'neutral'> {
    try {
      const result = await firecrawlApp.scrapeUrl(productUrl, {
        formats: ['markdown'],
        extractorOptions: {
          mode: 'llm-extraction',
          extractionPrompt: `Analyze the overall sentiment of customer reviews on this product page.
            Consider ratings, review text, and overall tone.
            Return one word: "positive", "negative", or "neutral"`,
        },
      });

      const sentiment = result.data?.extract?.toString().toLowerCase();
      if (sentiment && ['positive', 'negative', 'neutral'].includes(sentiment)) {
        return sentiment as 'positive' | 'negative' | 'neutral';
      }
    } catch (error) {
      console.error('Sentiment analysis failed:', error);
    }

    return 'neutral';
  }

  private generateCategoryInsights(products: EnrichedProduct[]) {
    return {
      totalProducts: products.length,
      averagePrice: products.reduce((sum, p) => sum + p.price, 0) / products.length,
      averageRating: products.reduce((sum, p) => sum + p.rating, 0) / products.length,
      topBrands: this.getTopBrands(products),
      priceRanges: this.getPriceDistribution(products),
      availabilityStats: this.getAvailabilityStats(products),
    };
  }

  private getTopBrands(products: EnrichedProduct[]) {
    const brandCounts = products.reduce((acc, p) => {
      acc[p.brand] = (acc[p.brand] || 0) + 1;
      return acc;
    }, {} as Record<string, number>);

    return Object.entries(brandCounts)
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([brand, count]) => ({ brand, count }));
  }

  private getPriceDistribution(products: EnrichedProduct[]) {
    const ranges = [
      { min: 0, max: 50, label: '$0-50' },
      { min: 50, max: 100, label: '$50-100' },
      { min: 100, max: 200, label: '$100-200' },
      { min: 200, max: 500, label: '$200-500' },
      { min: 500, max: Infinity, label: '$500+' },
    ];

    return ranges.map(range => ({
      ...range,
      count: products.filter(p => p.price >= range.min && p.price < range.max).length,
    }));
  }

  private getAvailabilityStats(products: EnrichedProduct[]) {
    const stats = products.reduce((acc, p) => {
      acc[p.availability] = (acc[p.availability] || 0) + 1;
      return acc;
    }, {} as Record<string, number>);

    return Object.entries(stats).map(([status, count]) => ({ status, count }));
  }
}
```
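A possible entry point for the intelligence system, assuming a placeholder category URL. Keep in mind that a 50-page crawl with per-product enrichment is slow and consumes API credits, so start with a narrow category while experimenting.

```typescript
import { EcommerceIntelligence } from './analyzers/ecommerce-intelligence';

async function analyzeCategory() {
  const intel = new EcommerceIntelligence();

  // Placeholder category URL; the full scrape → clean → enrich pipeline runs behind this call
  const products = await intel.analyzeProductCategory(
    'https://example-shop.com/categories/headphones'
  );

  // Surface the five strongest products by the computed popularity score
  const top = [...products]
    .sort((a, b) => (b.popularityScore ?? 0) - (a.popularityScore ?? 0))
    .slice(0, 5);

  console.table(top.map(p => ({ name: p.name, price: p.price, score: p.popularityScore })));
}

analyzeCategory().catch(console.error);
```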
Advanced Content Analysis

Content Intelligence Engine
```typescript
import { firecrawlApp } from '../config/firecrawl';

export interface ContentAnalysis {
  readabilityScore: number;
  keywordDensity: Record<string, number>;
  topicCategories: string[];
  sentiment: 'positive' | 'negative' | 'neutral';
  contentStructure: {
    headingCount: number;
    paragraphCount: number;
    linkCount: number;
    imageCount: number;
  };
  seoMetrics: {
    titleLength: number;
    metaDescriptionLength: number;
    h1Count: number;
    altTextCoverage: number;
  };
}

export class ContentAnalyzer {
  async analyzeContent(url: string): Promise<ContentAnalysis | null> {
    const result = await firecrawlApp.scrapeUrl(url, {
      formats: ['markdown', 'html'],
      extractorOptions: {
        mode: 'llm-extraction',
        extractionPrompt: `Analyze this content comprehensively:
          1. Readability score (1-100, higher is more readable)
          2. Top 10 keywords with frequency
          3. Main topic categories
          4. Overall sentiment
          5. Content structure (headings, paragraphs, links, images count)
          6. SEO metrics (title length, meta description length, H1 count)

          Return as structured JSON with all metrics.`,
      },
    });

    if (!result.success) return null;

    const analysis = result.data.extract;
    const markdown = result.data.markdown || '';
    const html = result.data.html || '';

    return {
      readabilityScore: analysis?.readabilityScore || this.calculateReadability(markdown),
      keywordDensity: analysis?.keywordDensity || this.extractKeywords(markdown),
      topicCategories: analysis?.topicCategories || [],
      sentiment: analysis?.sentiment || 'neutral',
      contentStructure: this.analyzeStructure(markdown),
      seoMetrics: this.analyzeSEO(html, result.data.metadata),
    };
  }

  private calculateReadability(text: string): number {
    const sentences = text.split(/[.!?]+/).length;
    const words = text.split(/\s+/).length;
    const syllables = this.countSyllables(text);

    // Simplified Flesch Reading Ease formula
    const score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words);
    return Math.max(0, Math.min(100, score));
  }

  private countSyllables(text: string): number {
    // Rough approximation: each run of consecutive vowels counts as one syllable
    const vowelGroups = text.toLowerCase().match(/[aeiouy]+/g);
    return vowelGroups ? vowelGroups.length : 1;
  }

  private extractKeywords(text: string): Record<string, number> {
    const words = text.toLowerCase()
      .replace(/[^\w\s]/g, '')
      .split(/\s+/)
      .filter(word => word.length > 3);

    const frequency: Record<string, number> = {};
    words.forEach(word => {
      frequency[word] = (frequency[word] || 0) + 1;
    });

    return Object.fromEntries(
      Object.entries(frequency)
        .sort(([, a], [, b]) => b - a)
        .slice(0, 10)
    );
  }

  private analyzeStructure(markdown: string) {
    return {
      headingCount: (markdown.match(/^#+\s/gm) || []).length,
      paragraphCount: markdown.split('\n\n').length,
      linkCount: (markdown.match(/\[.*?\]\(.*?\)/g) || []).length,
      imageCount: (markdown.match(/!\[.*?\]\(.*?\)/g) || []).length,
    };
  }

  private analyzeSEO(html: string, metadata: any) {
    return {
      titleLength: metadata?.title?.length || 0,
      metaDescriptionLength: metadata?.description?.length || 0,
      h1Count: (html.match(/<h1[^>]*>/gi) || []).length,
      altTextCoverage: this.calculateAltTextCoverage(html),
    };
  }

  private calculateAltTextCoverage(html: string): number {
    const images = html.match(/<img[^>]*>/gi) || [];
    const imagesWithAlt = images.filter(img => /alt\s*=\s*["'][^"']*["']/i.test(img));
    return images.length > 0 ? (imagesWithAlt.length / images.length) * 100 : 0;
  }
}
```
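To run an audit against a single page, something like this minimal sketch works. The blog URL and the import path are assumptions for illustration.

```typescript
import { ContentAnalyzer } from './analyzers/content-analyzer';

async function audit() {
  const analyzer = new ContentAnalyzer();

  // Placeholder URL for a page you want to audit
  const report = await analyzer.analyzeContent('https://example.com/blog/scraping-guide');

  if (report) {
    console.log(`Readability: ${report.readabilityScore.toFixed(1)}/100`);
    console.log(`H1 count: ${report.seoMetrics.h1Count}`);
    console.log(`Alt text coverage: ${report.seoMetrics.altTextCoverage.toFixed(0)}%`);
  }
}

audit().catch(console.error);
```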
Performance Optimization Strategies

Intelligent Caching System
```typescript
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import { firecrawlApp } from '../config/firecrawl';

export interface CacheEntry<T> {
  data: T;
  timestamp: number;
  ttl: number;
  url: string;
}

export class CacheManager {
  private cacheDir: string;

  constructor(cacheDir: string = './cache') {
    this.cacheDir = cacheDir;
  }

  async get<T>(key: string): Promise<T | null> {
    try {
      const filePath = this.getCacheFilePath(key);
      const content = await fs.readFile(filePath, 'utf-8');
      const entry: CacheEntry<T> = JSON.parse(content);

      if (Date.now() - entry.timestamp > entry.ttl) {
        await this.delete(key);
        return null;
      }

      return entry.data;
    } catch {
      return null;
    }
  }

  async set<T>(key: string, data: T, ttl: number = 3600000, url?: string): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });

    const entry: CacheEntry<T> = {
      data,
      timestamp: Date.now(),
      ttl,
      url: url || '',
    };

    const filePath = this.getCacheFilePath(key);
    await fs.writeFile(filePath, JSON.stringify(entry, null, 2));
  }

  async delete(key: string): Promise<void> {
    try {
      const filePath = this.getCacheFilePath(key);
      await fs.unlink(filePath);
    } catch {
      // File doesn't exist, ignore
    }
  }

  generateKey(url: string, options?: any): string {
    const combined = url + JSON.stringify(options || {});
    return crypto.createHash('md5').update(combined).digest('hex');
  }

  private getCacheFilePath(key: string): string {
    return path.join(this.cacheDir, `${key}.json`);
  }
}

// Enhanced scraper with caching
export class CachedScraper {
  private cache = new CacheManager();

  async scrapeWithCache(url: string, options: any = {}, ttl: number = 3600000) {
    const cacheKey = this.cache.generateKey(url, options);

    // Try the cache first
    const cached = await this.cache.get(cacheKey);
    if (cached) {
      console.log(`📦 Cache hit for: ${url}`);
      return cached;
    }

    // Scrape and cache
    console.log(`🔍 Cache miss, scraping: ${url}`);
    const result = await firecrawlApp.scrapeUrl(url, options);

    if (result.success) {
      await this.cache.set(cacheKey, result.data, ttl, url);
    }

    return result.success ? result.data : null;
  }
}
```
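A short sketch of the cache in action, using a placeholder URL. The first call misses the cache and scrapes; the second call with the same URL and options is served from the JSON file on disk.

```typescript
import { CachedScraper } from './scrapers/cached-scraper';

async function cachedDemo() {
  const scraper = new CachedScraper();
  const url = 'https://example.com/pricing';

  // First call hits the API and writes ./cache/<hash>.json with a 30-minute TTL
  await scraper.scrapeWithCache(url, { formats: ['markdown'] }, 30 * 60 * 1000);

  // Second call with identical options is answered from the cache
  const cached = await scraper.scrapeWithCache(url, { formats: ['markdown'] }, 30 * 60 * 1000);
  console.log(cached ? 'Got cached data' : 'Scrape failed');
}

cachedDemo().catch(console.error);
```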
Key Takeaways

- FireCrawl handles JavaScript-heavy sites and dynamic content automatically
- Use structured extraction prompts for complex data parsing
- Implement robust data processing pipelines for production systems
- Cache results to improve performance and reduce API costs
- Build comprehensive analysis tools for content intelligence
- Always validate and clean scraped data before processing
Next Steps
In Part 4, we’ll integrate our scraping capabilities with the Vercel SDK to build deployable applications:
- Setting up Vercel serverless functions
- Creating API endpoints for scraping services
- Building web interfaces for scraping tools
- Implementing authentication and rate limiting
- Deploying production-ready scraping applications
You now have advanced scraping capabilities that can handle complex real-world scenarios!