Cheerio excels at web scraping and data extraction because it is lightweight, fast, and easy to use. Here's a complete guide to using Cheerio for web scraping:
1. Basic Scraper Architecture
```javascript
const cheerio = require('cheerio');
const axios = require('axios');

async function scrapePage(url) {
  try {
    // 1. Send HTTP request to get HTML
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    });

    // 2. Load HTML with Cheerio
    const $ = cheerio.load(response.data);

    // 3. Extract data
    const data = {
      title: $('title').text(),
      description: $('meta[name="description"]').attr('content'),
      links: []
    };

    // 4. Extract all links
    $('a[href]').each((index, element) => {
      data.links.push({
        text: $(element).text().trim(),
        href: $(element).attr('href')
      });
    });

    return data;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    throw error;
  }
}
```
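A minimal usage sketch for the function above; the URL is just a placeholder:

```javascript
// Example call, assuming scrapePage from above; the URL is a placeholder
scrapePage('https://example.com')
  .then(data => console.log(JSON.stringify(data, null, 2)))
  .catch(() => process.exit(1));
```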
2. News Website Scraping Example
```javascript
async function scrapeNews() {
  const url = 'https://example-news.com';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  const articles = [];

  $('.news-item').each((index, element) => {
    const $item = $(element);
    articles.push({
      title: $item.find('.title').text().trim(),
      link: $item.find('a').attr('href'),
      summary: $item.find('.summary').text().trim(),
      date: $item.find('.date').text().trim(),
      author: $item.find('.author').text().trim()
    });
  });

  return articles;
}
```
3. E-commerce Product Scraping
```javascript
async function scrapeProducts() {
  const url = 'https://example-shop.com/products';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  const products = [];

  $('.product-card').each((index, element) => {
    const $product = $(element);
    const priceText = $product.find('.price').text();
    const price = parseFloat(priceText.replace(/[^0-9.]/g, ''));

    products.push({
      name: $product.find('.product-name').text().trim(),
      price: price,
      originalPrice: parseFloat($product.find('.original-price').text().replace(/[^0-9.]/g, '')) || null,
      discount: $product.find('.discount').text().trim(),
      rating: parseFloat($product.find('.rating').attr('data-rating')),
      reviews: parseInt($product.find('.review-count').text().replace(/[^0-9]/g, ''), 10),
      image: $product.find('img').attr('src'),
      link: $product.find('a.product-link').attr('href')
    });
  });

  return products;
}
```
4. Pagination Scraping
```javascript
async function scrapeMultiplePages(baseUrl, maxPages) {
  const allData = [];

  for (let page = 1; page <= maxPages; page++) {
    const url = `${baseUrl}?page=${page}`;
    console.log(`Scraping page ${page}...`);

    try {
      const response = await axios.get(url);
      const $ = cheerio.load(response.data);

      $('.item').each((index, element) => {
        allData.push({
          id: $(element).attr('data-id'),
          title: $(element).find('.title').text().trim(),
          content: $(element).find('.content').text().trim()
        });
      });

      // Delay to avoid being blocked
      await new Promise(resolve => setTimeout(resolve, 1000));
    } catch (error) {
      console.error(`Page ${page} scraping failed:`, error.message);
    }
  }

  return allData;
}
```
5. Handling Relative URLs
```javascript
// Use the built-in WHATWG URL class (the legacy url.resolve API is deprecated)
function resolveUrl(base, relative) {
  return new URL(relative, base).href;
}

// Usage example
const baseUrl = 'https://example.com';
const relativeLink = '/article/123';
const absoluteUrl = resolveUrl(baseUrl, relativeLink);
// Result: https://example.com/article/123
```
6. Data Cleaning and Validation
```javascript
function cleanData(rawData) {
  return rawData
    .map(item => ({
      title: item.title.replace(/\s+/g, ' ').trim(),
      price: parseFloat(item.price) || 0,
      description: item.description
        .replace(/<[^>]*>/g, '') // Remove HTML tags
        .replace(/\s+/g, ' ')    // Merge whitespace
        .trim(),
      date: new Date(item.date),
      isValid: item.title.length > 0 && item.price > 0
    }))
    .filter(item => item.isValid);
}
```
7. Error Handling and Retry Mechanism
```javascript
async function fetchWithRetry(url, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      const response = await axios.get(url, {
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });
      return response.data;
    } catch (error) {
      console.log(`Attempt ${i + 1}/${maxRetries} failed:`, error.message);

      if (i === maxRetries - 1) {
        throw error;
      }

      // Exponential backoff: wait 1s, 2s, 4s, ...
      await new Promise(resolve => setTimeout(resolve, Math.pow(2, i) * 1000));
    }
  }
}
```
8. Saving Data to File
```javascript
const fs = require('fs');

// Save as JSON
function saveToFile(data, filename) {
  const jsonData = JSON.stringify(data, null, 2);
  fs.writeFileSync(filename, jsonData, 'utf8');
  console.log(`Data saved to ${filename}`);
}

// Save as CSV
function saveToCSV(data, filename) {
  if (data.length === 0) return;

  const headers = Object.keys(data[0]).join(',');
  const rows = data.map(item =>
    Object.values(item).map(value =>
      `"${String(value).replace(/"/g, '""')}"`
    ).join(',')
  );

  const csv = [headers, ...rows].join('\n');
  fs.writeFileSync(filename, csv, 'utf8');
  console.log(`CSV saved to ${filename}`);
}
```
9. Complete Scraper Example
```javascript
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');

class WebScraper {
  constructor(baseUrl) {
    this.baseUrl = baseUrl;
    this.data = [];
  }

  async scrape(maxPages = 5) {
    for (let page = 1; page <= maxPages; page++) {
      await this.scrapePage(page);
      await this.delay(1000);
    }
    this.saveData();
    return this.data;
  }

  async scrapePage(page) {
    const url = `${this.baseUrl}?page=${page}`;
    console.log(`Scraping: ${url}`);

    try {
      // fetchWithRetry from section 7 is assumed to be in scope
      const html = await fetchWithRetry(url);
      const $ = cheerio.load(html);

      $('.article').each((index, element) => {
        this.data.push(this.extractData($, element));
      });

      console.log(`Page ${page} completed, total ${this.data.length} items`);
    } catch (error) {
      console.error(`Page ${page} scraping failed:`, error.message);
    }
  }

  extractData($, element) {
    const $el = $(element);
    return {
      title: $el.find('.title').text().trim(),
      author: $el.find('.author').text().trim(),
      date: $el.find('.date').text().trim(),
      content: $el.find('.content').text().trim(),
      link: $el.find('a').attr('href'),
      tags: $el.find('.tag').map((i, tag) => $(tag).text()).get()
    };
  }

  saveData() {
    const filename = `scraped_data_${Date.now()}.json`;
    fs.writeFileSync(filename, JSON.stringify(this.data, null, 2));
    console.log(`Data saved to ${filename}`);
  }

  delay(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage example
async function main() {
  const scraper = new WebScraper('https://example-blog.com/articles');
  const data = await scraper.scrape(10);
  console.log(`Scraping completed, total ${data.length} items`);
}

main().catch(console.error);
```
Best Practices
- Set reasonable delays: Avoid frequent requests that could lead to being blocked
- Use User-Agent: Simulate real browser requests
- Handle exceptions: Comprehensive error handling and retry mechanisms
- Data validation: Clean and validate extracted data
- Respect robots.txt: Follow website crawling rules
- Incremental updates: Only scrape new or changed data
- Concurrency control: Use queues or batches to limit the number of simultaneous requests (see the sketch below)
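As a minimal sketch of concurrency control, the helper below processes a list of URLs in small batches rather than firing every request at once. It assumes `cheerio` is loaded and `fetchWithRetry` from section 7 is in scope; the batch size and the extracted fields are illustrative choices, not a prescribed setup.

```javascript
// Minimal concurrency-control sketch: process URLs in fixed-size batches.
// Assumes fetchWithRetry from section 7; batchSize is an illustrative choice.
async function scrapeInBatches(urls, batchSize = 3) {
  const results = [];

  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);

    // Fetch one batch in parallel, never more than batchSize requests at a time
    const pages = await Promise.all(
      batch.map(url => fetchWithRetry(url).catch(() => null))
    );

    pages.forEach((html, index) => {
      if (!html) return; // Skip pages that failed after all retries
      const $ = cheerio.load(html);
      results.push({ url: batch[index], title: $('title').text().trim() });
    });

    // Small pause between batches to stay polite
    await new Promise(resolve => setTimeout(resolve, 500));
  }

  return results;
}
```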