Cheerio excels at web scraping and data extraction because it is lightweight, fast, and easy to use. Here's a complete guide to using Cheerio for web scraping:
1. Basic Scraper Architecture
```javascript
const cheerio = require('cheerio');
const axios = require('axios');

async function scrapePage(url) {
  try {
    // 1. Send HTTP request to get HTML
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    });

    // 2. Load HTML with Cheerio
    const $ = cheerio.load(response.data);

    // 3. Extract data
    const data = {
      title: $('title').text(),
      description: $('meta[name="description"]').attr('content'),
      links: []
    };

    // 4. Extract all links
    $('a[href]').each((index, element) => {
      data.links.push({
        text: $(element).text().trim(),
        href: $(element).attr('href')
      });
    });

    return data;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    throw error;
  }
}
```
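A minimal usage sketch for the function above; the URL is just a placeholder:

```javascript
// Example call, assuming scrapePage from above; the URL is a placeholder
scrapePage('https://example.com')
  .then(data => console.log(JSON.stringify(data, null, 2)))
  .catch(() => process.exit(1));
```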
2. News Website Scraping Example
```javascript
async function scrapeNews() {
  const url = 'https://example-news.com';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  const articles = [];

  $('.news-item').each((index, element) => {
    const $item = $(element);
    articles.push({
      title: $item.find('.title').text().trim(),
      link: $item.find('a').attr('href'),
      summary: $item.find('.summary').text().trim(),
      date: $item.find('.date').text().trim(),
      author: $item.find('.author').text().trim()
    });
  });

  return articles;
}
```
3. E-commerce Product Scraping
```javascript
async function scrapeProducts() {
  const url = 'https://example-shop.com/products';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  const products = [];

  $('.product-card').each((index, element) => {
    const $product = $(element);
    const priceText = $product.find('.price').text();
    const price = parseFloat(priceText.replace(/[^0-9.]/g, ''));

    products.push({
      name: $product.find('.product-name').text().trim(),
      price: price,
      originalPrice: parseFloat($product.find('.original-price').text().replace(/[^0-9.]/g, '')) || null,
      discount: $product.find('.discount').text().trim(),
      rating: parseFloat($product.find('.rating').attr('data-rating')),
      reviews: parseInt($product.find('.review-count').text().replace(/[^0-9]/g, ''), 10),
      image: $product.find('img').attr('src'),
      link: $product.find('a.product-link').attr('href')
    });
  });

  return products;
}
```
4. Pagination Scraping
```javascript
async function scrapeMultiplePages(baseUrl, maxPages) {
  const allData = [];

  for (let page = 1; page <= maxPages; page++) {
    const url = `${baseUrl}?page=${page}`;
    console.log(`Scraping page ${page}...`);

    try {
      const response = await axios.get(url);
      const $ = cheerio.load(response.data);

      $('.item').each((index, element) => {
        allData.push({
          id: $(element).attr('data-id'),
          title: $(element).find('.title').text().trim(),
          content: $(element).find('.content').text().trim()
        });
      });

      // Delay to avoid being blocked
      await new Promise(resolve => setTimeout(resolve, 1000));
    } catch (error) {
      console.error(`Page ${page} scraping failed:`, error.message);
    }
  }

  return allData;
}
```
5. Handling Relative URLs
```javascript
// Use the built-in WHATWG URL class (the legacy url.resolve API is deprecated)
function resolveUrl(base, relative) {
  return new URL(relative, base).href;
}

// Usage example
const baseUrl = 'https://example.com';
const relativeLink = '/article/123';
const absoluteUrl = resolveUrl(baseUrl, relativeLink);
// Result: https://example.com/article/123
```
6. Data Cleaning and Validation
```javascript
function cleanData(rawData) {
  return rawData
    .map(item => ({
      title: item.title.replace(/\s+/g, ' ').trim(),
      price: parseFloat(item.price) || 0,
      description: item.description
        .replace(/<[^>]*>/g, '') // Remove HTML tags
        .replace(/\s+/g, ' ')    // Merge whitespace
        .trim(),
      date: new Date(item.date),
      isValid: item.title.length > 0 && item.price > 0
    }))
    .filter(item => item.isValid);
}
```
7. Error Handling and Retry Mechanism
```javascript
async function fetchWithRetry(url, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      const response = await axios.get(url, {
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });
      return response.data;
    } catch (error) {
      console.log(`Attempt ${i + 1}/${maxRetries} failed:`, error.message);

      if (i === maxRetries - 1) {
        throw error;
      }

      // Exponential backoff: wait 1s, 2s, 4s, ...
      await new Promise(resolve => setTimeout(resolve, Math.pow(2, i) * 1000));
    }
  }
}
```
8. Saving Data to File
```javascript
const fs = require('fs');

// Save as JSON
function saveToFile(data, filename) {
  const jsonData = JSON.stringify(data, null, 2);
  fs.writeFileSync(filename, jsonData, 'utf8');
  console.log(`Data saved to ${filename}`);
}

// Save as CSV
function saveToCSV(data, filename) {
  if (data.length === 0) return;

  const headers = Object.keys(data[0]).join(',');
  const rows = data.map(item =>
    Object.values(item).map(value =>
      `"${String(value).replace(/"/g, '""')}"`
    ).join(',')
  );

  const csv = [headers, ...rows].join('\n');
  fs.writeFileSync(filename, csv, 'utf8');
  console.log(`CSV saved to ${filename}`);
}
```
9. Complete Scraper Example
```javascript
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');

class WebScraper {
  constructor(baseUrl) {
    this.baseUrl = baseUrl;
    this.data = [];
  }

  async scrape(maxPages = 5) {
    for (let page = 1; page <= maxPages; page++) {
      await this.scrapePage(page);
      await this.delay(1000);
    }
    this.saveData();
    return this.data;
  }

  async scrapePage(page) {
    const url = `${this.baseUrl}?page=${page}`;
    console.log(`Scraping: ${url}`);

    try {
      // fetchWithRetry from section 7 is assumed to be in scope
      const html = await fetchWithRetry(url);
      const $ = cheerio.load(html);

      $('.article').each((index, element) => {
        this.data.push(this.extractData($, element));
      });

      console.log(`Page ${page} completed, total ${this.data.length} items`);
    } catch (error) {
      console.error(`Page ${page} scraping failed:`, error.message);
    }
  }

  extractData($, element) {
    const $el = $(element);
    return {
      title: $el.find('.title').text().trim(),
      author: $el.find('.author').text().trim(),
      date: $el.find('.date').text().trim(),
      content: $el.find('.content').text().trim(),
      link: $el.find('a').attr('href'),
      tags: $el.find('.tag').map((i, tag) => $(tag).text()).get()
    };
  }

  saveData() {
    const filename = `scraped_data_${Date.now()}.json`;
    fs.writeFileSync(filename, JSON.stringify(this.data, null, 2));
    console.log(`Data saved to ${filename}`);
  }

  delay(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage example
async function main() {
  const scraper = new WebScraper('https://example-blog.com/articles');
  const data = await scraper.scrape(10);
  console.log(`Scraping completed, total ${data.length} items`);
}

main().catch(console.error);
```
Best Practices
- Set reasonable delays: Avoid frequent requests that could lead to being blocked
- Use User-Agent: Simulate real browser requests
- Handle exceptions: Comprehensive error handling and retry mechanisms
- Data validation: Clean and validate extracted data
- Respect robots.txt: Follow website crawling rules
- Incremental updates: Only scrape new or changed data
- Concurrency control: Use queues or batches to limit the number of simultaneous requests (see the sketch below)
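As a minimal sketch of concurrency control, the helper below processes a list of URLs in small batches rather than firing every request at once. It assumes `cheerio` is loaded and `fetchWithRetry` from section 7 is in scope; the batch size and the extracted fields are illustrative choices, not a prescribed setup.

```javascript
// Minimal concurrency-control sketch: process URLs in fixed-size batches.
// Assumes fetchWithRetry from section 7; batchSize is an illustrative choice.
async function scrapeInBatches(urls, batchSize = 3) {
  const results = [];

  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);

    // Fetch one batch in parallel, never more than batchSize requests at a time
    const pages = await Promise.all(
      batch.map(url => fetchWithRetry(url).catch(() => null))
    );

    pages.forEach((html, index) => {
      if (!html) return; // Skip pages that failed after all retries
      const $ = cheerio.load(html);
      results.push({ url: batch[index], title: $('title').text().trim() });
    });

    // Small pause between batches to stay polite
    await new Promise(resolve => setTimeout(resolve, 500));
  }

  return results;
}
```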