Cheerio 在网页爬虫和数据抓取方面表现出色,因为它轻量、快速且易于使用。以下是使用 Cheerio 进行网页爬虫的完整指南:
1. 基本爬虫架构
// Dependencies for the basic scraper example.
const cheerio = require('cheerio');
const axios = require('axios');

/**
 * Fetch a page and extract its title, meta description and every link.
 * @param {string} url - the page to scrape
 * @returns {Promise<{title: string, description: (string|undefined), links: Array<{text: string, href: string}>}>}
 * @throws rethrows any request/parse error after logging it
 */
async function scrapePage(url) {
  try {
    // Fetch the raw HTML while presenting a browser-like User-Agent.
    const response = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    });

    // Parse the document with Cheerio.
    const $ = cheerio.load(response.data);

    // Collect every anchor that carries an href attribute.
    const links = [];
    $('a[href]').each((_, el) => {
      const $anchor = $(el);
      links.push({
        text: $anchor.text().trim(),
        href: $anchor.attr('href')
      });
    });

    return {
      title: $('title').text(),
      description: $('meta[name="description"]').attr('content'),
      links
    };
  } catch (error) {
    console.error('爬取失败:', error.message);
    throw error;
  }
}
2. 抓取新闻网站示例
/**
 * Scrape headline records from a news listing page.
 * Assumes `axios` and `cheerio` are already required earlier in the file.
 * @returns {Promise<Array<{title: string, link: (string|undefined), summary: string, date: string, author: string}>>}
 */
async function scrapeNews() {
  const url = 'https://example-news.com';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  // Small helper: trimmed text of a child selector within one card.
  const textOf = ($card, selector) => $card.find(selector).text().trim();

  // Map every .news-item card to a plain article record.
  return $('.news-item')
    .map((_, el) => {
      const $card = $(el);
      return {
        title: textOf($card, '.title'),
        link: $card.find('a').attr('href'),
        summary: textOf($card, '.summary'),
        date: textOf($card, '.date'),
        author: textOf($card, '.author')
      };
    })
    .get();
}
3. 抓取电商产品信息
/**
 * Scrape product cards from a shop listing page.
 *
 * Fix: `parseInt` is now called with an explicit radix of 10 — relying on
 * the implicit radix is a classic JavaScript pitfall.
 * Assumes `axios` and `cheerio` are already required earlier in the file.
 *
 * @returns {Promise<Array<Object>>} one record per `.product-card` element
 */
async function scrapeProducts() {
  const url = 'https://example-shop.com/products';
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);

  // Strip currency symbols / separators, keeping only digits and the dot.
  const toPrice = (text) => parseFloat(text.replace(/[^0-9.]/g, ''));

  const products = [];
  $('.product-card').each((index, element) => {
    const $product = $(element);
    products.push({
      name: $product.find('.product-name').text().trim(),
      price: toPrice($product.find('.price').text()),
      // Original price is optional; NaN (missing/unparseable) becomes null.
      originalPrice: toPrice($product.find('.original-price').text()) || null,
      discount: $product.find('.discount').text().trim(),
      // NOTE(review): yields NaN when data-rating is absent — confirm callers handle it.
      rating: parseFloat($product.find('.rating').attr('data-rating')),
      reviews: parseInt($product.find('.review-count').text().replace(/[^0-9]/g, ''), 10),
      image: $product.find('img').attr('src'),
      link: $product.find('a.product-link').attr('href')
    });
  });
  return products;
}
4. 分页爬取
/**
 * Crawl a paginated listing one page at a time with a 1s politeness delay.
 * A failed page is logged and skipped; crawling continues with the next one.
 * Assumes `axios` and `cheerio` are already required earlier in the file.
 *
 * @param {string} baseUrl - listing URL without the page query parameter
 * @param {number} maxPages - number of pages to visit, starting at 1
 * @returns {Promise<Array<{id: (string|undefined), title: string, content: string}>>}
 */
async function scrapeMultiplePages(baseUrl, maxPages) {
  const allData = [];

  // Convert one .item element into a plain record.
  const toRecord = ($, el) => {
    const $item = $(el);
    return {
      id: $item.attr('data-id'),
      title: $item.find('.title').text().trim(),
      content: $item.find('.content').text().trim()
    };
  };

  for (let page = 1; page <= maxPages; page++) {
    const url = `${baseUrl}?page=${page}`;
    console.log(`正在爬取第 ${page} 页...`);
    try {
      const response = await axios.get(url);
      const $ = cheerio.load(response.data);
      $('.item').each((_, el) => {
        allData.push(toRecord($, el));
      });
      // Be polite: pause one second between pages to avoid rate limiting.
      await new Promise((resolve) => setTimeout(resolve, 1000));
    } catch (error) {
      console.error(`第 ${page} 页爬取失败:`, error.message);
    }
  }
  return allData;
}
5. 处理相对 URL
/**
 * Resolve a possibly-relative link against a base URL.
 *
 * Fix: the legacy `url.resolve()` API is deprecated in Node.js; the WHATWG
 * `URL` constructor (a global since Node 10) is the recommended replacement
 * and produces the same result for ordinary http(s) links. This also stops
 * shadowing the global `URL` with the `require('url')` module object.
 *
 * @param {string} base - absolute base URL, e.g. 'https://example.com'
 * @param {string} relative - relative or absolute link found in the page
 * @returns {string} fully-qualified absolute URL
 */
function resolveUrl(base, relative) {
  return new URL(relative, base).href;
}

// 使用示例
const baseUrl = 'https://example.com';
const relativeLink = '/article/123';
const absoluteUrl = resolveUrl(baseUrl, relativeLink);
// 结果: https://example.com/article/123
6. 数据清洗和验证
/**
 * Normalize scraped records and drop the invalid ones.
 *
 * For each raw item: collapse whitespace in the title, coerce the price to a
 * number (0 when unparseable), strip HTML tags and extra whitespace from the
 * description, and parse the date. `isValid` is deliberately computed from
 * the RAW title/price, mirroring the original validation rule.
 *
 * @param {Array<Object>} rawData - raw scraped items
 * @returns {Array<Object>} cleaned items, invalid entries filtered out
 */
function cleanData(rawData) {
  const normalized = rawData.map((raw) => {
    const description = raw.description
      .replace(/<[^>]*>/g, '') // remove HTML tags
      .replace(/\s+/g, ' ') // collapse whitespace runs
      .trim();
    return {
      title: raw.title.replace(/\s+/g, ' ').trim(),
      price: parseFloat(raw.price) || 0,
      description,
      date: new Date(raw.date),
      isValid: raw.title.length > 0 && raw.price > 0
    };
  });
  return normalized.filter((entry) => entry.isValid);
}
7. 错误处理和重试机制
/**
 * GET a URL with up to `maxRetries` attempts and exponential backoff.
 * Assumes `axios` is already required earlier in the file.
 *
 * @param {string} url - the URL to fetch
 * @param {number} [maxRetries=3] - total number of attempts before giving up
 * @returns {Promise<*>} the response body
 * @throws the last request error once all attempts are exhausted
 */
async function fetchWithRetry(url, maxRetries = 3) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const { data } = await axios.get(url, {
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });
      return data;
    } catch (error) {
      console.log(`尝试 ${attempt}/${maxRetries} 失败:`, error.message);
      // Out of attempts: surface the final error to the caller.
      if (attempt === maxRetries) {
        throw error;
      }
      // Exponential backoff: 1s, 2s, 4s, ...
      await new Promise((resolve) => setTimeout(resolve, 2 ** (attempt - 1) * 1000));
    }
  }
}
8. 保存数据到文件
const fs = require('fs');

/**
 * Serialize `data` as pretty-printed JSON and write it to `filename`.
 *
 * Fix: the completion log contained a broken `$(unknown)` placeholder
 * instead of interpolating the actual filename.
 *
 * @param {*} data - any JSON-serializable value
 * @param {string} filename - destination path
 */
function saveToFile(data, filename) {
  const jsonData = JSON.stringify(data, null, 2);
  fs.writeFileSync(filename, jsonData, 'utf8');
  console.log(`数据已保存到 ${filename}`);
}

// 保存为 CSV
/**
 * Write an array of flat objects to `filename` as CSV.
 * Headers come from the keys of the first row; every value is quoted, with
 * embedded double quotes escaped by doubling. No-op on an empty array.
 *
 * Fix: same broken `$(unknown)` placeholder as above.
 *
 * @param {Array<Object>} data - rows; objects should share the same keys
 * @param {string} filename - destination path
 */
function saveToCSV(data, filename) {
  if (data.length === 0) return;
  const headers = Object.keys(data[0]).join(',');
  const rows = data.map((item) =>
    Object.values(item)
      .map((value) => `"${String(value).replace(/"/g, '""')}"`)
      .join(',')
  );
  const csv = [headers, ...rows].join('\n');
  fs.writeFileSync(filename, csv, 'utf8');
  console.log(`CSV 已保存到 ${filename}`);
}
9. 完整爬虫示例
// Dependencies for the complete scraper example.
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');

/**
 * Page-by-page blog scraper: collects `.article` records from a paginated
 * listing and saves them to a timestamped JSON file.
 *
 * Relies on `fetchWithRetry` (section 7) being defined in the same file.
 * Fix: the save log contained a broken `$(unknown)` placeholder instead of
 * interpolating the output filename.
 */
class WebScraper {
  /** @param {string} baseUrl - listing URL without the page query parameter */
  constructor(baseUrl) {
    this.baseUrl = baseUrl;
    this.data = [];
  }

  /**
   * Crawl up to `maxPages` pages with a 1s politeness delay between pages,
   * then persist everything collected.
   * @param {number} [maxPages=5]
   * @returns {Promise<Array<Object>>} all collected records
   */
  async scrape(maxPages = 5) {
    for (let page = 1; page <= maxPages; page++) {
      await this.scrapePage(page);
      await this.delay(1000);
    }
    this.saveData();
    return this.data;
  }

  /** Fetch and parse one listing page; failures are logged, not fatal. */
  async scrapePage(page) {
    const url = `${this.baseUrl}?page=${page}`;
    console.log(`正在爬取: ${url}`);
    try {
      const html = await fetchWithRetry(url);
      const $ = cheerio.load(html);
      $('.article').each((index, element) => {
        this.data.push(this.extractData($, element));
      });
      console.log(`第 ${page} 页完成,共 ${this.data.length} 条数据`);
    } catch (error) {
      console.error(`第 ${page} 页爬取失败:`, error.message);
    }
  }

  /** Turn one `.article` element into a plain record. */
  extractData($, element) {
    const $el = $(element);
    return {
      title: $el.find('.title').text().trim(),
      author: $el.find('.author').text().trim(),
      date: $el.find('.date').text().trim(),
      content: $el.find('.content').text().trim(),
      link: $el.find('a').attr('href'),
      tags: $el.find('.tag').map((i, tag) => $(tag).text()).get()
    };
  }

  /** Persist everything collected so far to a timestamped JSON file. */
  saveData() {
    const filename = `scraped_data_${Date.now()}.json`;
    fs.writeFileSync(filename, JSON.stringify(this.data, null, 2));
    console.log(`数据已保存到 ${filename}`);
  }

  /** Promise-based sleep. */
  delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}

// 使用示例
async function main() {
  const scraper = new WebScraper('https://example-blog.com/articles');
  const data = await scraper.scrape(10);
  console.log(`爬取完成,共 ${data.length} 条数据`);
}

main().catch(console.error);
最佳实践
- 设置合理的延迟:避免频繁请求导致被封
- 使用 User-Agent:模拟真实浏览器请求
- 处理异常:完善的错误处理和重试机制
- 数据验证:清洗和验证提取的数据
- 遵守 robots.txt:尊重网站的爬虫规则
- 增量更新:只抓取新增或变化的数据
- 并发控制:使用队列控制并发请求数量