Cheerio 本身不支持处理动态加载的内容,因为它只是一个 HTML 解析器,不会执行 JavaScript。但是,我们可以通过多种方式结合其他工具来处理动态内容:
1. 使用 Puppeteer + Cheerio 组合
这是最常用的方案,先用 Puppeteer 加载动态页面,然后用 Cheerio 解析:
javascriptconst puppeteer = require('puppeteer'); const cheerio = require('cheerio'); async function scrapeDynamicContent(url) { const browser = await puppeteer.launch({ headless: true }); const page = await browser.newPage(); // 访问页面并等待动态内容加载 await page.goto(url, { waitUntil: 'networkidle2' }); // 等待特定元素出现 await page.waitForSelector('.dynamic-content', { timeout: 10000 }); // 获取完整的 HTML const html = await page.content(); // 关闭浏览器 await browser.close(); // 使用 Cheerio 解析 const $ = cheerio.load(html); // 提取数据 const data = []; $('.dynamic-item').each((index, element) => { data.push({ title: $(element).find('.title').text().trim(), content: $(element).find('.content').text().trim(), link: $(element).find('a').attr('href') }); }); return data; }
2. 处理无限滚动页面
javascriptasync function scrapeInfiniteScroll(url) { const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.goto(url); let previousHeight = 0; let currentHeight = await page.evaluate('document.body.scrollHeight'); // 滚动直到没有新内容加载 while (currentHeight > previousHeight) { previousHeight = currentHeight; // 滚动到底部 await page.evaluate('window.scrollTo(0, document.body.scrollHeight)'); // 等待新内容加载 await page.waitForTimeout(2000); currentHeight = await page.evaluate('document.body.scrollHeight'); } // 获取 HTML 并用 Cheerio 解析 const html = await page.content(); const $ = cheerio.load(html); const items = $('.item').map((i, el) => ({ title: $(el).find('.title').text(), link: $(el).find('a').attr('href') })).get(); await browser.close(); return items; }
3. 处理懒加载图片
javascriptasync function scrapeLazyImages(url) { const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.goto(url); // 滚动页面触发懒加载 const scrollHeight = await page.evaluate('document.body.scrollHeight'); for (let i = 0; i < scrollHeight; i += 500) { await page.evaluate(`window.scrollTo(0, ${i})`); await page.waitForTimeout(500); } // 等待所有图片加载完成 await page.evaluate(async () => { const images = Array.from(document.querySelectorAll('img[data-src]')); await Promise.all(images.map(img => { if (img.dataset.src) { img.src = img.dataset.src; } })); }); await page.waitForTimeout(2000); const html = await page.content(); const $ = cheerio.load(html); const images = $('img').map((i, el) => ({ src: $(el).attr('src'), alt: $(el).attr('alt') })).get(); await browser.close(); return images; }
4. 处理点击加载更多
javascriptasync function scrapeClickToLoad(url) { const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.goto(url); let hasMore = true; const allItems = []; while (hasMore) { // 等待内容加载 await page.waitForSelector('.item'); // 获取当前页面的 HTML const html = await page.content(); const $ = cheerio.load(html); // 提取当前页面的数据 $('.item').each((index, element) => { const title = $(element).find('.title').text(); // 避免重复添加 if (!allItems.some(item => item.title === title)) { allItems.push({ title }); } }); // 检查是否有"加载更多"按钮 const loadMoreButton = await page.$('.load-more'); if (loadMoreButton) { // 点击加载更多 await loadMoreButton.click(); // 等待新内容加载 await page.waitForTimeout(2000); } else { hasMore = false; } } await browser.close(); return allItems; }
5. 处理 AJAX 请求
javascriptasync function scrapeAJAXContent(url) { const browser = await puppeteer.launch(); const page = await browser.newPage(); // 监听网络请求 const apiData = []; page.on('response', async response => { const url = response.url(); if (url.includes('/api/data')) { const data = await response.json(); apiData.push(...data); } }); await page.goto(url); // 等待 AJAX 请求完成 await page.waitForSelector('.data-loaded', { timeout: 15000 }); await browser.close(); return apiData; }
6. 使用 Playwright 替代方案
Playwright 是另一个强大的浏览器自动化工具:
javascriptconst { chromium } = require('playwright'); const cheerio = require('cheerio'); async function scrapeWithPlaywright(url) { const browser = await chromium.launch(); const page = await browser.newPage(); await page.goto(url, { waitUntil: 'networkidle' }); // 等待动态内容 await page.waitForSelector('.dynamic-content'); const html = await page.content(); await browser.close(); const $ = cheerio.load(html); return $('.item').map((i, el) => $(el).text()).get(); }
7. 优化性能的技巧
javascript// 1. 禁用不必要的资源加载 async function optimizedScrape(url) { const browser = await puppeteer.launch(); const page = await browser.newPage(); // 阻止图片、字体等资源加载 await page.setRequestInterception(true); page.on('request', request => { const resourceType = request.resourceType(); if (['image', 'font', 'stylesheet'].includes(resourceType)) { request.abort(); } else { request.continue(); } }); await page.goto(url); const html = await page.content(); await browser.close(); return cheerio.load(html); } // 2. 复用浏览器实例 class Scraper { constructor() { this.browser = null; } async init() { this.browser = await puppeteer.launch(); } async scrape(url) { const page = await this.browser.newPage(); await page.goto(url); const html = await page.content(); await page.close(); return cheerio.load(html); } async close() { await this.browser.close(); } } // 使用示例 async function main() { const scraper = new Scraper(); await scraper.init(); const urls = ['url1', 'url2', 'url3']; const results = []; for (const url of urls) { const $ = await scraper.scrape(url); results.push($('.title').text()); } await scraper.close(); return results; }
8. 错误处理和重试
javascriptasync function scrapeWithRetry(url, maxRetries = 3) { for (let i = 0; i < maxRetries; i++) { try { const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.goto(url, { timeout: 30000 }); await page.waitForSelector('.content', { timeout: 10000 }); const html = await page.content(); await browser.close(); return cheerio.load(html); } catch (error) { console.log(`尝试 ${i + 1} 失败:`, error.message); if (i === maxRetries - 1) { throw error; } await new Promise(resolve => setTimeout(resolve, 2000)); } } }
总结
虽然 Cheerio 本身无法处理动态内容,但通过与 Puppeteer、Playwright 等浏览器自动化工具结合,可以有效地处理各种动态加载场景。关键在于:
- 等待策略:使用
waitForSelector、waitForTimeout等确保内容加载完成 - 性能优化:禁用不必要的资源加载,复用浏览器实例
- 错误处理:实现重试机制,处理网络异常
- 混合使用:先用浏览器工具加载动态内容,再用 Cheerio 快速解析