乐闻世界logo
搜索文章和话题

How does Cheerio handle dynamically loaded content? What are the solutions?

February 22, 14:30

Cheerio itself doesn't support handling dynamically loaded content because it's just an HTML parser and doesn't execute JavaScript. However, we can handle dynamic content by combining it with other tools in various ways:

1. Using Puppeteer + Cheerio Combination

This is the most common approach - use Puppeteer to load dynamic pages first, then use Cheerio to parse:

javascript
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

/**
 * Render a JavaScript-heavy page with Puppeteer, then parse the final HTML with Cheerio.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<Array<{title: string, content: string, link: string|undefined}>>}
 *   One record per `.dynamic-item` element found on the rendered page.
 * @throws Navigation or selector-timeout errors from Puppeteer propagate to the caller.
 */
async function scrapeDynamicContent(url) {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    // networkidle2 waits until the page has (almost) stopped issuing requests,
    // giving client-side rendering a chance to finish.
    await page.goto(url, { waitUntil: 'networkidle2' });
    // Make sure the dynamic container actually appeared before snapshotting.
    await page.waitForSelector('.dynamic-content', { timeout: 10000 });
    const html = await page.content();

    // Parse the fully rendered markup with Cheerio.
    const $ = cheerio.load(html);
    const data = [];
    $('.dynamic-item').each((index, element) => {
      data.push({
        title: $(element).find('.title').text().trim(),
        content: $(element).find('.content').text().trim(),
        link: $(element).find('a').attr('href'),
      });
    });
    return data;
  } finally {
    // Always release the browser, even when navigation or waiting throws —
    // the original version leaked a Chromium process on any error.
    await browser.close();
  }
}

2. Handling Infinite Scroll Pages

javascript
/**
 * Scrape a page that loads more content as the user scrolls (infinite scroll).
 * Scrolls to the bottom repeatedly until the document height stops growing,
 * then parses the accumulated HTML with Cheerio.
 * @param {string} url - Address of the infinite-scroll page.
 * @returns {Promise<Array<{title: string, link: string|undefined}>>} All `.item` records.
 */
async function scrapeInfiniteScroll(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    let previousHeight = 0;
    let currentHeight = await page.evaluate('document.body.scrollHeight');

    // Keep scrolling until no new content increases the document height.
    while (currentHeight > previousHeight) {
      previousHeight = currentHeight;
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      // page.waitForTimeout was removed in recent Puppeteer versions;
      // a plain timed Promise is the portable equivalent.
      await new Promise((resolve) => setTimeout(resolve, 2000));
      currentHeight = await page.evaluate('document.body.scrollHeight');
    }

    const html = await page.content();
    const $ = cheerio.load(html);
    return $('.item')
      .map((i, el) => ({
        title: $(el).find('.title').text(),
        link: $(el).find('a').attr('href'),
      }))
      .get();
  } finally {
    // Guarantee cleanup even if navigation or scrolling throws.
    await browser.close();
  }
}

3. Handling Lazy-loaded Images

javascript
/**
 * Scrape images from a page that lazy-loads them via `data-src` attributes.
 * Scrolls through the page to trigger lazy loading, forces any remaining
 * `data-src` images to load, and collects their final `src`/`alt` values.
 * @param {string} url - Address of the page with lazy-loaded images.
 * @returns {Promise<Array<{src: string|undefined, alt: string|undefined}>>}
 */
async function scrapeLazyImages(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Scroll through the page in steps so IntersectionObserver-based
    // lazy loaders fire for each viewport.
    const scrollHeight = await page.evaluate('document.body.scrollHeight');
    for (let offset = 0; offset < scrollHeight; offset += 500) {
      await page.evaluate(`window.scrollTo(0, ${offset})`);
      await new Promise((resolve) => setTimeout(resolve, 500));
    }

    // Force any images still carrying data-src and genuinely wait for them.
    // The original mapped to plain assignments (undefined), so Promise.all
    // resolved immediately without waiting for a single image to load.
    await page.evaluate(async () => {
      const pending = Array.from(document.querySelectorAll('img[data-src]'));
      await Promise.all(
        pending.map(
          (img) =>
            new Promise((resolve) => {
              // Resolve on error too, so one broken image can't hang the scrape.
              img.onload = resolve;
              img.onerror = resolve;
              img.src = img.dataset.src;
            })
        )
      );
    });

    const html = await page.content();
    const $ = cheerio.load(html);
    return $('img')
      .map((i, el) => ({
        src: $(el).attr('src'),
        alt: $(el).attr('alt'),
      }))
      .get();
  } finally {
    await browser.close();
  }
}

4. Handling Click-to-Load More

javascript
/**
 * Scrape a page that reveals more content through a "load more" button.
 * Repeatedly extracts the visible items, clicks the button, and waits for
 * the new batch, until the button disappears or `maxRounds` is reached.
 * @param {string} url - Address of the page to scrape.
 * @param {number} [maxRounds=50] - Safety cap on click iterations, in case the
 *   button never disappears (the original could loop forever).
 * @returns {Promise<Array<{title: string}>>} De-duplicated items by title.
 */
async function scrapeClickToLoad(url, maxRounds = 50) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    const allItems = [];
    // Titles already collected — O(1) duplicate check instead of scanning
    // allItems on every item.
    const seenTitles = new Set();

    for (let round = 0; round < maxRounds; round++) {
      await page.waitForSelector('.item');

      const html = await page.content();
      const $ = cheerio.load(html);
      $('.item').each((index, element) => {
        const title = $(element).find('.title').text();
        if (!seenTitles.has(title)) {
          seenTitles.add(title);
          allItems.push({ title });
        }
      });

      // Stop when the "load more" button is gone.
      const loadMoreButton = await page.$('.load-more');
      if (!loadMoreButton) {
        break;
      }
      await loadMoreButton.click();
      // page.waitForTimeout was removed in recent Puppeteer; use a timed Promise.
      await new Promise((resolve) => setTimeout(resolve, 2000));
    }

    return allItems;
  } finally {
    await browser.close();
  }
}

5. Handling AJAX Requests

javascript
/**
 * Capture data from the page's own AJAX/API responses instead of the DOM.
 * Listens to network responses matching `/api/data`, lets the page issue its
 * requests, and returns the accumulated payloads.
 * @param {string} url - Address of the page whose API traffic to capture.
 * @returns {Promise<Array<unknown>>} Concatenated items from matching JSON responses.
 */
async function scrapeAJAXContent(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    const apiData = [];
    page.on('response', async (response) => {
      // Don't shadow the outer `url` parameter (the original did).
      const responseUrl = response.url();
      if (responseUrl.includes('/api/data')) {
        try {
          const data = await response.json();
          apiData.push(...data);
        } catch (err) {
          // Non-JSON or aborted responses would otherwise raise an
          // unhandled rejection inside the event handler.
          console.log(`Skipping unparsable response from ${responseUrl}:`, err.message);
        }
      }
    });

    await page.goto(url);
    // The page toggles `.data-loaded` once its AJAX cycle finishes —
    // use that as the signal that all interesting responses have arrived.
    await page.waitForSelector('.data-loaded', { timeout: 15000 });

    return apiData;
  } finally {
    // Close the browser even when the readiness selector times out.
    await browser.close();
  }
}

6. Using Playwright as Alternative

Playwright is another powerful browser automation tool:

javascript
const { chromium } = require('playwright');
const cheerio = require('cheerio');

/**
 * Same render-then-parse pattern using Playwright instead of Puppeteer.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<string[]>} Text content of every `.item` element.
 */
async function scrapeWithPlaywright(url) {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    // Playwright's 'networkidle' is the analogue of Puppeteer's 'networkidle2'.
    await page.goto(url, { waitUntil: 'networkidle' });
    await page.waitForSelector('.dynamic-content');
    const html = await page.content();

    const $ = cheerio.load(html);
    return $('.item')
      .map((i, el) => $(el).text())
      .get();
  } finally {
    // Release the browser even if navigation or the selector wait throws —
    // the original leaked the browser on any error.
    await browser.close();
  }
}

7. Performance Optimization Tips

javascript
// 1. Disable unnecessary resource loading
/**
 * Scrape a page while blocking heavy resources (images, fonts, stylesheets)
 * to speed up page loads when only the HTML matters.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<import('cheerio').CheerioAPI>} Cheerio handle over the rendered HTML.
 */
async function optimizedScrape(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Intercept every request and abort the resource types we don't need.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const resourceType = request.resourceType();
      if (['image', 'font', 'stylesheet'].includes(resourceType)) {
        request.abort();
      } else {
        request.continue();
      }
    });
    await page.goto(url);
    const html = await page.content();
    return cheerio.load(html);
  } finally {
    // The original only closed the browser on the success path.
    await browser.close();
  }
}

// 2. Reuse browser instance
/**
 * Reuses a single browser across many scrapes — launching Chromium is the
 * expensive part, so amortizing it across URLs is a large win.
 */
class Scraper {
  constructor() {
    this.browser = null;
  }

  /** Launch the shared browser. Must be called before scrape(). */
  async init() {
    this.browser = await puppeteer.launch();
  }

  /**
   * Scrape one URL using the shared browser.
   * @param {string} url - Address of the page to scrape.
   * @returns {Promise<import('cheerio').CheerioAPI>} Cheerio handle over the HTML.
   * @throws {Error} If init() has not been called yet.
   */
  async scrape(url) {
    if (!this.browser) {
      // Fail with a clear message instead of a null dereference.
      throw new Error('Scraper.init() must be called before scrape()');
    }
    const page = await this.browser.newPage();
    try {
      await page.goto(url);
      const html = await page.content();
      return cheerio.load(html);
    } finally {
      // Close the tab even when navigation fails, or pages pile up.
      await page.close();
    }
  }

  /** Shut down the shared browser. */
  async close() {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }
}

// Usage example
async function main() {
  const scraper = new Scraper();
  await scraper.init();
  try {
    const urls = ['url1', 'url2', 'url3'];
    const results = [];
    for (const url of urls) {
      const $ = await scraper.scrape(url);
      results.push($('.title').text());
    }
    return results;
  } finally {
    // Always shut the shared browser down, even if one scrape fails.
    await scraper.close();
  }
}

8. Error Handling and Retry

javascript
/**
 * Scrape with a bounded retry loop: transient navigation/timeout failures are
 * retried after a short pause; the final failure is rethrown to the caller.
 * @param {string} url - Address of the page to scrape.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<import('cheerio').CheerioAPI>} Cheerio handle over the HTML.
 * @throws The last attempt's error when all retries are exhausted.
 */
async function scrapeWithRetry(url, maxRetries = 3) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    let browser;
    try {
      browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.goto(url, { timeout: 30000 });
      await page.waitForSelector('.content', { timeout: 10000 });
      const html = await page.content();
      return cheerio.load(html);
    } catch (error) {
      console.log(`Attempt ${attempt + 1} failed:`, error.message);
      if (attempt === maxRetries - 1) {
        throw error;
      }
      // Back off briefly before the next attempt.
      await new Promise((resolve) => setTimeout(resolve, 2000));
    } finally {
      // The original only closed the browser on success, so every failed
      // attempt leaked a Chromium process — one per retry.
      if (browser) {
        await browser.close();
      }
    }
  }
}

Summary

Although Cheerio itself cannot handle dynamic content, by combining it with browser automation tools like Puppeteer and Playwright, we can effectively handle various dynamic loading scenarios. The key points are:

  1. Waiting strategies: Use waitForSelector and explicit timed delays to ensure content is loaded (note that page.waitForTimeout has been removed in recent Puppeteer versions — use a plain `setTimeout`-based Promise instead)
  2. Performance optimization: Disable unnecessary resource loading, reuse browser instances
  3. Error handling: Implement retry mechanisms, handle network exceptions
  4. Hybrid usage: Use browser tools to load dynamic content first, then use Cheerio for fast parsing
Tags: NodeJS, Cheerio