乐闻世界logo
搜索文章和话题

What are the common problems when using Cheerio? How to solve these problems?

2月22日 14:30

Cheerio provides a rich API, but developers still run into a handful of recurring issues in practice. The most common problems and their solutions are listed below:

1. Chinese Character Encoding Issues

Problem Description

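When scraping web pages containing Chinese characters, garbled text appears if the page is served in a non-UTF-8 encoding such as GBK or GB2312 and the response is decoded as UTF-8.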

Solution

javascript
const axios = require('axios');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');

/**
 * Cleans up a charset label and returns it only when iconv-lite can decode it.
 *
 * @param {string|undefined} label - Raw charset label, possibly quoted or padded.
 * @returns {string|null} Lower-cased charset name, or null when missing/unsupported.
 */
function normalizeCharset(label) {
  if (!label) {
    return null;
  }
  const charset = String(label)
    .trim()
    .replace(/^["']|["']$/g, '')
    .toLowerCase();
  return charset && iconv.encodingExists(charset) ? charset : null;
}

/**
 * Extracts the charset from a Content-Type style value such as
 * "text/html; charset=gbk".
 *
 * @param {string|undefined} contentType - Header or meta `content` value.
 * @returns {string|null} Supported charset name, or null.
 */
function charsetFromContentType(contentType) {
  const match = /charset=([^;]+)/i.exec(contentType || '');
  return match ? normalizeCharset(match[1]) : null;
}

/**
 * Looks for a charset declaration in the document's <meta> tags.
 *
 * @param {Buffer} rawBuffer - Undecoded response body.
 * @returns {string|null} Supported charset name, or null.
 */
function charsetFromMeta(rawBuffer) {
  // latin1 maps every byte to one character, so the ASCII-only <meta> markup
  // stays readable no matter what the document's real encoding is.
  const $probe = cheerio.load(rawBuffer.toString('latin1'));

  const declared = normalizeCharset($probe('meta[charset]').attr('charset'));
  if (declared) {
    return declared;
  }

  // Legacy form: <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
  const legacyContent = $probe('meta[http-equiv]')
    .filter((i, el) => ($probe(el).attr('http-equiv') || '').toLowerCase() === 'content-type')
    .first()
    .attr('content');
  return charsetFromContentType(legacyContent);
}

/**
 * Downloads a page, decodes it with the charset it declares, and returns the
 * text of its <title>.
 *
 * The body is fetched as raw bytes and decoded exactly once. The charset is
 * taken from the Content-Type header first, then from the document's <meta>
 * tags, and falls back to UTF-8.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<string>} Text content of the page's <title> element.
 */
async function scrapeWithEncoding(url) {
  // Request raw bytes so axios does not decode the body as UTF-8 for us.
  const response = await axios.get(url, {
    responseType: 'arraybuffer',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
  });

  // Keep the original bytes: decoding an already-decoded string a second time
  // corrupts the text.
  const rawBuffer = Buffer.from(response.data);

  const charset =
    charsetFromContentType(response.headers['content-type']) ||
    charsetFromMeta(rawBuffer) ||
    'utf-8';

  const html = iconv.decode(rawBuffer, charset);
  const $ = cheerio.load(html);
  return $('title').text();
}

2. Selector Not Finding Elements

Problem Description

A selector query returns an empty result even though the matching elements exist in the page.

Solution

javascript
const cheerio = require('cheerio');

const html = `
  <div class="container">
    <p class="text">Hello</p>
  </div>
`;

const $ = cheerio.load(html);

// Problem: Wrong selector. `container` is a class, not an id, so nothing matches.
console.log($('#container p.text').length); // 0

// Solution 1: Check selector syntax
console.log($('.container p.text').length); // 1
console.log($('.container > p.text').length); // 1

// Solution 2: Use more relaxed selector
console.log($('.container .text').length); // 1

// Solution 3: Debug step by step, widening the selector one part at a time
console.log($('.container').length); // 1
console.log($('.container p').length); // 1
console.log($('.container p').hasClass('text')); // true

// Solution 4: Use :contains() to find elements containing text
console.log($('p:contains("Hello")').length); // 1

// Solution 5: Check if HTML is loaded correctly
console.log($.html()); // View complete HTML

3. Unable to Get Dynamic Content

Problem Description

Content that the page loads dynamically with JavaScript cannot be retrieved, because Cheerio only parses the HTML it is given and does not execute scripts.

Solution

javascript
const cheerio = require('cheerio');
const axios = require('axios');
const puppeteer = require('puppeteer');

/**
 * Problem: fetches the page's static HTML and parses it with Cheerio.
 * Content injected later by client-side JavaScript is not in that HTML,
 * so the selector finds nothing.
 */
async function scrapeStatic() {
  const response = await axios.get('https://example.com/dynamic');
  const $ = cheerio.load(response.data);

  console.log($('.dynamic-content').text()); // Empty
}

/**
 * Solution: render the page with Puppeteer, wait for the dynamic content,
 * then hand the rendered HTML to Cheerio.
 */
async function scrapeDynamic() {
  const browser = await puppeteer.launch();
  let html;

  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/dynamic');

    // Wait for dynamic content to load
    await page.waitForSelector('.dynamic-content');

    html = await page.content();
  } finally {
    // Close the browser even when navigation or the wait fails, so no
    // browser process is left running.
    await browser.close();
  }

  const $ = cheerio.load(html);
  console.log($('.dynamic-content').text()); // Has content
}

/**
 * Solution: call the API that backs the page directly and skip HTML parsing.
 */
async function scrapeAPI() {
  const response = await axios.get('https://example.com/api/data');
  const data = response.data;

  console.log(data); // Get JSON data directly
}

4. High Memory Usage

Problem Description

When processing large amounts of HTML, memory usage is too high, causing the program to crash.

Solution

javascript
const cheerio = require('cheerio');
const fs = require('fs');
const { Transform } = require('stream');
const { StringDecoder } = require('string_decoder');

/**
 * Problem: extracts every `.item` in a single pass.
 *
 * @param {string} html - Complete HTML document.
 * @returns {Array<{title: string, content: string}>} One record per `.item`.
 */
function processLargeFileBad(html) {
  const $ = cheerio.load(html);
  const results = [];

  // Process millions of elements
  $('.item').each((i, el) => {
    results.push({
      title: $(el).find('.title').text(),
      content: $(el).find('.content').text()
    });
  });

  return results;
}

/**
 * Solution 1: extracts `.item` records in batches of 1000.
 *
 * The whole document is still parsed by `cheerio.load`, so batching only
 * limits the size of the temporary per-batch collections.
 *
 * @param {string} html - Complete HTML document.
 * @returns {Array<{title: string, content: string}>} One record per `.item`.
 */
function processLargeFileGood(html) {
  const $ = cheerio.load(html);
  const batchSize = 1000;
  // Query once; running the selector again for every batch rescans the document.
  const $items = $('.item');
  const total = $items.length;
  const results = [];

  for (let i = 0; i < total; i += batchSize) {
    const batchData = $items
      .slice(i, i + batchSize)
      .map((j, el) => ({
        title: $(el).find('.title').text(),
        content: $(el).find('.content').text()
      }))
      .get();

    results.push(...batchData);

    // The per-batch collection goes out of scope here, so no manual cleanup
    // is needed.
    // Force garbage collection (development environment, needs --expose-gc)
    if (global.gc) {
      global.gc();
    }
  }

  return results;
}

/**
 * Solution 2: streams a file and parses each complete `<item>…</item>`
 * fragment as soon as it has been read.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<Array<{title: string, content: string}>>} Resolves with one
 *   record per complete item once the file has been consumed; rejects on a
 *   read or parse error.
 */
function processWithStream(filePath) {
  return new Promise((resolve, reject) => {
    const results = [];
    const closingTag = '</item>';
    const itemPattern = /<item[^>]*>[\s\S]*?<\/item>/g;
    // Decodes across chunk boundaries so a multi-byte character split between
    // two chunks is not turned into replacement characters.
    const decoder = new StringDecoder('utf8');
    let buffer = '';

    const transformStream = new Transform({
      transform(chunk, encoding, callback) {
        try {
          buffer += decoder.write(chunk);

          // Split and process by tags
          const items = buffer.match(itemPattern) || [];
          items.forEach(item => {
            const $ = cheerio.load(item);
            results.push({
              title: $('.title').text(),
              content: $('.content').text()
            });
          });

          // Drop everything up to the last complete item; keep the partial tail.
          const lastIndex = buffer.lastIndexOf(closingTag);
          if (lastIndex !== -1) {
            buffer = buffer.slice(lastIndex + closingTag.length);
          }

          callback();
        } catch (err) {
          callback(err);
        }
      },

      flush(callback) {
        resolve(results);
        callback();
      }
    });

    // pipe() does not forward errors, so both streams need their own handler.
    fs.createReadStream(filePath)
      .on('error', reject)
      .pipe(transformStream)
      .on('error', reject);
  });
}

5. Relative Path Handling Issues

Problem Description

Extracted links are relative paths and cannot be accessed directly.

Solution

javascript
const cheerio = require('cheerio');
const { URL } = require('url');

/**
 * Collects every `<a href>` in the document and resolves it against a base URL.
 *
 * @param {string} html - HTML to scan.
 * @param {string} baseUrl - Absolute URL of the page the HTML came from.
 * @returns {Array<{text: string, href: string, absoluteUrl: (string|null)}>}
 *   One entry per link. `absoluteUrl` is null when the href cannot be parsed.
 */
function resolveLinks(html, baseUrl) {
  const $ = cheerio.load(html);
  const links = [];

  $('a[href]').each((i, el) => {
    const href = $(el).attr('href');

    let absoluteUrl = null;
    try {
      absoluteUrl = new URL(href, baseUrl).href;
    } catch (err) {
      // `new URL` throws a TypeError for malformed input. Keep the link with a
      // null absoluteUrl so one bad href does not abort the whole extraction.
      if (!(err instanceof TypeError)) {
        throw err;
      }
    }

    links.push({
      text: $(el).text().trim(),
      href: href,
      absoluteUrl: absoluteUrl
    });
  });

  return links;
}

// Usage example
const html = `
  <a href="/page1">Page 1</a>
  <a href="../page2">Page 2</a>
  <a href="https://example.com/page3">Page 3</a>
`;

const links = resolveLinks(html, 'https://example.com/dir/index.html');
console.log(links);

6. Form Data Extraction Issues

Problem Description

Extracting form data becomes complicated when the form contains checkboxes, radio buttons, and multi-select boxes.

Solution

javascript
const cheerio = require('cheerio');

/**
 * Extracts the current values of the form controls found in an HTML string.
 *
 * Controls without a `name` attribute are skipped. Checked checkboxes and
 * multi-select dropdowns produce arrays; every other control produces a
 * single value.
 *
 * @param {string} html - HTML containing the form controls.
 * @returns {Object} Map of control name to value (string, undefined, or array).
 */
function extractFormData(html) {
  const $ = cheerio.load(html);
  const formData = {};

  // Runs `handler(name, $el)` for each matched control that has a name.
  // Unnamed controls would otherwise all be stored under the key "undefined".
  const forEachNamed = (selector, handler) => {
    $(selector).each((i, el) => {
      const $el = $(el);
      const name = $el.attr('name');
      if (name) {
        handler(name, $el);
      }
    });
  };

  // Checkboxes and radio buttons without a value attribute submit "on".
  const checkedValue = ($el) => {
    const value = $el.val();
    return value === undefined ? 'on' : value;
  };

  // Text input (an <input> without a type attribute is a text input too)
  forEachNamed('input[type="text"], input:not([type])', (name, $el) => {
    formData[name] = $el.val() || '';
  });

  // Checkbox (multiple selection)
  forEachNamed('input[type="checkbox"]:checked', (name, $el) => {
    // Another control may already have stored a non-array under this name.
    if (!Array.isArray(formData[name])) {
      formData[name] = [];
    }
    formData[name].push(checkedValue($el));
  });

  // Radio button
  forEachNamed('input[type="radio"]:checked', (name, $el) => {
    formData[name] = checkedValue($el);
  });

  // Dropdown selection (single)
  forEachNamed('select:not([multiple])', (name, $el) => {
    formData[name] = $el.find('option:selected').val();
  });

  // Multi-select dropdown
  forEachNamed('select[multiple]', (name, $el) => {
    formData[name] = $el
      .find('option:selected')
      .map((j, opt) => $(opt).val())
      .get();
  });

  // Textarea
  forEachNamed('textarea', (name, $el) => {
    formData[name] = $el.val() || '';
  });

  return formData;
}

7. HTML Entity Encoding Issues

Problem Description

Special characters in HTML are encoded, such as &nbsp;, &amp;, etc.

Solution

javascript
const cheerio = require('cheerio');
const he = require('he');

const html = '<div>Hello &amp; World &nbsp; Test</div>';

// Problem: Entities are decoded by default.
// The element is selected by tag name (`div`); `.div` would look for a class
// and match nothing.
const $ = cheerio.load(html);
console.log($('div').text()); // "Hello & World \u00a0 Test" (&nbsp; becomes U+00A0)

// Solution 1: Disable entity decoding
// NOTE(review): `decodeEntities` is an htmlparser2 option. Confirm that the
// parser used by the installed cheerio version honors it.
const $2 = cheerio.load(html, { decodeEntities: false });
console.log($2('div').text()); // "Hello &amp; World &nbsp; Test"

// Solution 2: Use html() method to get the markup with entities still encoded
const rawHtml = $('div').html();
console.log(rawHtml); // "Hello &amp; World &nbsp; Test"

// Solution 3: Manually decode entities from still-encoded markup.
// Decoding text() output would be a no-op because it is already decoded.
const text = he.decode(rawHtml);
console.log(text); // "Hello & World \u00a0 Test"

8. Performance Issues

Problem Description

Poor performance when processing large amounts of data.

Solution

javascript
const cheerio = require('cheerio');

/**
 * Problem: a long descendant chain forces the selector engine to check many
 * ancestor combinations.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of all matched links.
 */
function slowQuery($) {
  return $('div div div p span a').text();
}

/**
 * Solution 1: Use more specific selectors.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of all `.link` elements inside `.container`.
 */
function fastQuery1($) {
  return $('.container .link').text();
}

/**
 * Solution 2: Use find() method to search within an already-matched subtree.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of all `.link` elements inside `.container`.
 */
function fastQuery2($) {
  return $('.container').find('.link').text();
}

/**
 * Solution 3: Cache selector results so the container lookup can be reused.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of all `.link` elements inside `.container`.
 */
function fastQuery3($) {
  const $container = $('.container');
  return $container.find('.link').text();
}

/**
 * Solution 4: Narrow the work to the first container and its first link.
 *
 * Cheerio nodes are not browser DOM elements: they have no `querySelector`
 * or `textContent`, so the lookup has to stay within the Cheerio API.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Text of the first `.link` in the first `.container`, or an
 *   empty string when either is missing.
 */
function fastestQuery($) {
  return $('.container').first().find('.link').first().text();
}

9. Whitespace Character Handling

Problem Description

Extracted text contains a lot of whitespace characters.

Solution

javascript
const cheerio = require('cheerio');

const html = `
  <div>
    <p>
      Hello
      World
    </p>
  </div>
`;

const $ = cheerio.load(html);

// Problem: Contains a lot of whitespace
console.log($('p').text()); // "\n      Hello\n      World\n    "

// Solution 1: Use trim() (only strips the ends; inner newlines remain)
console.log($('p').text().trim()); // "Hello\n      World"

// Solution 2: Use regex replacement
console.log($('p').text().replace(/\s+/g, ' ').trim()); // "Hello World"

// Solution 3: Use normalizeWhitespace option
// NOTE(review): this option collapses whitespace runs but does not trim, so
// trim() is still required. It is deprecated and looks to have been removed
// from newer cheerio releases. Confirm against the installed version, and
// prefer Solution 2 or 4 if it is gone.
const $2 = cheerio.load(html, { normalizeWhitespace: true });
console.log($2('p').text().trim()); // "Hello World" when the option is supported

/**
 * Solution 4: Custom cleanup function.
 *
 * @param {string} text - Raw text extracted from the document.
 * @returns {string} Text with every whitespace run (spaces, tabs, newlines)
 *   collapsed to one space and the ends trimmed.
 */
function cleanText(text) {
  return text
    .replace(/\s+/g, ' ') // \s already covers \r, \n and \t
    .trim(); // Remove leading and trailing spaces
}

console.log(cleanText($('p').text())); // "Hello World"

10. XML Parsing Issues

Problem Description

Problems occur when parsing XML documents.

Solution

javascript
const cheerio = require('cheerio');

const xml = ` <root> <item id="1"> <name>Item 1</name> </item> <item id="2"> <name>Item 2</name> </item> </root> `;

// Parse as XML rather than HTML and leave entities undecoded.
const parserOptions = { xmlMode: true, decodeEntities: false };
const $ = cheerio.load(xml, parserOptions);

// Build one { id, name } record per <item> element.
const items = $('item')
  .map((index, node) => {
    const $node = $(node);
    return {
      id: $node.attr('id'),
      name: $node.find('name').text()
    };
  })
  .get();

console.log(items);

By mastering the solutions to these common problems, you can use Cheerio more effectively for HTML/XML parsing and data extraction.

标签:NodeJS、Cheerio