What are the common problems when using Cheerio, and how do you solve them? (Interview question)

Cheerio provides a rich API, but developers often encounter some common issues during actual use. Here are common problems and their solutions when using Cheerio:

1. Chinese Character Encoding Issues

Problem Description

When scraping web pages containing Chinese characters, garbled text appears.

Solution

javascript
const axios = require('axios');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');

/**
 * Fetches a page and returns the text of its <title>, decoding the body with
 * the charset declared by the server or by the document itself.
 *
 * The body is requested as raw bytes and decoded exactly once. Charset
 * precedence: Content-Type header, then a <meta> declaration found near the
 * top of the document, then UTF-8. Decoding an already-decoded string a
 * second time corrupts the text, so the raw bytes are never replaced until
 * the final charset is known.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Text content of the page's <title> element.
 */
async function scrapeWithEncoding(url) {
  // Ask for raw bytes so axios does not decode the body as UTF-8 up front.
  const response = await axios.get(url, {
    responseType: 'arraybuffer',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
  });
  const rawBody = Buffer.isBuffer(response.data)
    ? response.data
    : Buffer.from(response.data);

  // 1) Charset announced in the Content-Type header (optionally quoted).
  const contentType = response.headers['content-type'] || '';
  const headerMatch = contentType.match(/charset\s*=\s*["']?([^;"'\s]+)/i);
  let charset = headerMatch ? headerMatch[1] : null;

  // 2) Otherwise look for a <meta> declaration. latin1 maps every byte to one
  //    character, so the ASCII markup is readable whatever the real encoding.
  if (!charset) {
    const head = rawBody.subarray(0, 4096).toString('latin1');
    const $head = cheerio.load(head);
    const legacyMatch = head.match(
      /<meta[^>]+content\s*=\s*["'][^"']*charset\s*=\s*([\w-]+)/i
    );
    charset =
      $head('meta[charset]').attr('charset') ||
      (legacyMatch ? legacyMatch[1] : null);
  }

  // 3) Fall back to UTF-8 when nothing usable was declared.
  if (!charset || !iconv.encodingExists(charset)) {
    charset = 'utf-8';
  }

  const html = iconv.decode(rawBody, charset);
  const $ = cheerio.load(html);
  return $('title').text();
}

2. Selector Not Finding Elements

Problem Description

When using selectors to query, empty results are returned, but the elements actually exist.

Solution

javascript
const cheerio = require('cheerio');

// Sample document: a single <p class="text"> nested directly inside .container.
const html = `
  <div class="container">
    <p class="text">Hello</p>
  </div>
`;

const $ = cheerio.load(html);

// Baseline query (descendant combinator). It matches in this sample; when a
// query like this returns 0 on a real page, work through the checks below.
console.log($('.container p.text').length); // 1

// Solution 1: Check selector syntax. The child combinator (>) only matches
// direct children, so it fails if the real markup has an extra wrapper.
console.log($('.container > p.text').length); // 1

// Solution 2: Use a more relaxed selector (drop the tag name).
console.log($('.container .text').length); // 1

// Solution 3: Debug step by step, widening the selector one part at a time
// to see which part stops matching.
console.log($('.container').length); // 1
console.log($('.container p').length); // 1
console.log($('.container p').hasClass('text')); // true

// Solution 4: Use the :contains() pseudo-class to find elements by their text.
console.log($('p:contains("Hello")').length); // 1

// Solution 5: Check if HTML is loaded correctly by dumping what was parsed.
console.log($.html()); // View complete HTML

3. Unable to Get Dynamic Content

Problem Description

Content dynamically loaded by JavaScript on the page cannot be retrieved.

Solution

javascript
const cheerio = require('cheerio');
const axios = require('axios');

// Problem: Cannot get dynamic content directly with Cheerio
/**
 * Demonstrates the limitation: the markup returned by a plain HTTP request is
 * parsed as-is, so content the page would inject with JavaScript is absent.
 *
 * @returns {Promise<void>}
 */
async function scrapeStatic() {
  const { data: markup } = await axios.get('https://example.com/dynamic');
  const $page = cheerio.load(markup);
  const dynamicText = $page('.dynamic-content').text();
  console.log(dynamicText); // Empty
}

// Solution: Combine with Puppeteer
const puppeteer = require('puppeteer');

/**
 * Renders the page in a headless browser, waits for the dynamic element to
 * appear, then parses the rendered markup with Cheerio.
 *
 * The browser is closed in a `finally` block so a navigation failure or a
 * selector timeout does not leave a browser process running.
 *
 * @returns {Promise<void>}
 */
async function scrapeDynamic() {
  const browser = await puppeteer.launch();
  let html;

  try {
    const page = await browser.newPage();

    await page.goto('https://example.com/dynamic');

    // Wait for dynamic content to load
    await page.waitForSelector('.dynamic-content');

    html = await page.content();
  } finally {
    await browser.close();
  }

  const $ = cheerio.load(html);
  console.log($('.dynamic-content').text()); // Has content
}

// Solution: Call API directly
/**
 * Skips HTML parsing entirely by requesting the endpoint that feeds the page
 * and logging the payload it returns.
 *
 * @returns {Promise<void>}
 */
async function scrapeAPI() {
  const { data } = await axios.get('https://example.com/api/data');
  console.log(data); // Get JSON data directly
}

4. High Memory Usage

Problem Description

When processing large amounts of HTML, memory usage is too high, causing the program to crash.

Solution

javascript
const cheerio = require('cheerio');

// Problem: Process large file at once
/**
 * Parses the whole document and collects the title/content text of every
 * `.item` element in a single pass.
 *
 * @param {string} html - Markup to parse.
 * @returns {Array<{title: string, content: string}>} One record per `.item`.
 */
function processLargeFileBad(html) {
  const $ = cheerio.load(html);
  const readText = (node, selector) => $(node).find(selector).text();

  // Process millions of elements
  return $('.item')
    .map((_, node) => ({
      title: readText(node, '.title'),
      content: readText(node, '.content')
    }))
    .get();
}

// Solution 1: Process in batches
/**
 * Extracts title/content records from every `.item` element, walking the
 * matched set in fixed-size batches.
 *
 * Batching only bounds the size of the per-batch temporaries; the parsed
 * document and the accumulated results still live in memory for the whole
 * call.
 *
 * @param {string} html - Markup to parse.
 * @param {number} [batchSize=1000] - Number of items handled per batch.
 * @returns {Array<{title: string, content: string}>} One record per `.item`.
 * @throws {RangeError} If `batchSize` is not a positive integer.
 */
function processLargeFileGood(html, batchSize = 1000) {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    // A zero or negative step would never advance the loop below.
    throw new RangeError('batchSize must be a positive integer');
  }

  const $ = cheerio.load(html);
  // Query once; re-running the selector per batch rescans the whole document.
  const $items = $('.item');
  const total = $items.length;
  const results = [];

  for (let start = 0; start < total; start += batchSize) {
    const batchData = $items
      .slice(start, start + batchSize)
      .map((_, el) => {
        const $el = $(el);
        return {
          title: $el.find('.title').text(),
          content: $el.find('.content').text()
        };
      })
      .get();

    // Append one by one: spreading a very large batch into push() can exceed
    // the engine's argument limit.
    for (const record of batchData) {
      results.push(record);
    }

    // Force garbage collection (development environment, node --expose-gc)
    if (typeof global.gc === 'function') {
      global.gc();
    }
  }

  return results;
}

// Solution 2: Use stream processing
const fs = require('fs');
const { Transform } = require('stream');

/**
 * Streams a file and extracts title/content records from each complete
 * `<item>…</item>` fragment, so only the unfinished tail of the input is kept
 * in the working buffer between chunks.
 *
 * The file is read as UTF-8 text by the read stream itself, which keeps
 * multi-byte characters intact when they straddle a chunk boundary.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<Array<{title: string, content: string}>>} Resolves with
 *   one record per complete item once the whole file has been consumed;
 *   rejects if the file cannot be read or a fragment fails to process.
 */
function processWithStream(filePath) {
  return new Promise((resolve, reject) => {
    const CLOSING_TAG = '</item>';
    const results = [];
    let buffer = '';

    const transformStream = new Transform({
      // Chunks arrive as decoded strings; do not turn them back into Buffers.
      decodeStrings: false,

      transform(chunk, encoding, callback) {
        try {
          buffer += chunk;

          // Split and process by tags
          const items = buffer.match(/<item[^>]*>[\s\S]*?<\/item>/g) || [];

          for (const item of items) {
            const $ = cheerio.load(item);
            results.push({
              title: $('.title').text(),
              content: $('.content').text()
            });
          }

          // Drop everything up to the last complete item; keep the tail,
          // which may hold the start of an item finished by a later chunk.
          const lastIndex = buffer.lastIndexOf(CLOSING_TAG);
          if (lastIndex !== -1) {
            buffer = buffer.slice(lastIndex + CLOSING_TAG.length);
          }

          callback();
        } catch (err) {
          // Surface processing failures as a stream error instead of throwing
          // out of the stream machinery.
          callback(err);
        }
      },

      flush(callback) {
        resolve(results);
        callback();
      }
    });

    const source = fs.createReadStream(filePath, { encoding: 'utf8' });

    // pipe() does not forward source errors, so each stream is watched
    // separately; otherwise a missing file would leave the promise pending.
    source.on('error', (err) => {
      transformStream.destroy();
      reject(err);
    });
    transformStream.on('error', (err) => {
      source.destroy();
      reject(err);
    });

    source.pipe(transformStream);
  });
}

5. Relative Path Handling Issues

Problem Description

Extracted links are relative paths and cannot be accessed directly.

Solution

javascript
const cheerio = require('cheerio');
const { URL } = require('url');

/**
 * Collects every anchor that has an href and resolves it against a base URL.
 *
 * A malformed href no longer aborts the whole extraction: the link is still
 * reported, with `absoluteUrl` set to `null`. Note that an unusable `baseUrl`
 * makes every resolution fail, so all links then come back with `null`.
 *
 * @param {string} html - Markup to scan.
 * @param {string} baseUrl - URL of the page the markup came from.
 * @returns {Array<{text: string, href: string, absoluteUrl: (string|null)}>}
 *   One entry per `<a href>` in document order.
 */
function resolveLinks(html, baseUrl) {
  const $ = cheerio.load(html);

  return $('a[href]')
    .map((_, el) => {
      const $link = $(el);
      const href = $link.attr('href');

      let absoluteUrl = null;
      try {
        absoluteUrl = new URL(href, baseUrl).href;
      } catch (err) {
        // The URL constructor signals unparsable input with a TypeError;
        // anything else is unexpected and must not be hidden.
        if (!(err instanceof TypeError)) {
          throw err;
        }
      }

      return {
        text: $link.text().trim(),
        href,
        absoluteUrl
      };
    })
    .get();
}

// Usage example: one root-relative, one parent-relative and one absolute link,
// resolved against the URL of the page they were found on.
const html = `
  <a href="/page1">Page 1</a>
  <a href="../page2">Page 2</a>
  <a href="https://example.com/page3">Page 3</a>
`;

// Relative to https://example.com/dir/index.html the three hrefs should
// resolve to https://example.com/page1, https://example.com/page2 and
// https://example.com/page3 respectively.
const links = resolveLinks(html, 'https://example.com/dir/index.html');
console.log(links);

6. Form Data Extraction Issues

Problem Description

Extracting form data becomes complicated when the form contains controls such as checkboxes and multi-select boxes.

Solution

javascript
const cheerio = require('cheerio');

/**
 * Builds a name → value map from the form controls found in the markup.
 *
 * Handled controls: text inputs, checked checkboxes (collected into an array
 * per name), the checked radio button of each group, single and multiple
 * selects, and textareas. Controls without a `name` attribute are skipped,
 * so no entry is ever created under the key "undefined".
 *
 * @param {string} html - Markup containing the form controls.
 * @returns {Object<string, (string|string[]|undefined)>} Values keyed by
 *   control name; checkbox groups and multi-selects yield arrays.
 */
function extractFormData(html) {
  const $ = cheerio.load(html);
  const formData = {};

  // Invokes collect(name, $control) for each matched control that has a name.
  const forEachNamed = (selector, collect) => {
    $(selector).each((_, el) => {
      const $control = $(el);
      const name = $control.attr('name');
      if (name) {
        collect(name, $control);
      }
    });
  };

  // Text input
  forEachNamed('input[type="text"]', (name, $input) => {
    formData[name] = $input.val() || '';
  });

  // Checkbox (multiple selection)
  forEachNamed('input[type="checkbox"]:checked', (name, $box) => {
    // Start a fresh list unless one exists already; a same-named text input
    // may have stored a string here, which has no push().
    if (!Array.isArray(formData[name])) {
      formData[name] = [];
    }
    formData[name].push($box.val());
  });

  // Radio button
  forEachNamed('input[type="radio"]:checked', (name, $radio) => {
    formData[name] = $radio.val();
  });

  // Dropdown selection: one pass covers both single and multi-select boxes.
  forEachNamed('select', (name, $select) => {
    const $selected = $select.find('option:selected');
    formData[name] = $select.is('[multiple]')
      ? $selected.map((_, opt) => $(opt).val()).get()
      : $selected.val();
  });

  // Textarea
  forEachNamed('textarea', (name, $area) => {
    formData[name] = $area.val() || '';
  });

  return formData;
}

7. HTML Entity Encoding Issues

Problem Description

Special characters in HTML are encoded as entities, such as `&nbsp;`, `&amp;`, etc.

Solution

javascript
const cheerio = require('cheerio');

// Sample markup: a bare <div> (no class attribute) containing two entities.
const html = '<div>Hello &amp; World &nbsp; Test</div>';

// Problem: Entities are decoded by default.
// The element is selected with the tag selector `div`; the class selector
// `.div` would match nothing because the element has no class.
const $ = cheerio.load(html);
console.log($('div').text()); // "Hello & World   Test" (&nbsp; becomes U+00A0)

// Solution 1: Disable entity decoding
// NOTE(review): whether this option affects parsing depends on the parser the
// installed cheerio version uses — verify the output below against it.
const $2 = cheerio.load(html, { decodeEntities: false });
console.log($2('div').text()); // "Hello &amp; World &nbsp; Test"

// Solution 2: Manually handle entities
// Decode the text that still contains entities; decoding the already-decoded
// text from `$` would be a no-op.
const he = require('he');
const text = he.decode($2('div').text());
console.log(text); // "Hello & World   Test"

// Solution 3: Use html() method to get raw HTML
const rawHtml = $('div').html();
console.log(rawHtml); // "Hello &amp; World &nbsp; Test"

8. Performance Issues

Problem Description

Poor performance when processing large amounts of data.

Solution

javascript
const cheerio = require('cheerio');

// Problem: Using complex selectors
/**
 * Returns the combined text of links matched through a long chain of
 * descendant combinators.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of the matched elements.
 */
function slowQuery($) {
  const $deepLinks = $('div div div p span a');
  return $deepLinks.text();
}

// Solution 1: Use more specific selectors
/**
 * Returns the combined text of `.link` elements inside `.container`, using a
 * short class-based selector.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of the matched elements.
 */
function fastQuery1($) {
  const $links = $('.container .link');
  return $links.text();
}

// Solution 2: Use find() method
/**
 * Returns the combined text of `.link` elements by selecting `.container`
 * first and then searching only inside it with find().
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of the matched elements.
 */
function fastQuery2($) {
  const linkText = $('.container')
    .find('.link')
    .text();

  return linkText;
}

// Solution 3: Cache selector results
/**
 * Returns the combined text of `.link` elements, holding the `.container`
 * selection in a local so it can be reused without querying again.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Combined text of the matched elements.
 */
function fastQuery3($) {
  const cachedContainer = $('.container');
  const links = cachedContainer.find('.link');
  return links.text();
}

// Solution 4: Use native methods
/**
 * Returns the text of the first `.link` inside the first `.container`.
 *
 * Cheerio nodes are parser objects, not browser DOM elements, so they have no
 * `querySelector` or `textContent`; the lookup is done with Cheerio's own
 * traversal methods instead. An empty string is returned when nothing
 * matches.
 *
 * @param {Function} $ - Loaded Cheerio instance.
 * @returns {string} Text of the first matching link, or '' if there is none.
 */
function fastestQuery($) {
  return $('.container')
    .first()
    .find('.link')
    .first()
    .text();
}

9. Whitespace Character Handling

Problem Description

Extracted text contains a lot of whitespace characters.

Solution

javascript
const cheerio = require('cheerio');

// Sample document: the paragraph text is spread over indented lines, so the
// raw text carries the newlines and indentation of the source markup.
const html = `
  <div>
    <p>
      Hello
      World
    </p>
  </div>
`;

const $ = cheerio.load(html);

// Problem: Contains a lot of whitespace
console.log($('p').text()); // "\n      Hello\n      World\n    "

// Solution 1: Use trim(). Only the leading/trailing whitespace is removed;
// the newline and indentation between the words remain.
console.log($('p').text().trim()); // "Hello\n      World"

// Solution 2: Use regex replacement to collapse every whitespace run into a
// single space, then trim the ends.
console.log($('p').text().replace(/\s+/g, ' ').trim()); // "Hello World"

// Solution 3: Use normalizeWhitespace option
// NOTE(review): this parser option has presumably been dropped from recent
// cheerio/htmlparser2 releases, and where it exists it collapses whitespace
// runs without trimming the ends — confirm against the installed version
// before relying on the output shown below.
const $2 = cheerio.load(html, { normalizeWhitespace: true });
console.log($2('p').text()); // "Hello World"

// Solution 4: Custom cleanup function
/**
 * Collapses every run of whitespace (spaces, tabs, newlines, non-breaking
 * spaces, …) into a single space and strips the ends.
 *
 * A single `\s+` pass is enough: newlines and tabs are whitespace too, so a
 * separate replacement for them adds nothing.
 *
 * @param {string} text - Raw text, e.g. the result of `.text()`.
 * @returns {string} The text with normalized whitespace.
 */
function cleanText(text) {
  return text
    .replace(/\s+/g, ' ') // Merge any whitespace run into one space
    .trim();              // Remove leading and trailing spaces
}

// Apply the cleanup helper to the raw paragraph text.
console.log(cleanText($('p').text())); // "Hello World"

10. XML Parsing Issues

Problem Description

Problems occur when parsing XML documents.

Solution

javascript
const cheerio = require('cheerio');

// Sample feed: two <item> elements, each with an id attribute and a <name> child.
const xml = `
  <root>
    <item id="1">
      <name>Item 1</name>
    </item>
    <item id="2">
      <name>Item 2</name>
    </item>
  </root>
`;

// Solution: parse in XML mode, with entity decoding turned off.
const $ = cheerio.load(xml, {
  xmlMode: true,
  decodeEntities: false
});

// Extract data: build one { id, name } record per <item>, in document order.
const items = $('item')
  .map((_, node) => {
    const $item = $(node);
    return {
      id: $item.attr('id'),
      name: $item.find('name').text()
    };
  })
  .get();

console.log(items);

By mastering the solutions to these common problems, you can use Cheerio more effectively for HTML/XML parsing and data extraction.