Cheerio can be extended with plugins that add custom methods to its prototype. Here's a complete guide to Cheerio plugin development:
1. Cheerio Plugin Basics
Plugin Structure
A Cheerio plugin is essentially a function that receives the Cheerio constructor (the `cheerio` export) as a parameter and adds methods to its prototype, which makes them available on every selection:
javascript// Basic plugin structure module.exports = function(cheerio) { // Extend Cheerio prototype cheerio.prototype.myMethod = function(selector) { // Plugin logic return this; }; };
Simple Plugin Example
javascript// my-plugin.js module.exports = function(cheerio) { // Add a method to get all text content cheerio.prototype.getAllText = function() { return this.map((i, el) => cheerio(el).text()).get(); }; // Add a method to filter empty elements cheerio.prototype.filterEmpty = function() { return this.filter((i, el) => { return cheerio(el).text().trim().length > 0; }); }; };
2. Using Plugins
Installing and Loading Plugins
const cheerio = require('cheerio');
const myPlugin = require('./my-plugin');

// Load plugin
// NOTE(review): confirm that your Cheerio version exposes `cheerio.use()`.
// The official "Extending Cheerio" guide describes assigning methods to
// `$.prototype` / `$.fn` instead; calling `myPlugin(cheerio)` directly is
// equivalent for a plugin that is a plain function.
cheerio.use(myPlugin);

// Use methods provided by plugin
const $ = cheerio.load('<div><p>Hello</p><p></p></div>');

// .map() only skips null/undefined results, so the empty <p> contributes ''.
console.log($('p').getAllText()); // ['Hello', '']
console.log($('p').filterEmpty().length); // 1
Chaining Calls
// Plugin methods support chaining
const result = $('p')
  .filterEmpty() // returns a Cheerio selection, so more Cheerio methods can follow
  .getAllText() // returns a plain array, which ends the Cheerio chain
  .map((text) => text.toUpperCase()); // this is Array.prototype.map
3. Practical Plugin Examples
1. Text Cleaning Plugin
javascript// text-cleaner.js module.exports = function(cheerio) { // Clean extra whitespace in text cheerio.prototype.cleanText = function() { return this.each((i, el) => { const $el = cheerio(el); const text = $el.text() .replace(/\s+/g, ' ') .trim(); $el.text(text); }); }; // Remove specified tags cheerio.prototype.removeTags = function(tags) { const tagArray = Array.isArray(tags) ? tags : [tags]; return this.each((i, el) => { const $el = cheerio(el); tagArray.forEach(tag => { $el.find(tag).remove(); }); }); }; };
2. Data Extraction Plugin
javascript// data-extractor.js module.exports = function(cheerio) { // Extract table data as 2D array cheerio.prototype.tableToArray = function() { const result = []; this.find('tr').each((i, row) => { const rowData = []; cheerio(row).find('td, th').each((j, cell) => { rowData.push(cheerio(cell).text().trim()); }); result.push(rowData); }); return result; }; // Extract table data as object array cheerio.prototype.tableToObjects = function() { const $ = cheerio(this); const headers = []; const result = []; // Extract headers $find('thead th, tr:first-child td').each((i, th) => { headers.push(cheerio(th).text().trim()); }); // Extract data rows $find('tbody tr, tr:not(:first-child)').each((i, row) => { const obj = {}; cheerio(row).find('td').each((j, td) => { const key = headers[j] || `col_${j}`; obj[key] = cheerio(td).text().trim(); }); result.push(obj); }); return result; }; };
3. URL Handling Plugin
javascript// url-handler.js const { URL } = require('url'); module.exports = function(cheerio) { // Convert relative URLs to absolute URLs cheerio.prototype.resolveUrls = function(baseUrl) { return this.each((i, el) => { const $el = cheerio(el); const href = $el.attr('href'); const src = $el.attr('src'); if (href) { $el.attr('href', new URL(href, baseUrl).href); } if (src) { $el.attr('src', new URL(src, baseUrl).href); } }); }; // Extract all links cheerio.prototype.extractLinks = function() { const links = []; this.find('a[href]').each((i, el) => { const $el = cheerio(el); links.push({ text: $el.text().trim(), href: $el.attr('href'), title: $el.attr('title') }); }); return links; }; };
4. Image Handling Plugin
javascript// image-handler.js module.exports = function(cheerio) { // Extract all image information cheerio.prototype.extractImages = function() { const images = []; this.find('img').each((i, el) => { const $el = cheerio(el); images.push({ src: $el.attr('src'), alt: $el.attr('alt'), title: $el.attr('title'), width: $el.attr('width'), height: $el.attr('height') }); }); return images; }; // Lazy-loaded image handling cheerio.prototype.handleLazyImages = function() { return this.each((i, el) => { const $el = cheerio(el); const dataSrc = $el.attr('data-src'); if (dataSrc) { $el.attr('src', dataSrc); } }); }; };
4. Advanced Plugin Development
Configurable Plugin
javascript// configurable-plugin.js module.exports = function(cheerio, options = {}) { const defaultOptions = { trim: true, removeEmpty: true, maxLength: 1000 }; const opts = { ...defaultOptions, ...options }; cheerio.prototype.smartExtract = function() { return this.map((i, el) => { let text = cheerio(el).text(); if (opts.trim) { text = text.trim(); } if (opts.removeEmpty && text.length === 0) { return null; } if (opts.maxLength && text.length > opts.maxLength) { text = text.substring(0, opts.maxLength) + '...'; } return text; }).filter(text => text !== null); }; }; // Usage cheerio.use(configurablePlugin, { trim: true, removeEmpty: true, maxLength: 500 });
Async Plugin
javascript// async-plugin.js module.exports = function(cheerio) { cheerio.prototype.fetchContent = async function(url) { const axios = require('axios'); const response = await axios.get(url); return cheerio.load(response.data); }; cheerio.prototype.batchProcess = async function(processor) { const results = []; for (let i = 0; i < this.length; i++) { const result = await processor(cheerio(this[i])); results.push(result); } return results; }; };
5. Plugin Publishing
package.json Configuration
{
  "name": "cheerio-my-plugin",
  "version": "1.0.0",
  "description": "My Cheerio plugin",
  "main": "index.js",
  "keywords": [
    "cheerio",
    "plugin",
    "html",
    "parser"
  ],
  "peerDependencies": {
    "cheerio": ">=1.0.0"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/username/cheerio-my-plugin"
  },
  "license": "MIT"
}
Plugin Testing
// test.js
const cheerio = require('cheerio');
const myPlugin = require('./my-plugin');

// Uses the describe/test/expect globals provided by the test runner.
describe('My Plugin', () => {
  beforeEach(() => {
    cheerio.use(myPlugin);
  });

  test('should filter empty elements', () => {
    const $ = cheerio.load('<div><p>Hello</p><p></p></div>');
    const result = $('p').filterEmpty();

    expect(result.length).toBe(1);
    expect(result.text()).toBe('Hello');
  });

  test('should get all text', () => {
    const $ = cheerio.load('<div><p>Hello</p><p>World</p></div>');
    const result = $('p').getAllText();

    expect(result).toEqual(['Hello', 'World']);
  });
});
6. Existing Popular Plugins
cheerio-tableparser
const cheerio = require('cheerio');
const tableParser = require('cheerio-tableparser');

// `html` is the markup string to parse.
const $ = cheerio.load(html);

// cheerio-tableparser is applied to a loaded document: calling it with `$`
// adds the .parsetable() method to selections made from that document.
tableParser($);

const tableData = $('table').parsetable();
cheerio-select
const cheerio = require('cheerio');
const select = require('cheerio-select');

// NOTE(review): cheerio-select appears to be the CSS selector engine that
// Cheerio itself depends on rather than a plugin. Confirm that it can be
// registered with `cheerio.use()` and that it adds `$.select()` before
// relying on this snippet. The same selector can be passed to Cheerio
// directly: $('div > p:first-child').
cheerio.use(select);

// `html` is the markup string to parse.
const $ = cheerio.load(html);
const elements = $.select('div > p:first-child');
7. Best Practices
1. Naming Conventions
// ✅ Good naming: the method name says what it does
cheerio.prototype.extractLinks = function () {};
cheerio.prototype.cleanText = function () {};

// ❌ Bad naming: the method name says nothing about the behavior
cheerio.prototype.doSomething = function () {};
cheerio.prototype.method1 = function () {};
2. Return Value Handling
// ✅ Support chaining: return the current selection
cheerio.prototype.myMethod = function () {
  // Processing logic
  return this;
};

// ✅ Return new collection: hand back what .filter() produced
cheerio.prototype.myFilter = function () {
  const filtered = this.filter(/* condition */);
  return filtered;
};
3. Error Handling
/**
 * Extract the text of every matched element without letting a failure escape.
 *
 * @returns {string[]} The text of each matched element, or an empty array
 *   when extraction throws (the error is logged with console.error).
 */
cheerio.prototype.safeExtract = function () {
  try {
    // Extraction logic
    return this.map((i, el) => cheerio(el).text()).get();
  } catch (error) {
    console.error('Extraction failed:', error);
    return [];
  }
};
4. Documentation and Comments
/**
 * Extract all link information
 * @param {Object} [options={}] - Configuration options
 * @param {boolean} [options.resolveAbsolute] - Whether to convert to absolute URL
 * @param {string} [options.baseUrl] - Base URL
 * @returns {Array} Array of link information
 */
cheerio.prototype.extractLinks = function (options = {}) {
  // Implementation
};
By developing and using Cheerio plugins, you can greatly extend Cheerio's functionality and improve development efficiency.