Cheerio 可以通过插件(即扩展其原型的函数)来扩展功能。以下是 Cheerio 插件开发的完整指南:
1. Cheerio 插件基础
插件结构
Cheerio 插件本质上是一个函数,它接收 Cheerio 实例(通常是 cheerio.load() 返回的 $)作为参数,并扩展其原型:
javascript// 基本插件结构 module.exports = function(cheerio) { // 扩展 Cheerio 原型 cheerio.prototype.myMethod = function(selector) { // 插件逻辑 return this; }; };
简单插件示例
javascript// my-plugin.js module.exports = function(cheerio) { // 添加一个获取所有文本内容的方法 cheerio.prototype.getAllText = function() { return this.map((i, el) => cheerio(el).text()).get(); }; // 添加一个过滤空元素的方法 cheerio.prototype.filterEmpty = function() { return this.filter((i, el) => { return cheerio(el).text().trim().length > 0; }); }; };
2. 使用插件
安装和加载插件
const cheerio = require('cheerio');
const myPlugin = require('./my-plugin');

const $ = cheerio.load('<div><p>Hello</p><p></p></div>');

// Install the plugin. Cheerio has no `cheerio.use()`: a plugin is applied by
// calling it with the loaded `$`, whose prototype it extends.
myPlugin($);

// Use the methods the plugin provides.
console.log($('p').getAllText()); // ['Hello', ''] - the empty <p> yields ''
console.log($('p').filterEmpty().length); // 1
链式调用
// Plugin methods that return a selection can be chained. getAllText() ends the
// chain with a plain array, so the final map() is Array.prototype.map.
const result = $('p')
  .filterEmpty()
  .getAllText()
  .map((text) => text.toUpperCase());
3. 实用插件示例
1. 文本清理插件
javascript// text-cleaner.js module.exports = function(cheerio) { // 清理文本中的多余空白 cheerio.prototype.cleanText = function() { return this.each((i, el) => { const $el = cheerio(el); const text = $el.text() .replace(/\s+/g, ' ') .trim(); $el.text(text); }); }; // 移除指定标签 cheerio.prototype.removeTags = function(tags) { const tagArray = Array.isArray(tags) ? tags : [tags]; return this.each((i, el) => { const $el = cheerio(el); tagArray.forEach(tag => { $el.find(tag).remove(); }); }); }; };
2. 数据提取插件
javascript// data-extractor.js module.exports = function(cheerio) { // 提取表格数据为二维数组 cheerio.prototype.tableToArray = function() { const result = []; this.find('tr').each((i, row) => { const rowData = []; cheerio(row).find('td, th').each((j, cell) => { rowData.push(cheerio(cell).text().trim()); }); result.push(rowData); }); return result; }; // 提取表格数据为对象数组 cheerio.prototype.tableToObjects = function() { const $ = cheerio(this); const headers = []; const result = []; // 提取表头 $.find('thead th, tr:first-child td').each((i, th) => { headers.push(cheerio(th).text().trim()); }); // 提取数据行 $find('tbody tr, tr:not(:first-child)').each((i, row) => { const obj = {}; cheerio(row).find('td').each((j, td) => { const key = headers[j] || `col_${j}`; obj[key] = cheerio(td).text().trim(); }); result.push(obj); }); return result; }; };
3. URL 处理插件
javascript// url-handler.js const { URL } = require('url'); module.exports = function(cheerio) { // 将相对 URL 转换为绝对 URL cheerio.prototype.resolveUrls = function(baseUrl) { return this.each((i, el) => { const $el = cheerio(el); const href = $el.attr('href'); const src = $el.attr('src'); if (href) { $el.attr('href', new URL(href, baseUrl).href); } if (src) { $el.attr('src', new URL(src, baseUrl).href); } }); }; // 提取所有链接 cheerio.prototype.extractLinks = function() { const links = []; this.find('a[href]').each((i, el) => { const $el = cheerio(el); links.push({ text: $el.text().trim(), href: $el.attr('href'), title: $el.attr('title') }); }); return links; }; };
4. 图片处理插件
javascript// image-handler.js module.exports = function(cheerio) { // 提取所有图片信息 cheerio.prototype.extractImages = function() { const images = []; this.find('img').each((i, el) => { const $el = cheerio(el); images.push({ src: $el.attr('src'), alt: $el.attr('alt'), title: $el.attr('title'), width: $el.attr('width'), height: $el.attr('height') }); }); return images; }; // 懒加载图片处理 cheerio.prototype.handleLazyImages = function() { return this.each((i, el) => { const $el = cheerio(el); const dataSrc = $el.attr('data-src'); if (dataSrc) { $el.attr('src', dataSrc); } }); }; };
4. 高级插件开发
带配置的插件
javascript// configurable-plugin.js module.exports = function(cheerio, options = {}) { const defaultOptions = { trim: true, removeEmpty: true, maxLength: 1000 }; const opts = { ...defaultOptions, ...options }; cheerio.prototype.smartExtract = function() { return this.map((i, el) => { let text = cheerio(el).text(); if (opts.trim) { text = text.trim(); } if (opts.removeEmpty && text.length === 0) { return null; } if (opts.maxLength && text.length > opts.maxLength) { text = text.substring(0, opts.maxLength) + '...'; } return text; }).filter(text => text !== null); }; }; // 使用 cheerio.use(configurablePlugin, { trim: true, removeEmpty: true, maxLength: 500 });
异步插件
javascript// async-plugin.js module.exports = function(cheerio) { cheerio.prototype.fetchContent = async function(url) { const axios = require('axios'); const response = await axios.get(url); return cheerio.load(response.data); }; cheerio.prototype.batchProcess = async function(processor) { const results = []; for (let i = 0; i < this.length; i++) { const result = await processor(cheerio(this[i])); results.push(result); } return results; }; };
5. 插件发布
package.json 配置
{
  "name": "cheerio-my-plugin",
  "version": "1.0.0",
  "description": "My Cheerio plugin",
  "main": "index.js",
  "keywords": [
    "cheerio",
    "plugin",
    "html",
    "parser"
  ],
  "peerDependencies": {
    "cheerio": ">=1.0.0"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/username/cheerio-my-plugin"
  },
  "license": "MIT"
}
插件测试
// test.js
const cheerio = require('cheerio');
const myPlugin = require('./my-plugin');

describe('My Plugin', () => {
  // Cheerio has no `cheerio.use()`; each test loads its markup and installs
  // the plugin on the resulting `$`.
  const loadWithPlugin = (html) => {
    const $ = cheerio.load(html);
    myPlugin($);
    return $;
  };

  test('should filter empty elements', () => {
    const $ = loadWithPlugin('<div><p>Hello</p><p></p></div>');
    const result = $('p').filterEmpty();

    expect(result.length).toBe(1);
    expect(result.text()).toBe('Hello');
  });

  test('should get all text', () => {
    const $ = loadWithPlugin('<div><p>Hello</p><p>World</p></div>');
    const result = $('p').getAllText();

    expect(result).toEqual(['Hello', 'World']);
  });
});
6. 现有流行插件
cheerio-tableparser
const cheerio = require('cheerio');
const tableParser = require('cheerio-tableparser');

const $ = cheerio.load(html);

// cheerio-tableparser is installed by calling it with the loaded `$`;
// there is no `cheerio.use()`.
tableParser($);

const tableData = $('table').parsetable();
cheerio-select
const cheerio = require('cheerio');
const { select } = require('cheerio-select');

const $ = cheerio.load(html);

// cheerio-select is the selector engine Cheerio is built on, not a
// `use()`-style plugin: call select(selector, root) directly, then wrap the
// matched nodes with `$` to keep working with Cheerio methods.
const elements = $(select('div > p:first-child', $.root().toArray()));
7. 最佳实践
1. 命名规范
// ✅ Good names: a verb plus what the method works on.
cheerio.prototype.extractLinks = function () {};
cheerio.prototype.cleanText = function () {};

// ❌ Poor names: they say nothing about what the method does.
cheerio.prototype.doSomething = function () {};
cheerio.prototype.method1 = function () {};
2. 返回值处理
// ✅ Supports chaining: return the selection the method was called on.
cheerio.prototype.myMethod = function () {
  // Processing logic goes here.
  return this;
};

// ✅ Returns a new collection: pass on what filter() produced.
cheerio.prototype.myFilter = function () {
  const filtered = this.filter(/* condition */);
  return filtered;
};
3. 错误处理
/**
 * Extracts the text of every matched element without letting a failure
 * propagate to the caller.
 *
 * @returns {string[]} The extracted texts, or an empty array when extraction
 *   throws (the error is logged with console.error).
 */
cheerio.prototype.safeExtract = function () {
  try {
    // Extraction logic.
    return this.map((i, el) => cheerio(el).text()).get();
  } catch (error) {
    console.error('Extraction failed:', error);
    return [];
  }
};
4. 文档和注释
/**
 * Extracts information about every link.
 *
 * @param {Object} options - Configuration options.
 * @param {boolean} options.resolveAbsolute - Whether to convert links to absolute URLs.
 * @param {string} options.baseUrl - Base URL.
 * @returns {Array} Array of link information.
 */
cheerio.prototype.extractLinks = function (options = {}) {
  // Implementation goes here.
};
通过开发和使用 Cheerio 插件,可以大大扩展 Cheerio 的功能,提高开发效率。