乐闻世界logo
搜索文章和话题

What are the practical application scenarios of Puppeteer in real projects? Please provide examples of web scraping, automated testing, and other specific implementations.

2月19日 19:48

Puppeteer is used in real projects for web scraping and data collection, automated testing, PDF generation, performance monitoring, and SEO auditing. The cases below show a typical implementation of each.

1. Web Scraping and Data Collection

Case 1: E-commerce Product Price Monitoring

javascript
const puppeteer = require('puppeteer');

/**
 * Visits each product page in turn and collects its title, price,
 * availability and rating text.
 *
 * @param {string[]} productUrls - Product page URLs, processed sequentially.
 * @returns {Promise<object[]>} One record per URL containing `url`, the four
 *   scraped text fields (undefined when the element is absent) and an ISO
 *   `timestamp`.
 * @throws Rejects when navigation fails or `.price` does not appear within
 *   5 seconds. The page and browser are closed before the error propagates.
 */
async function monitorProductPrices(productUrls) {
  // NOTE(review): --no-sandbox turns off Chromium's sandbox; confirm this is
  // only run against trusted pages or inside a locked-down container.
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const results = [];

    for (const url of productUrls) {
      const page = await browser.newPage();
      try {
        // Set user agent to avoid being detected as a scraper
        await page.setUserAgent(
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        );
        await page.goto(url, { waitUntil: 'networkidle2' });

        // Wait for price element to load
        await page.waitForSelector('.price', { timeout: 5000 });

        const productData = await page.evaluate(() => ({
          title: document.querySelector('.product-title')?.textContent,
          price: document.querySelector('.price')?.textContent,
          availability: document.querySelector('.availability')?.textContent,
          rating: document.querySelector('.rating')?.textContent,
        }));

        results.push({
          url,
          ...productData,
          timestamp: new Date().toISOString(),
        });
      } finally {
        // Close the tab even when scraping this URL failed, so tabs do not pile up.
        await page.close();
      }
    }

    return results;
  } finally {
    // Always release the browser process, including on the error path.
    await browser.close();
  }
}

// Usage example
const products = [
  'https://example.com/product/1',
  'https://example.com/product/2',
];

monitorProductPrices(products)
  .then((data) => {
    console.log(JSON.stringify(data, null, 2));
  })
  .catch((error) => {
    console.error('Price monitoring failed:', error);
    process.exitCode = 1;
  });

Case 2: Social Media Data Scraping

javascript
/**
 * Logs in, opens a user's profile, scrolls until no further posts load, and
 * returns the text of every `.post` element on the page.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * @param {string} username - Profile name appended to the site URL.
 * @param {object} [options]
 * @param {string} [options.loginName='your_username'] - Account used to log in.
 * @param {string} [options.loginPassword='your_password'] - Password for that account.
 * @param {number} [options.maxScrolls=50] - Upper bound on scroll attempts, so
 *   an endless feed cannot keep the function running forever.
 * @returns {Promise<object[]>} One `{ content, likes, comments, date }` record
 *   per post; a field is undefined when its element is absent.
 * @throws Rejects on navigation or evaluation errors; the browser is closed first.
 */
async function scrapeSocialMedia(username, options = {}) {
  const {
    loginName = 'your_username',
    loginPassword = 'your_password',
    maxScrolls = 50,
  } = options;

  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    // Simulate login
    await page.goto('https://social-media.com/login');
    await page.type('#username', loginName);
    await page.type('#password', loginPassword);
    // Start waiting for the navigation before clicking; otherwise a fast
    // navigation can finish before waitForNavigation() is registered.
    await Promise.all([
      page.waitForNavigation(),
      page.click('#login-button'),
    ]);

    // Visit user page
    await page.goto(`https://social-media.com/${username}`);

    // Scroll to load more content. Progress is measured by the number of
    // posts growing: waiting for a selector that already matches would
    // resolve immediately on every pass and never terminate.
    for (let scrolls = 0; scrolls < maxScrolls; scrolls += 1) {
      const previousCount = await page.$$eval('.post', (posts) => posts.length);

      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });

      try {
        await page.waitForFunction(
          (count) => document.querySelectorAll('.post').length > count,
          { timeout: 2000 },
          previousCount,
        );
      } catch (error) {
        // A timeout means nothing new loaded: the feed is exhausted.
        // Anything else is a real failure and must not be swallowed.
        if (error.name !== 'TimeoutError') {
          throw error;
        }
        break;
      }
    }

    // Scrape post data
    return await page.evaluate(() =>
      Array.from(document.querySelectorAll('.post')).map((post) => ({
        content: post.querySelector('.content')?.textContent,
        likes: post.querySelector('.likes')?.textContent,
        comments: post.querySelector('.comments')?.textContent,
        date: post.querySelector('.date')?.textContent,
      })),
    );
  } finally {
    await browser.close();
  }
}

2. Automated Testing

Case 3: E2E Testing

javascript
const { expect } = require('expect-puppeteer');

/**
 * Runs a registration -> logout -> login flow against example.com and logs
 * the outcome.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * On failure the error is logged, a screenshot is written to
 * `test-failure.png` (best effort) and the process exit code is set to 1 so
 * a CI job notices. The browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function runE2ETest() {
  const browser = await puppeteer.launch({
    headless: 'new',
    slowMo: 50, // Slow down operations for observation
  });
  const page = await browser.newPage();

  try {
    // Test user registration flow
    await page.goto('https://example.com/register');

    // Fill registration form
    await page.type('#username', 'testuser');
    await page.type('#email', 'test@example.com');
    await page.type('#password', 'password123');
    await page.type('#confirm-password', 'password123');

    // Submit form
    await Promise.all([
      page.waitForNavigation(),
      page.click('#register-button'),
    ]);

    // Verify registration success
    await expect(page).toMatch('Welcome, testuser!');

    // Test login flow. Each click that navigates is paired with its
    // waitForNavigation() in Promise.all, as in the submit step above, so the
    // wait is registered before the navigation can complete.
    await Promise.all([
      page.waitForNavigation(),
      page.click('#logout-button'),
    ]);

    await page.type('#login-email', 'test@example.com');
    await page.type('#login-password', 'password123');
    await Promise.all([
      page.waitForNavigation(),
      page.click('#login-button'),
    ]);

    // Verify login success
    await expect(page).toMatch('Welcome back!');

    console.log('E2E test passed!');
  } catch (error) {
    console.error('E2E test failed:', error);
    // Surface the failure to the calling shell / CI runner.
    process.exitCode = 1;

    // Save failure screenshot. This is best effort: a screenshot error must
    // not replace the test failure that was just reported.
    try {
      await page.screenshot({ path: 'test-failure.png' });
    } catch (screenshotError) {
      console.error('Could not save failure screenshot:', screenshotError);
    }
  } finally {
    await browser.close();
  }
}

runE2ETest().catch((error) => {
  // Reached only when the browser could not be launched or closed.
  console.error('E2E test could not run:', error);
  process.exitCode = 1;
});

Case 4: Visual Regression Testing

javascript
const fs = require('fs');
const pixelmatch = require('pixelmatch');
const { PNG } = require('pngjs');

/**
 * Screenshots `url` and compares it with the PNG stored at `baselinePath`.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * When no baseline exists, the screenshot is saved as the new baseline and
 * the test passes. Otherwise a pixel diff is written to `diff.png` and the
 * test fails when more than 0.5% of pixels differ, or when the image
 * dimensions no longer match the baseline.
 *
 * @param {string} url - Page to capture.
 * @param {string} baselinePath - Path of the baseline PNG.
 * @returns {Promise<boolean>} true when the page matches (or a baseline was
 *   created), false when a visual regression is detected.
 */
async function visualRegressionTest(url, baselinePath) {
  const browser = await puppeteer.launch();
  let screenshot;

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Take current screenshot
    screenshot = await page.screenshot();
  } finally {
    // Release the browser even if navigation or capture failed.
    await browser.close();
  }

  // If no baseline image exists, save current screenshot as baseline
  if (!fs.existsSync(baselinePath)) {
    fs.writeFileSync(baselinePath, screenshot);
    console.log('Baseline image created');
    return true;
  }

  // Read baseline image
  const baseline = PNG.sync.read(fs.readFileSync(baselinePath));
  // Newer Puppeteer releases return a Uint8Array rather than a Buffer;
  // normalise so the PNG parser always receives a Buffer.
  const current = PNG.sync.read(Buffer.from(screenshot));

  // pixelmatch throws on images of different sizes. A size change is itself
  // a visual regression, so report it instead of crashing.
  if (baseline.width !== current.width || baseline.height !== current.height) {
    console.log(
      `Visual regression detected! Size changed from ${baseline.width}x${baseline.height} to ${current.width}x${current.height}`,
    );
    return false;
  }

  // Compare image differences
  const diff = new PNG({ width: baseline.width, height: baseline.height });
  const numDiffPixels = pixelmatch(
    baseline.data,
    current.data,
    diff.data,
    baseline.width,
    baseline.height,
    { threshold: 0.1 },
  );

  // Save difference image
  fs.writeFileSync('diff.png', PNG.sync.write(diff));

  const totalPixels = baseline.width * baseline.height;
  const diffPercentage = (numDiffPixels / totalPixels) * 100;

  console.log(`Difference: ${diffPercentage.toFixed(2)}%`);

  // If difference exceeds threshold, test fails
  if (diffPercentage > 0.5) {
    console.log('Visual regression detected!');
    return false;
  }

  console.log('Visual regression test passed!');
  return true;
}

visualRegressionTest('https://example.com', 'baseline.png').catch((error) => {
  console.error('Visual regression test could not run:', error);
  process.exitCode = 1;
});

3. PDF Generation and Document Processing

Case 5: Dynamic Report Generation

javascript
/**
 * Renders a sales report as HTML and prints it to an A4 PDF.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * @param {Array<{product: string, quantity: number, price: number}>} data -
 *   Report rows; `price` must be a number because it is formatted with toFixed().
 * @param {string} outputPath - Where the PDF is written.
 * @returns {Promise<void>}
 * @throws Rejects when rendering or writing fails; the browser is closed first.
 */
async function generateReport(data, outputPath) {
  // Row values are data, not markup: escape them so a product name such as
  // "<b>" or "A & B" cannot alter or break the generated document.
  const escapeHtml = (value) =>
    String(value)
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;')
      .replaceAll("'", '&#39;');

  const rows = data
    .map(
      (item) => `
        <tr>
          <td>${escapeHtml(item.product)}</td>
          <td>${escapeHtml(item.quantity)}</td>
          <td>$${item.price.toFixed(2)}</td>
          <td>$${(item.quantity * item.price).toFixed(2)}</td>
        </tr>`,
    )
    .join('');

  const grandTotal = data
    .reduce((sum, item) => sum + item.quantity * item.price, 0)
    .toFixed(2);

  // Generate HTML report
  const html = `
    <!DOCTYPE html>
    <html>
    <head>
      <style>
        body { font-family: Arial, sans-serif; padding: 40px; }
        h1 { color: #333; }
        table { width: 100%; border-collapse: collapse; margin-top: 20px; }
        th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }
        th { background-color: #f2f2f2; }
        .summary { margin-top: 30px; padding: 20px; background-color: #f9f9f9; }
      </style>
    </head>
    <body>
      <h1>Sales Report</h1>
      <p>Generated: ${new Date().toLocaleString()}</p>
      <table>
        <thead>
          <tr>
            <th>Product</th>
            <th>Quantity</th>
            <th>Unit Price</th>
            <th>Total</th>
          </tr>
        </thead>
        <tbody>${rows}
        </tbody>
      </table>
      <div class="summary">
        <h2>Total: $${grandTotal}</h2>
      </div>
    </body>
    </html>
  `;

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setContent(html);

    // Generate PDF
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true,
      margin: { top: '20px', right: '20px', bottom: '20px', left: '20px' },
    });
  } finally {
    // Release the browser even when rendering or writing the PDF failed.
    await browser.close();
  }

  console.log(`Report generated: ${outputPath}`);
}

// Usage example
const salesData = [
  { product: 'Product A', quantity: 10, price: 99.99 },
  { product: 'Product B', quantity: 5, price: 149.99 },
  { product: 'Product C', quantity: 8, price: 79.99 },
];

generateReport(salesData, 'sales-report.pdf').catch((error) => {
  console.error('Report generation failed:', error);
  process.exitCode = 1;
});

Case 6: Batch Invoice Generation

javascript
/**
 * Renders each invoice to `invoices/invoice-<number>.pdf`, reusing a single
 * browser page for the whole batch.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * @param {Array<{number: string, date: string, customer: string,
 *   items: Array<{name: string, quantity: number, price: number}>,
 *   total: number}>} invoices - Invoices to render.
 * @returns {Promise<void>}
 * @throws Rejects when rendering or writing fails; the browser is closed first.
 */
async function generateInvoices(invoices) {
  const { mkdir } = require('fs/promises');

  // Invoice fields are data, not markup: escape them so a customer name such
  // as "Smith & Sons" or one containing "<" cannot corrupt the document.
  const escapeHtml = (value) =>
    String(value)
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;')
      .replaceAll("'", '&#39;');

  // Two-decimal formatting hides floating-point artifacts in computed line
  // totals (e.g. 3 * 0.1). Values that are not finite numbers are shown as
  // given (escaped) rather than printed as "NaN".
  const formatMoney = (value) => {
    const amount = Number(value);
    return Number.isFinite(amount) ? amount.toFixed(2) : escapeHtml(value);
  };

  // page.pdf() does not create missing directories, so create it up front.
  await mkdir('invoices', { recursive: true });

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    for (const invoice of invoices) {
      const rows = invoice.items
        .map(
          (item) => `
            <tr>
              <td>${escapeHtml(item.name)}</td>
              <td>${escapeHtml(item.quantity)}</td>
              <td>$${formatMoney(item.price)}</td>
              <td>$${formatMoney(item.quantity * item.price)}</td>
            </tr>`,
        )
        .join('');

      const html = `
        <!DOCTYPE html>
        <html>
        <head>
          <style>
            body { font-family: Arial, sans-serif; padding: 40px; }
            .header { text-align: center; margin-bottom: 40px; }
            .invoice-info { margin-bottom: 30px; }
            table { width: 100%; border-collapse: collapse; }
            th, td { border: 1px solid #ddd; padding: 10px; text-align: left; }
            th { background-color: #f2f2f2; }
            .total { text-align: right; font-weight: bold; margin-top: 20px; }
          </style>
        </head>
        <body>
          <div class="header">
            <h1>Invoice</h1>
            <p>Invoice #: ${escapeHtml(invoice.number)}</p>
          </div>
          <div class="invoice-info">
            <p>Date: ${escapeHtml(invoice.date)}</p>
            <p>Customer: ${escapeHtml(invoice.customer)}</p>
          </div>
          <table>
            <thead>
              <tr>
                <th>Item</th>
                <th>Quantity</th>
                <th>Unit Price</th>
                <th>Total</th>
              </tr>
            </thead>
            <tbody>${rows}
            </tbody>
          </table>
          <div class="total">
            Total: $${formatMoney(invoice.total)}
          </div>
        </body>
        </html>
      `;

      await page.setContent(html);
      await page.pdf({
        path: `invoices/invoice-${invoice.number}.pdf`,
        format: 'A4',
        printBackground: true,
      });

      console.log(`Generated invoice: ${invoice.number}`);
    }
  } finally {
    // Release the browser even when one of the invoices failed to render.
    await browser.close();
  }
}

// Usage example
const invoices = [
  {
    number: 'INV-001',
    date: '2024-01-15',
    customer: 'Customer A',
    items: [
      { name: 'Service A', quantity: 1, price: 500 },
      { name: 'Service B', quantity: 2, price: 300 },
    ],
    total: 1100,
  },
];

generateInvoices(invoices).catch((error) => {
  console.error('Invoice generation failed:', error);
  process.exitCode = 1;
});

4. Performance Monitoring and Analysis

Case 7: Page Performance Analysis

javascript
/**
 * Loads `url`, measures load and paint timings, prints a short report and
 * returns the collected numbers.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * @param {string} url - Page to analyse.
 * @returns {Promise<object>} `loadTime` (wall-clock ms until `networkidle2`),
 *   `domContentLoaded`, `firstPaint`, `firstContentfulPaint` (ms; the paint
 *   values are undefined when the page recorded no such entry) and
 *   `resources` (the metrics array returned by CDP `Performance.getMetrics`).
 * @throws Rejects when navigation or measurement fails; the browser is closed first.
 */
async function analyzePagePerformance(url) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Enable performance monitoring
    const client = await page.target().createCDPSession();
    await client.send('Performance.enable');
    await client.send('Network.enable');

    // Record start time
    const startTime = Date.now();

    await page.goto(url, { waitUntil: 'networkidle2' });

    const loadTime = Date.now() - startTime;

    // Get performance metrics
    const metrics = await client.send('Performance.getMetrics');

    // Read all in-page timings in one round trip.
    const timings = await page.evaluate(() => {
      // Look paint entries up by name: their position in the entry list is
      // not a contract, and a page may record only one of the two.
      const paintStart = (name) =>
        performance.getEntriesByName(name, 'paint')[0]?.startTime;

      // Prefer the Navigation Timing entry; fall back to the deprecated
      // performance.timing object when no entry is available.
      const [navigation] = performance.getEntriesByType('navigation');
      const domContentLoaded = navigation
        ? Math.round(navigation.domContentLoadedEventEnd)
        : performance.timing.domContentLoadedEventEnd -
          performance.timing.navigationStart;

      return {
        domContentLoaded,
        firstPaint: paintStart('first-paint'),
        firstContentfulPaint: paintStart('first-contentful-paint'),
      };
    });

    // Get key performance indicators
    const performanceData = {
      loadTime,
      domContentLoaded: timings.domContentLoaded,
      firstPaint: timings.firstPaint,
      firstContentfulPaint: timings.firstContentfulPaint,
      resources: metrics.metrics,
    };

    // Generate performance report
    console.log('Performance Report:');
    console.log(`Load Time: ${performanceData.loadTime}ms`);
    console.log(`DOM Content Loaded: ${performanceData.domContentLoaded}ms`);
    console.log(`First Paint: ${performanceData.firstPaint}ms`);
    console.log(`First Contentful Paint: ${performanceData.firstContentfulPaint}ms`);

    return performanceData;
  } finally {
    // Release the browser even when navigation or measurement failed.
    await browser.close();
  }
}

analyzePagePerformance('https://example.com').catch((error) => {
  console.error('Performance analysis failed:', error);
  process.exitCode = 1;
});

5. SEO Tools

Case 8: SEO Audit Tool

javascript
/**
 * Loads `url` and checks a handful of on-page SEO basics: title, meta
 * description, H1 count, image alt text and links.
 *
 * Relies on `puppeteer` already being in scope.
 *
 * @param {string} url - Page to audit.
 * @returns {Promise<object>} `title`, `description`, `h1Count`, `imageCount`,
 *   `linkCount`, `fragmentLinkCount` (links whose href starts with "#"),
 *   plus `issues` and `warnings` message arrays.
 * @throws Rejects when navigation or evaluation fails; the browser is closed first.
 */
async function seoAudit(url) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    const seoData = await page.evaluate(() => {
      const issues = [];
      const warnings = [];

      // Check title
      const title = document.querySelector('title');
      if (!title) {
        issues.push('Missing title tag');
      } else if (title.textContent.length > 60) {
        warnings.push('Title too long (> 60 characters)');
      }

      // Check description
      const description = document.querySelector('meta[name="description"]');
      if (!description) {
        issues.push('Missing meta description');
      } else if (description.content.length > 160) {
        warnings.push('Meta description too long (> 160 characters)');
      }

      // Check H1 tags
      const h1Tags = document.querySelectorAll('h1');
      if (h1Tags.length === 0) {
        issues.push('Missing H1 tag');
      } else if (h1Tags.length > 1) {
        warnings.push('Multiple H1 tags found');
      }

      // Check image alt attributes
      const images = document.querySelectorAll('img');
      const missingAlt = Array.from(images).filter((img) => !img.alt).length;
      if (missingAlt > 0) {
        warnings.push(`${missingAlt} images missing alt attributes`);
      }

      // Check links. The count of "#..." hrefs used to be computed and then
      // thrown away; it is now part of the result. These are in-page
      // fragment links, not broken links, hence the name.
      const links = document.querySelectorAll('a[href]');
      const fragmentLinkCount = Array.from(links).filter((link) =>
        link.getAttribute('href').startsWith('#'),
      ).length;

      return {
        title: title?.textContent,
        description: description?.content,
        h1Count: h1Tags.length,
        imageCount: images.length,
        linkCount: links.length,
        fragmentLinkCount,
        issues,
        warnings,
      };
    });

    console.log('SEO Audit Results:');
    console.log(JSON.stringify(seoData, null, 2));

    return seoData;
  } finally {
    // Release the browser even when navigation or evaluation failed.
    await browser.close();
  }
}

seoAudit('https://example.com').catch((error) => {
  console.error('SEO audit failed:', error);
  process.exitCode = 1;
});

6. Best Practices Summary

1. Error Handling:

javascript
// Pattern: log the failure, capture a screenshot for debugging, and always
// close the browser. `page` and `browser` come from the surrounding code.
try {
  // Operation code
} catch (error) {
  console.error('Error:', error);

  // Save error screenshot. This is best effort: if the page is already gone
  // the screenshot itself rejects, and that must not hide the original error.
  try {
    await page.screenshot({ path: 'error.png' });
  } catch (screenshotError) {
    console.error('Failed to save error screenshot:', screenshotError);
  }
} finally {
  await browser.close();
}

2. Resource Management:

javascript
// Clean up resources promptly. The browser is closed in `finally` so that a
// rejected page.close() cannot leave the browser process running.
try {
  await page.close();
} finally {
  await browser.close();
}

3. Performance Optimization:

javascript
// Disable unnecessary resources: abort image and font requests, let
// everything else through.
await page.setRequestInterception(true);
page.on('request', (request) => {
  // Another 'request' listener may already have continued or aborted this
  // request; resolving it a second time throws.
  if (request.isInterceptResolutionHandled()) {
    return;
  }

  if (['image', 'font'].includes(request.resourceType())) {
    request.abort();
  } else {
    request.continue();
  }
});

4. Avoiding Anti-Scraping Detection:

javascript
// Three independent techniques; `page` and `puppeteer` come from the
// surrounding code.

// Set realistic user agent
await page.setUserAgent('Mozilla/5.0 ...');

// Add delays: pause for one second before the next action
await new Promise((resolve) => {
  setTimeout(resolve, 1000);
});

// Use proxy: route the browser's traffic through the given proxy server
const launchArgs = ['--proxy-server=http://proxy.example.com:8080'];
const browser = await puppeteer.launch({ args: launchArgs });
标签:Puppeteer