Puppeteer is well suited to scraping dynamic web pages and Single Page Applications (SPAs): it can execute JavaScript, wait for asynchronously loaded content, handle client-side route changes, and more.
1. Handling Dynamic Content Loading
Wait for Element to Appear:
const puppeteer = require('puppeteer');

/**
 * Opens https://example.com, waits for the dynamically rendered
 * `.dynamic-content` element to become visible, and logs its text.
 *
 * @returns {Promise<void>} Resolves once the text is logged and the browser is closed.
 */
async function scrapeDynamicContent() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Wait for dynamically loaded element
    await page.waitForSelector('.dynamic-content', { visible: true });

    const content = await page.$eval('.dynamic-content', (el) => el.textContent);
    console.log(content);
  } finally {
    // Close the browser even when navigation or the wait throws,
    // otherwise the launched browser process is left running.
    await browser.close();
  }
}

// Never leave the promise floating: report failures and signal them via the exit code.
scrapeDynamicContent().catch((error) => {
  console.error('Failed to scrape dynamic content:', error);
  process.exitCode = 1;
});
Wait for Specific Condition:
// Predicate evaluated inside the page: true once at least one `.item` exists.
const hasAnyItem = () => document.querySelectorAll('.item').length > 0;

// Block until the predicate above holds.
await page.waitForFunction(hasAnyItem);
Wait for Network Requests to Complete:
// Let Puppeteer's 'networkidle2' condition decide when the navigation is finished.
const navigationOptions = { waitUntil: 'networkidle2' };
await page.goto('https://example.com', navigationOptions);
2. Handling Infinite Scroll
Basic Infinite Scroll:
/**
 * Scrapes the text of every `.item` element on an infinite-scroll page.
 *
 * Scrolls to the bottom repeatedly until the document height stops growing,
 * then reads all items once. Reading once at the end avoids the duplicates
 * that result from re-collecting the whole list after every scroll.
 * NOTE(review): this assumes the page keeps earlier items in the DOM
 * (no list virtualization) — confirm for the target site.
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order.
 */
async function scrapeInfiniteScroll() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/infinite-scroll');

    let previousHeight = 0;

    while (true) {
      // Scroll all the way to the bottom so the page's "load more" logic is triggered.
      await page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      // Give the page time to fetch and render the next batch.
      // (page.waitForTimeout is no longer available in current Puppeteer releases.)
      await new Promise((resolve) => setTimeout(resolve, 1000));

      // An unchanged height means no new content was appended.
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) {
        break;
      }
      previousHeight = currentHeight;
    }

    return await page.$$eval('.item', (elements) => elements.map((el) => el.textContent));
  } finally {
    // Release the browser even if a step above throws.
    await browser.close();
  }
}
Optimized Infinite Scroll:
/**
 * Scrapes the text of every `.item` element on an infinite-scroll page,
 * stopping after three consecutive scrolls that add no new items.
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order, without duplicates
 *   caused by re-reading the list.
 */
async function scrapeInfiniteScrollOptimized() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/infinite-scroll');

    let items = [];
    let noNewItemsCount = 0;

    // Stop after 3 consecutive rounds with no new items
    while (noNewItemsCount < 3) {
      const itemCountBefore = items.length;

      // Scroll to bottom
      await page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      // Wait for the loading indicator to disappear. With `hidden: true` the wait
      // also succeeds when the element is absent, so only a timeout (indicator
      // still visible after 3 s) is tolerated here; anything else is a real failure.
      try {
        await page.waitForSelector('.loading', { hidden: true, timeout: 3000 });
      } catch (error) {
        if (error?.name !== 'TimeoutError') {
          throw error;
        }
      }

      // Snapshot of every item currently on the page.
      const currentItems = await page.$$eval('.item', (elements) =>
        elements.map((el) => el.textContent),
      );

      if (currentItems.length === itemCountBefore) {
        noNewItemsCount++;
      } else {
        noNewItemsCount = 0;
        // Replace rather than append: the snapshot already contains the earlier
        // items, so appending would duplicate them and break the count comparison.
        items = currentItems;
      }
    }

    return items;
  } finally {
    // Release the browser even if a step above throws.
    await browser.close();
  }
}
3. Handling SPA Routes
Listen to Route Changes:
/**
 * Demonstrates reacting to SPA navigations: logs the URL and the `.content`
 * text after every `framenavigated` event while clicking two navigation links.
 *
 * @returns {Promise<void>} Resolves after both links were clicked and the browser is closed.
 */
async function handleSPARoutes() {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Listen to route changes
    page.on('framenavigated', async (frame) => {
      // Event emitters do not await listeners, so a rejection here would be
      // unhandled (e.g. the selector wait timing out or the browser closing
      // mid-wait). Catch and report it instead.
      try {
        console.log('Navigated to:', frame.url());

        // Wait for page content to load
        await frame.waitForSelector('.content');

        const title = await frame.$eval('.content', (el) => el.textContent);
        console.log('Page title:', title);
      } catch (error) {
        console.error('Failed to read content after navigation:', error);
      }
    });

    // Click navigation links
    await page.click('#about-link');
    await pause(1000);

    await page.click('#contact-link');
    await pause(1000);
  } finally {
    // Release the browser even if a click or navigation throws.
    await browser.close();
  }
}
Wait for Specific Route:
/**
 * Polls the page until `window.location.pathname` equals `path`.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @param {string} path - Pathname to wait for, e.g. '/about'.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Maximum wait in milliseconds; 0 disables the limit.
 * @param {number} [options.pollInterval=100] - Delay between checks in milliseconds.
 * @returns {Promise<void>} Resolves once the pathname matches.
 * @throws {Error} When the timeout elapses first, or when `page.evaluate` fails
 *   (for example because the page was closed).
 *
 * @example
 * await page.click('#about-link');
 * await waitForRoute(page, '/about');
 */
async function waitForRoute(page, path, { timeout = 30000, pollInterval = 100 } = {}) {
  const deadline = Date.now() + timeout;

  while (true) {
    // Errors from evaluate propagate to the caller instead of leaving the wait pending forever.
    const currentPath = await page.evaluate(() => window.location.pathname);
    if (currentPath === path) {
      return;
    }

    if (timeout > 0 && Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeout} ms waiting for route "${path}" (current: "${currentPath}")`,
      );
    }

    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }
}
4. Handling AJAX Requests
Wait for Specific API Response:
/**
 * Waits for the first network response whose URL contains `urlPattern`.
 *
 * The listener is detached as soon as the wait settles, so repeated calls do
 * not accumulate `response` handlers on the page.
 *
 * @param {object} page - Puppeteer page (uses `page.on` / `page.off` for 'response').
 * @param {string} urlPattern - Substring that the response URL must contain.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Maximum wait in milliseconds; 0 disables the limit.
 * @returns {Promise<object>} The matching response object.
 * @throws {Error} When no matching response arrives before the timeout.
 *
 * @example
 * // Promise.all resolves to an array, so destructure the response out of it.
 * const [apiResponse] = await Promise.all([
 *   waitForAPIResponse(page, '/api/data'),
 *   page.click('#load-data-button'),
 * ]);
 * const data = await apiResponse.json();
 * console.log(data);
 */
function waitForAPIResponse(page, urlPattern, { timeout = 30000 } = {}) {
  return new Promise((resolve, reject) => {
    let timer;

    const onResponse = (response) => {
      if (!response.url().includes(urlPattern)) {
        return;
      }
      clearTimeout(timer);
      page.off('response', onResponse);
      resolve(response);
    };

    page.on('response', onResponse);

    if (timeout > 0) {
      timer = setTimeout(() => {
        page.off('response', onResponse);
        reject(
          new Error(
            `Timed out after ${timeout} ms waiting for a response matching "${urlPattern}"`,
          ),
        );
      }, timeout);
    }
  });
}
Intercept and Modify API Requests:
// Turn interception on so every outgoing request is routed through the handler below.
await page.setRequestInterception(true);

page.on('request', (interceptedRequest) => {
  const targetsDataApi = interceptedRequest.url().includes('/api/data');

  // Anything that is not the data API passes through untouched.
  if (!targetsDataApi) {
    interceptedRequest.continue();
    return;
  }

  // Modify request: forward the original headers plus an Authorization header.
  const headersWithAuth = {
    ...interceptedRequest.headers(),
    'Authorization': 'Bearer token',
  };
  interceptedRequest.continue({ headers: headersWithAuth });
});
5. Handling WebSockets
Listen to WebSocket Messages:
// Open a raw DevTools Protocol session for this page and enable the Network domain,
// which the WebSocket frame events subscribed to below belong to.
const client = await page.target().createCDPSession();
await client.send('Network.enable');

// Logs the payload of a frame received from the server.
const logIncomingFrame = ({ response }) => {
  console.log('WebSocket message:', response.payloadData);
};

// Logs the payload of a frame sent by the page.
const logOutgoingFrame = ({ response }) => {
  console.log('WebSocket sent:', response.payloadData);
};

client.on('Network.webSocketFrameReceived', logIncomingFrame);
client.on('Network.webSocketFrameSent', logOutgoingFrame);
6. Handling Client-Side Rendering
Wait for Client Rendering to Complete:
/**
 * Waits until a client-rendered page looks ready, applying three checks in sequence.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>} Resolves when all three waits have completed.
 */
async function waitForClientRendering(page) {
  // Method 1: Wait for specific element
  await page.waitForSelector('.rendered-content');

  // Method 2: Wait for rendering flag.
  // NOTE(review): `window.__RENDER_COMPLETE__` must be set by the application
  // itself; on pages that never set it, this wait will time out.
  await page.waitForFunction(() => window.__RENDER_COMPLETE__ === true);

  // Method 3: Wait for network idle.
  // Checking that `performance.getEntriesByType('resource')` is non-empty only
  // proves that one resource was requested, so use Puppeteer's idle wait instead.
  await page.waitForNetworkIdle();
}
Handle React/Vue Apps:
/**
 * Scrapes `.item` texts from a React app after it has mounted, finished its
 * initial data load, and rendered the result of clicking "load more".
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order.
 */
async function scrapeReactApp() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/react-app');

    // Wait for React app to mount. `#root` itself is typically the static mount
    // container in the HTML, so wait until something has been rendered inside it.
    await page.waitForSelector('#root > *');

    // Wait for data loading to complete.
    // NOTE(review): relies on the app exposing `window.__INITIAL_STATE__.loaded` — confirm.
    await page.waitForFunction(() => window.__INITIAL_STATE__?.loaded === true);

    // Interact with React app
    await page.click('#load-more-button');
    await page.waitForSelector('.new-items');

    return await page.$$eval('.item', (elements) => elements.map((el) => el.textContent));
  } finally {
    // Release the browser even if a wait times out.
    await browser.close();
  }
}
7. Practical Use Cases
Use Case 1: Scraping Social Media Dynamic Content
/**
 * Scrolls a user's profile feed and collects post data until enough posts
 * were gathered or the feed stops producing new ones.
 *
 * @param {string} username - Profile name; URL-encoded before being placed in the path.
 * @param {object} [options]
 * @param {number} [options.maxPosts=50] - Stop scrolling once at least this many posts are collected.
 * @param {number} [options.maxIdleRounds=3] - Stop after this many consecutive scrolls
 *   that yield no new post (prevents an endless loop on short feeds).
 * @returns {Promise<Array<{id: string, content: (string|undefined), likes: (string|undefined), timestamp: (string|undefined)}>>}
 *   Collected posts in discovery order; may exceed `maxPosts` when the last batch overshoots.
 */
async function scrapeSocialMediaPosts(username, { maxPosts = 50, maxIdleRounds = 3 } = {}) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(`https://social-media.com/${encodeURIComponent(username)}`);

    const posts = [];
    const seenIds = new Set();
    let idleRounds = 0;

    // Scroll to load more posts
    while (posts.length < maxPosts && idleRounds < maxIdleRounds) {
      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });

      // Wait for new posts to load.
      // (page.waitForTimeout is no longer available in current Puppeteer releases.)
      await new Promise((resolve) => setTimeout(resolve, 2000));

      // Collect post data
      const visiblePosts = await page.$$eval('.post', (elements) =>
        elements.map((post) => ({
          id: post.dataset.id,
          content: post.querySelector('.content')?.textContent,
          likes: post.querySelector('.likes')?.textContent,
          timestamp: post.querySelector('.timestamp')?.textContent,
        })),
      );

      // Only add posts whose id was not seen in an earlier round.
      const freshPosts = visiblePosts.filter((post) => !seenIds.has(post.id));
      for (const post of freshPosts) {
        seenIds.add(post.id);
      }
      posts.push(...freshPosts);

      idleRounds = freshPosts.length === 0 ? idleRounds + 1 : 0;
    }

    return posts;
  } finally {
    // Release the browser even if navigation or evaluation throws.
    await browser.close();
  }
}
Use Case 2: Scraping E-commerce Product List
/**
 * Walks a paginated product listing and collects the product cards of each page.
 *
 * @param {string} categoryUrl - URL of the first listing page.
 * @param {object} [options]
 * @param {number} [options.maxPages=Infinity] - Upper bound on pages to visit; guards
 *   against sites whose "next" button never becomes disabled.
 * @returns {Promise<Array<{id: string, title: (string|undefined), price: (string|undefined), rating: (string|undefined)}>>}
 *   Products in page order.
 */
async function scrapeEcommerceProducts(categoryUrl, { maxPages = Infinity } = {}) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(categoryUrl);

    const products = [];

    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
      // Wait for products to load
      await page.waitForSelector('.product-card');

      // Collect current page products
      const pageProducts = await page.$$eval('.product-card', (cards) =>
        cards.map((card) => ({
          id: card.dataset.id,
          title: card.querySelector('.title')?.textContent,
          price: card.querySelector('.price')?.textContent,
          rating: card.querySelector('.rating')?.textContent,
        })),
      );
      products.push(...pageProducts);

      // Check if there's a next page
      const nextButton = await page.$('.next-page:not(.disabled)');
      if (!nextButton) {
        break;
      }

      // Click next page and give the listing time to update.
      // (page.waitForTimeout is no longer available in current Puppeteer releases.)
      await nextButton.click();
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }

    return products;
  } finally {
    // Release the browser even if a selector wait times out.
    await browser.close();
  }
}
Use Case 3: Scraping Real-time Data Updates
/**
 * Observes DOM child-list mutations on a page for a fixed time window and
 * returns one record per mutation.
 *
 * @param {string} url - Page to observe.
 * @param {number} [durationMs=30000] - How long to collect mutations, in milliseconds.
 * @returns {Promise<Array<{timestamp: number, addedNodes: number}>>} One entry per
 *   `childList` mutation: when it was seen (`Date.now()` in the page) and how many
 *   nodes it added.
 */
async function scrapeRealTimeData(url, durationMs = 30000) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Listen to DOM changes: the observer runs inside the page and buffers its
    // records on `window.__DATA_UPDATES__` so they can be read back later.
    await page.evaluate(() => {
      window.__DATA_UPDATES__ = window.__DATA_UPDATES__ || [];

      const observer = new MutationObserver((mutations) => {
        for (const mutation of mutations) {
          if (mutation.type === 'childList') {
            window.__DATA_UPDATES__.push({
              timestamp: Date.now(),
              addedNodes: mutation.addedNodes.length,
            });
          }
        }
      });

      observer.observe(document.body, {
        childList: true,
        subtree: true,
      });
    });

    // Wait for some time to collect data.
    // (page.waitForTimeout is no longer available in current Puppeteer releases.)
    await new Promise((resolve) => setTimeout(resolve, durationMs));

    // Get collected data
    return await page.evaluate(() => window.__DATA_UPDATES__ || []);
  } finally {
    // Release the browser even if navigation or evaluation throws.
    await browser.close();
  }
}
8. Best Practices
1. Use Appropriate Wait Strategies:
// Prefer waitForSelector
await page.waitForSelector('.element');

// Use waitForFunction for complex conditions
const hasMoreThanTenItems = () => document.querySelectorAll('.item').length > 10;
await page.waitForFunction(hasMoreThanTenItems);

// Use waitForResponse for network requests
const isDataApiResponse = (response) => response.url().includes('/api/data');
await page.waitForResponse(isDataApiResponse);
2. Avoid Hardcoded Wait Times:
// Bad practice: a fixed pause is either too short or needlessly slow.
await page.waitForTimeout(5000);

// Good practice: wait for the element that signals the content is ready.
const loadedContentSelector = '.loaded-content';
await page.waitForSelector(loadedContentSelector);
3. Handle Loading Failures:
// Give the content a bounded amount of time to appear.
const contentWaitOptions = { timeout: 10000 };

try {
  await page.waitForSelector('.content', contentWaitOptions);
} catch (waitError) {
  console.log('Content failed to load, using fallback');
  // Use fallback strategy
}
4. Optimize Performance:
// Disable unnecessary resources: images, fonts and media are aborted,
// every other request goes through unchanged.
const blockedResourceTypes = ['image', 'font', 'media'];

await page.setRequestInterception(true);

page.on('request', (request) => {
  const shouldBlock = blockedResourceTypes.includes(request.resourceType());
  if (shouldBlock) {
    request.abort();
    return;
  }
  request.continue();
});
5. Handle Anti-Scraping:
// Set realistic user agent
await page.setUserAgent('Mozilla/5.0 ...');

// Add random delays: a pause between 1000 ms and 3000 ms.
const randomDelay = () => 1000 + Math.random() * 2000;
const pauseMs = randomDelay();
await page.waitForTimeout(pauseMs);

// Simulate human behavior: scroll down by a random distance of up to 500 px.
await page.evaluate(() => {
  const distance = Math.random() * 500;
  window.scrollBy(0, distance);
});