const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Xiaozhu Inspector - Inspect page structure without login.
 *
 * Loads candidate Xiaozhu URLs in headless Chromium, and for the first one
 * that loads it saves a full-page screenshot plus a JSON dump of the page
 * structure. The dump is meant to help identify the correct CSS selectors.
 */

// Candidate URL patterns, tried in order until one can be inspected.
const SEARCH_URLS = [
  'https://www.xiaozhu.com/search-shanghai-徐汇区/',
  'https://www.xiaozhu.com/search/shanghai/',
  'https://www.xiaozhu.com/shanghai/',
  'https://www.xiaozhu.com/',
];

// Extra wait after navigation so dynamically rendered content can appear.
const SETTLE_DELAY_MS = 3000;

/**
 * Collects structural information about the current document.
 *
 * This function is handed to `page.evaluate`, so it executes inside the page,
 * not in Node. It must stay self-contained: it may not reference any variable
 * declared outside its own body.
 *
 * @returns {object} Serializable page description: title, url, bodyClasses,
 *   possibleContainers, possibleListingCards, allClasses, allIds,
 *   extractedListings and selectorErrors.
 */
function collectPageInfo() {
  // Matches digits followed by the yuan character or a slash, optionally
  // preceded by a half-width or full-width yen sign. Written with \u escapes
  // so the pattern survives a file re-encoding.
  const PRICE_PATTERN = /[\u00A5\uFFE5]?\d+[\u5143\/]/;
  const HEADING_TAGS = ['H1', 'H2', 'H3', 'H4'];

  const containerSelectors = [
    '.list', '.listing', '.result', '.item', '.card',
    '[class*="list"]', '[class*="result"]', '[class*="house"]',
    '[class*="room"]', '[class*="apartment"]',
  ];
  const cardSelectors = [
    '.pho_item', '.room_box', '.result_list li', '.house_item',
    '[class*="card"]', '[class*="item"]',
  ];

  // Read the class attribute rather than `el.className`: on SVG elements
  // `className` is an SVGAnimatedString object, which has no string methods
  // and does not serialize as a plain string.
  const classAttr = (el) => el.getAttribute('class') ?? '';

  const allClasses = new Set();
  const allIds = new Set();
  const possibleContainers = [];
  const extractedListings = [];
  const selectorErrors = [];

  // Inventory every class name and id present in the document.
  for (const el of document.querySelectorAll('*')) {
    for (const cls of el.classList) {
      allClasses.add(cls);
    }
    if (el.id) {
      allIds.add(el.id);
    }
  }

  // Record selectors that match a plausible number of elements (1..99).
  for (const selector of containerSelectors) {
    try {
      const elements = document.querySelectorAll(selector);
      if (elements.length > 0 && elements.length < 100) {
        const first = elements[0];
        possibleContainers.push({
          selector,
          count: elements.length,
          sample: classAttr(first) || first.id || 'no class/id',
        });
      }
    } catch (err) {
      selectorErrors.push({ selector, message: err.message });
    }
  }

  // Describes one candidate listing card and the price/title-looking
  // elements found inside it.
  const describeCard = (card, selector) => {
    const listing = {
      selector,
      html: card.innerHTML.substring(0, 500),
      text: card.textContent.substring(0, 200).trim(),
      classes: classAttr(card),
      priceElements: [],
      titleElements: [],
      // Kept for output compatibility; nothing populates it yet.
      locationElements: [],
    };

    for (const el of card.querySelectorAll('*')) {
      const text = el.textContent;
      const classes = classAttr(el);

      if (PRICE_PATTERN.test(text)) {
        listing.priceElements.push({
          tag: el.tagName,
          class: classes,
          text: text.substring(0, 50),
        });
      }

      // Titles are taken to be headings or elements with 'title' in the class.
      if (HEADING_TAGS.includes(el.tagName) || classes.includes('title')) {
        listing.titleElements.push({
          tag: el.tagName,
          class: classes,
          text: text.substring(0, 100),
        });
      }
    }

    return listing;
  };

  // Use the first card selector that matches between 3 and 49 elements and
  // sample its first three cards.
  for (const selector of cardSelectors) {
    try {
      const cards = document.querySelectorAll(selector);
      if (cards.length <= 2 || cards.length >= 50) {
        continue;
      }
      for (const card of Array.from(cards).slice(0, 3)) {
        extractedListings.push(describeCard(card, selector));
      }
      break; // Found a good selector, stop.
    } catch (err) {
      // Record the failure instead of hiding it, then try the next selector.
      selectorErrors.push({ selector, message: err.message });
    }
  }

  return {
    title: document.title,
    url: window.location.href,
    bodyClasses: document.body.className,
    possibleContainers,
    possibleListingCards: [],
    allClasses: Array.from(allClasses),
    allIds: Array.from(allIds),
    extractedListings,
    selectorErrors,
  };
}

/**
 * Prints a short console summary of a collected page description.
 *
 * @param {object} pageInfo - Value returned by `collectPageInfo`.
 */
function printSummary(pageInfo) {
  console.log('\n📊 PAGE ANALYSIS:');
  console.log(` Title: ${pageInfo.title}`);
  console.log(` URL: ${pageInfo.url}`);
  console.log(` Total classes found: ${pageInfo.allClasses.length}`);
  console.log(` Total IDs found: ${pageInfo.allIds.length}`);

  if (pageInfo.possibleContainers.length > 0) {
    console.log('\n🎯 POSSIBLE LISTING CONTAINERS:');
    for (const container of pageInfo.possibleContainers.slice(0, 5)) {
      console.log(` - ${container.selector} (${container.count} elements)`);
    }
  }

  if (pageInfo.extractedListings.length > 0) {
    console.log('\n📝 SAMPLE LISTINGS EXTRACTED:');
    pageInfo.extractedListings.forEach((listing, i) => {
      console.log(`\n Listing ${i + 1} (selector: ${listing.selector}):`);
      if (listing.titleElements.length > 0) {
        console.log(` Title: ${listing.titleElements[0].text}`);
      }
      if (listing.priceElements.length > 0) {
        console.log(` Price: ${listing.priceElements[0].text}`);
      }
    });
  }
}

/**
 * Loads one URL, then writes a screenshot and a JSON page description to the
 * current directory and prints a summary.
 *
 * @param {object} page - Puppeteer page to navigate.
 * @param {string} url - URL to inspect.
 * @returns {Promise<void>}
 * @throws If navigation, screenshotting, evaluation or the file write fails.
 */
async function inspectUrl(page, url) {
  await page.goto(url, { waitUntil: 'networkidle2', timeout: 15000 });
  console.log(`✅ Loaded: ${page.url()}`);

  // Wait a bit for dynamic content.
  await new Promise((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

  const screenshotPath = `./xiaozhu_screenshot_${Date.now()}.png`;
  await page.screenshot({ path: screenshotPath, fullPage: true });
  console.log(`📸 Screenshot saved: ${screenshotPath}`);

  const pageInfo = await page.evaluate(collectPageInfo);

  const infoPath = `./xiaozhu_pageinfo_${Date.now()}.json`;
  fs.writeFileSync(infoPath, JSON.stringify(pageInfo, null, 2));
  console.log(`💾 Page info saved: ${infoPath}`);

  printSummary(pageInfo);
}

/**
 * Entry point: launches headless Chromium and inspects the first candidate
 * URL that can be processed successfully, then stops.
 *
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function inspectXiaozhu() {
  console.log('🔍 Launching browser to inspect Xiaozhu...');

  const browser = await puppeteer.launch({
    headless: 'new', // Run in headless mode for WSL compatibility
    defaultViewport: { width: 1920, height: 1080 },
    args: ['--no-sandbox', '--disable-setuid-sandbox'], // Required for WSL
  });

  // try/finally guarantees the browser process is shut down on every path,
  // including failures before the first URL is attempted.
  try {
    const page = await browser.newPage();

    for (const url of SEARCH_URLS) {
      console.log(`\n📡 Trying: ${url}`);
      try {
        await inspectUrl(page, url);
        console.log('\n✅ Successfully inspected this URL!');
        console.log('📸 Check the screenshot and JSON file for details');
        return; // Found a working URL, no need to try others.
      } catch (err) {
        // Any failure for this URL falls through to the next candidate.
        console.log(`❌ Failed to load: ${err.message}`);
      }
    }

    console.log('\n❌ All URLs failed. Site might be blocking automated access.');
  } finally {
    await browser.close();
  }
}

inspectXiaozhu().catch(console.error);