const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Xiaozhu Interactive Scraper - Full navigation simulation
 * Simulates real user behavior to navigate and extract listings
 *
 * Flow: load the mobile homepage, search for CONFIG.city (or click a city
 * entry when no search box is found), scroll the resulting page to collect
 * listing cards, then score/filter them and write JSON + Markdown reports.
 */

const CONFIG = {
  // Search criteria
  city: '上海',
  district: '徐汇区',
  keyword: '交通大学',

  // Dates
  checkIn: '2025-12-24',
  checkOut: '2026-01-22',
  days: 29,

  // Budget (RMB per 30-day month)
  budgetIdeal: 4000,
  budgetMax: 5000,

  // NOTE: despite the "daily" prefix, these getters return the budget for the
  // WHOLE stay: the monthly budget pro-rated to `days` (budget / 30 * days).
  // The names are kept because the rest of the script refers to them.
  get dailyBudgetIdeal() {
    return Math.ceil(this.budgetIdeal / 30 * this.days);
  },
  get dailyBudgetMax() {
    return Math.ceil(this.budgetMax / 30 * this.days);
  },

  // Equipment
  required: ['厨房', '冰箱'],
  bonus: ['洗衣机', '地铁'],

  // Scraping config
  maxScrolls: 10,
  scrollDelay: 2000,
  interactionDelay: 1000,

  // Output
  outputFile: './xiaozhu_results.json',
  outputMarkdown: './xiaozhu_results.md',
  topN: 20,

  // Debug
  headless: true,
  screenshots: true
};

// Candidate selectors for the homepage search box, tried in order.
const SEARCH_INPUT_SELECTORS = [
  'input[placeholder*="目的地"]',
  'input[placeholder*="搜索"]',
  'input[placeholder*="城市"]',
  '.search-input',
  '.van-search__content input',
  'input[type="search"]',
  'input[type="text"]'
];

// Candidate selectors for a search submit button, tried in order.
const SUBMIT_SELECTORS = [
  'button[type="submit"]',
  '.search-button',
  '.van-button--primary',
  'button.submit',
  '.search-btn'
];

const COOKIE_FILE = './xiaozhu_cookies.json';

console.log('🚀 Xiaozhu Interactive Scraper');
console.log(`📍 Target: ${CONFIG.city} ${CONFIG.district}`);
console.log(`📅 Dates: ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)`);
console.log(`💰 Budget: ${CONFIG.budgetIdeal}-${CONFIG.budgetMax} RMB/month (${CONFIG.dailyBudgetIdeal}-${CONFIG.dailyBudgetMax} RMB total)\n`);

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Save a full-page screenshot when CONFIG.screenshots is enabled.
 * @param {object} page - Puppeteer page.
 * @param {string} name - Label embedded in the file name.
 * @returns {Promise<void>}
 */
async function screenshot(page, name) {
  if (!CONFIG.screenshots) {
    return;
  }
  const filename = `./xiaozhu_${name}_${Date.now()}.png`;
  await page.screenshot({ path: filename, fullPage: true });
  console.log(`📸 Screenshot: ${filename}`);
}

/**
 * Load optional cookies from COOKIE_FILE.
 * A missing file is expected; an unreadable or malformed file is reported
 * distinctly so a broken cookie export is not mistaken for "no cookies".
 * @returns {Promise<Array<object>|null>} Parsed cookie array, or null.
 */
async function loadCookies() {
  let raw;
  try {
    raw = fs.readFileSync(COOKIE_FILE, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      console.log('⚠️ No cookies found (optional)');
    } else {
      console.log(`⚠️ Could not read ${COOKIE_FILE}: ${err.message}`);
    }
    return null;
  }

  try {
    const parsed = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      console.log(`⚠️ Ignoring ${COOKIE_FILE}: expected a JSON array of cookies`);
      return null;
    }
    return parsed;
  } catch (err) {
    console.log(`⚠️ Ignoring malformed ${COOKIE_FILE}: ${err.message}`);
    return null;
  }
}

/**
 * Identity used to de-duplicate listing cards across scrolls.
 * Cards with a link are keyed by URL; cards without one fall back to their
 * text snippet so they are not re-added on every scroll.
 * @param {{url?: string, text?: string}} listing
 * @returns {string}
 */
function listingKey(listing) {
  if (listing.url) {
    return `url:${listing.url}`;
  }
  return `text:${listing.text || ''}`;
}

/**
 * Find the first visible search input on the page.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<object|null>} Element handle, or null if none is visible.
 */
async function findSearchInput(page) {
  for (const selector of SEARCH_INPUT_SELECTORS) {
    try {
      const element = await page.$(selector);
      if (!element) {
        continue;
      }
      const isVisible = await page.evaluate(el => {
        const rect = el.getBoundingClientRect();
        return rect.width > 0 && rect.height > 0;
      }, element);
      if (isVisible) {
        console.log(`✅ Found search input: ${selector}`);
        return element;
      }
    } catch (err) {
      // Best-effort probing: report and move on to the next candidate.
      console.log(` Selector ${selector} failed: ${err.message}`);
    }
  }
  return null;
}

/**
 * Type the city into the search box and submit via, in order of preference:
 * a matching suggestion, the first suggestion, a submit button, the Enter key.
 * @param {object} page - Puppeteer page.
 * @param {object} searchInput - Element handle of the search input.
 * @returns {Promise<void>}
 */
async function runSearch(page, searchInput) {
  console.log('⌨️ Entering search query...');

  // Click to focus
  await searchInput.click();
  await wait(500);

  // Type search query - try just city first
  await searchInput.type(`${CONFIG.city}`, { delay: 150 });
  await wait(CONFIG.interactionDelay * 2); // Wait longer for suggestions to load
  await screenshot(page, 'search_typed');

  // Look for search suggestions or submit button
  console.log('👆 Looking for search button or suggestions...');
  let submitted = false;

  // Try to click suggestions first
  await wait(1500);

  // Look for suggestions containing the city name
  const cityClicked = await page.evaluate((city) => {
    const suggestions = document.querySelectorAll(
      '.van-cell, .suggestion-item, [class*="suggest"], .city-item, div[class*="item"]'
    );
    for (const sugg of suggestions) {
      if (sugg.textContent.includes(city)) {
        sugg.click();
        return true;
      }
    }
    return false;
  }, CONFIG.city);

  if (cityClicked) {
    console.log(` Clicked suggestion containing ${CONFIG.city}`);
    submitted = true;
    await wait(4000);
  } else {
    console.log(` No ${CONFIG.city} suggestion found, trying all suggestions...`);
    const suggestions = await page.$$('.van-cell, .suggestion-item, [class*="suggest"]');
    if (suggestions.length > 0) {
      console.log(` Found ${suggestions.length} suggestions, clicking first...`);
      await suggestions[0].click();
      submitted = true;
      await wait(3000);
    }
  }

  // If no suggestions, try submit button
  if (!submitted) {
    for (const selector of SUBMIT_SELECTORS) {
      try {
        const button = await page.$(selector);
        if (button) {
          console.log(` Clicking submit: ${selector}`);
          await button.click();
          submitted = true;
          await wait(3000);
          break;
        }
      } catch (err) {
        // Best-effort: a failing candidate must not abort the search.
        console.log(` Submit selector ${selector} failed: ${err.message}`);
      }
    }
  }

  // If still not submitted, try pressing Enter
  if (!submitted) {
    console.log(' Pressing Enter...');
    await page.keyboard.press('Enter');
    await wait(3000);
  }

  await screenshot(page, 'after_search');
}

/**
 * Fallback when no search box exists: click a visible element whose text is
 * exactly the city name.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function clickCityEntry(page) {
  console.log('❌ No search input found');
  console.log('🔍 Looking for city selector...');

  const cityFound = await page.evaluate((city) => {
    const elements = Array.from(document.querySelectorAll('a, div, span'));
    const cityEl = elements.find(el =>
      el.textContent.trim() === city && el.getBoundingClientRect().width > 0
    );
    if (cityEl) {
      cityEl.click();
      return true;
    }
    return false;
  }, CONFIG.city);

  if (cityFound) {
    console.log('✅ Clicked Shanghai');
    await wait(3000);
  } else {
    console.log('⚠️ Could not find city selector');
  }
}

/**
 * If navigation ended on the "/suggest" page, try to pick the city from it.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function handleSuggestPage(page) {
  let currentUrl = page.url();
  console.log(`\n📍 Current URL: ${currentUrl}`);

  if (!currentUrl.includes('/suggest')) {
    return;
  }

  console.log('⚠️ On suggestions page, looking for Shanghai option...\n');

  const cityFound = await page.evaluate((city) => {
    // Look for the city in hot recommendations or administrative areas
    const items = document.querySelectorAll(
      '.city-hot-item, .city-hot-item2, .city-item, div[class*="item"]'
    );
    for (const item of items) {
      const text = item.textContent.trim();
      if (text.includes(city)) {
        console.log(`Found ${city} option: ${text}`);
        item.click();
        return true;
      }
    }
    return false;
  }, CONFIG.city);

  if (cityFound) {
    console.log(` ✅ Clicked ${CONFIG.city} from suggestions`);
    await wait(4000);
    currentUrl = page.url();
    console.log(` 📍 New URL: ${currentUrl}`);
  } else {
    console.log(` ❌ ${CONFIG.city} not found in suggestions`);
    console.log(` 💡 Try searching for just the city name next time\n`);
  }
}

/**
 * Extract every listing card currently in the DOM.
 * Runs inside the browser, so the callback must be self-contained.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<Array<object>>} Raw listing records.
 */
async function extractVisibleListings(page) {
  return page.evaluate(() => {
    const results = [];

    // Possible selectors for listing cards; the one matching the most
    // elements wins.
    const cardSelectors = [
      '.house-item',
      '.room-item',
      '.van-card',
      '[class*="house"]',
      '[class*="room"]',
      '[class*="card"]'
    ];

    let items = [];
    for (const sel of cardSelectors) {
      const elements = document.querySelectorAll(sel);
      if (elements.length > items.length) {
        items = Array.from(elements);
      }
    }

    // Only valid CSS here: jQuery-style ':contains()' makes querySelector
    // throw, which would discard the whole card. Prices that appear only as
    // "¥123" / "123元" text are handled by the regex fallback below.
    const priceSelectors = ['.price', '.van-card__price', '[class*="price"]'];

    items.forEach((item, idx) => {
      try {
        const listing = {
          index: idx + 1,
          html: item.innerHTML.substring(0, 500),
          text: item.textContent.trim().substring(0, 300)
        };

        // Extract title
        const titleEl = item.querySelector(
          'h2, h3, h4, .title, .name, .van-card__title, [class*="title"]'
        );
        if (titleEl) {
          listing.title = titleEl.textContent.trim();
        }

        // Extract price
        for (const selector of priceSelectors) {
          const priceEl = item.querySelector(selector);
          if (!priceEl) {
            continue;
          }
          const priceText = priceEl.textContent;
          const match = priceText.match(/(\d+)/);
          if (match) {
            listing.priceDaily = parseInt(match[1], 10);
            listing.priceText = priceText.trim();
            break;
          }
        }

        // If no price found, search in all text
        if (!listing.priceDaily) {
          const priceMatch = item.textContent.match(/[¥￥]?\s*(\d+)\s*[元\/]/);
          if (priceMatch) {
            listing.priceDaily = parseInt(priceMatch[1], 10);
          }
        }

        // Extract location
        const locationEl = item.querySelector(
          '.location, .address, .area, [class*="location"]'
        );
        if (locationEl) {
          listing.location = locationEl.textContent.trim();
        }

        // Extract URL
        const linkEl = item.querySelector('a');
        if (linkEl) {
          listing.url = linkEl.href;
        }

        // Extract image
        const imgEl = item.querySelector('img');
        if (imgEl) {
          listing.image = imgEl.src;
        }

        // Check equipment mentions in text
        const fullText = item.textContent.toLowerCase();
        listing.hasKitchen = fullText.includes('厨房') || fullText.includes('kitchen');
        listing.hasFridge = fullText.includes('冰箱') || fullText.includes('fridge');
        listing.hasWashingMachine = fullText.includes('洗衣机') || fullText.includes('washing');
        listing.hasMetro = fullText.includes('地铁') || fullText.includes('metro') || fullText.includes('站');

        results.push(listing);
      } catch (e) {
        console.error('Error extracting listing:', e);
      }
    });

    return results;
  });
}

/**
 * Scroll the page up to CONFIG.maxScrolls times, collecting unique listings.
 * Stops early after three consecutive scrolls that add nothing new.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<Array<object>>} De-duplicated raw listings.
 */
async function collectListings(page) {
  const allListings = [];
  const seenKeys = new Set();
  let noChangeCount = 0;

  // Scroll to load more listings (lazy loading)
  for (let i = 0; i < CONFIG.maxScrolls; i++) {
    console.log(`🔄 Scroll ${i + 1}/${CONFIG.maxScrolls}...`);

    const listings = await extractVisibleListings(page);

    // Merge with previous, skipping anything already seen (by URL, or by
    // text for cards that have no link).
    let addedCount = 0;
    for (const listing of listings) {
      const key = listingKey(listing);
      if (seenKeys.has(key)) {
        continue;
      }
      seenKeys.add(key);
      allListings.push(listing);
      addedCount++;
    }

    console.log(` Found ${listings.length} on page, ${addedCount} new, ${allListings.length} total`);

    // Check if we got new listings
    if (addedCount === 0) {
      noChangeCount++;
      if (noChangeCount >= 3) {
        console.log(' No new listings for 3 scrolls, stopping...');
        break;
      }
    } else {
      noChangeCount = 0;
    }

    // Scroll down
    await page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });
    await wait(CONFIG.scrollDelay);
  }

  return allListings;
}

/**
 * Dump the page HTML and a short summary when nothing could be extracted.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function reportEmptyPage(page) {
  console.log('❌ No listings found!');
  console.log('💾 Saving page HTML for inspection...');

  const html = await page.content();
  fs.writeFileSync('./xiaozhu_interactive_page.html', html);

  console.log('\n📋 Page info:');
  const pageInfo = await page.evaluate(() => ({
    url: window.location.href,
    title: document.title,
    bodyText: document.body.textContent.substring(0, 500),
    elementCount: document.querySelectorAll('*').length
  }));
  console.log(` URL: ${pageInfo.url}`);
  console.log(` Title: ${pageInfo.title}`);
  console.log(` Elements: ${pageInfo.elementCount}`);
  console.log(` Body preview: ${pageInfo.bodyText.substring(0, 200)}...`);
}

/**
 * Entry point: launch the browser, navigate to the city's listings, extract
 * them and write the JSON / Markdown reports.
 * Errors during scraping are logged (with an error screenshot) rather than
 * rethrown. The browser is closed only in headless mode; otherwise it is left
 * open for manual inspection.
 * @returns {Promise<void>}
 */
async function scrapXiaozhu() {
  const cookies = await loadCookies();

  const browser = await puppeteer.launch({
    headless: CONFIG.headless ? "new" : false,
    defaultViewport: { width: 414, height: 896 }, // Mobile viewport (Xiaozhu is mobile-first)
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage'
    ]
  });

  // Created inside the try so a failure here still reaches the finally block
  // and the browser is not leaked.
  let page = null;

  try {
    page = await browser.newPage();

    // Mobile user agent
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1');

    // Load cookies if available
    if (cookies && cookies.length > 0) {
      try {
        await page.setCookie(...cookies);
        console.log(`🍪 Loaded ${cookies.length} cookies\n`);
      } catch (err) {
        console.log('⚠️ Could not load cookies:', err.message);
      }
    }

    console.log('🌐 Loading homepage...');
    await page.goto('https://minsu.xiaozhu.com/', {
      waitUntil: 'networkidle2',
      timeout: 30000
    });
    await wait(3000); // Wait for Vue app to initialize
    await screenshot(page, 'homepage');
    console.log('✅ Homepage loaded\n');

    // Strategy 1: Look for search input
    console.log('🔍 Looking for search input...');
    const searchInput = await findSearchInput(page);

    if (searchInput) {
      await runSearch(page, searchInput);
    } else {
      // Strategy 2: Look for city/location selector
      await clickCityEntry(page);
    }

    // If we landed on the /suggest page, try to pick the city from there
    await handleSuggestPage(page);

    await screenshot(page, 'before_extraction');
    console.log('');

    // Extract listings
    console.log('📊 Extracting listings...\n');
    const allListings = await collectListings(page);

    await screenshot(page, 'final');
    console.log(`\n✅ Total extracted: ${allListings.length} listings\n`);

    if (allListings.length === 0) {
      await reportEmptyPage(page);
    } else {
      // Process and filter listings
      const processed = processListings(allListings);

      // Save results
      fs.writeFileSync(CONFIG.outputFile, JSON.stringify(processed, null, 2));
      console.log(`💾 Results saved: ${CONFIG.outputFile}`);

      const markdown = generateMarkdown(processed);
      fs.writeFileSync(CONFIG.outputMarkdown, markdown);
      console.log(`📝 Markdown saved: ${CONFIG.outputMarkdown}`);

      // Print top results
      printTopResults(processed);
    }
  } catch (err) {
    console.error('❌ Error:', err.message);
    console.error(err.stack);
    if (page) {
      try {
        await screenshot(page, 'error');
      } catch (shotErr) {
        // The page may already be unusable; don't mask the original error.
        console.error('⚠️ Could not capture error screenshot:', shotErr.message);
      }
    }
  } finally {
    if (CONFIG.headless) {
      await browser.close();
    } else {
      console.log('\n⏸️ Browser kept open for inspection. Close manually when done.');
    }
  }
}

/**
 * Score, filter and rank raw listings.
 *
 * Keeps only listings with a positive daily price, a kitchen AND a fridge,
 * and a whole-stay price within CONFIG.dailyBudgetMax; returns the
 * CONFIG.topN best by score. Input objects are not modified.
 *
 * @param {Array<object>} listings - Raw listings from the page.
 * @returns {Array<object>} Copies of the listings with priceTotal,
 *   priceMonthly and score added, sorted by descending score.
 */
function processListings(listings) {
  // Whole-stay budgets (see the note on the CONFIG getters).
  const stayBudgetIdeal = CONFIG.dailyBudgetIdeal;
  const stayBudgetMax = CONFIG.dailyBudgetMax;

  return listings
    .filter(l => l.priceDaily && l.priceDaily > 0)
    .map(l => {
      // Calculate total price
      const priceTotal = l.priceDaily * CONFIG.days;
      const priceMonthly = Math.ceil(l.priceDaily * 30);

      let score = 0;

      // Price scoring: reward savings under the ideal budget, penalise the
      // stretch between ideal and max, heavily penalise anything over max.
      if (priceTotal <= stayBudgetIdeal) {
        score += (stayBudgetIdeal - priceTotal) / 100;
      } else if (priceTotal <= stayBudgetMax) {
        score -= (priceTotal - stayBudgetIdeal) / 50;
      } else {
        score -= 100;
      }

      // Equipment bonuses
      if (l.hasKitchen) score += 20;
      if (l.hasFridge) score += 15;
      if (l.hasWashingMachine) score += 10;
      if (l.hasMetro) score += 15;

      // Location bonus
      if (l.location) {
        if (l.location.includes(CONFIG.district)) score += 20;
        if (l.location.includes(CONFIG.keyword)) score += 10;
      }
      if (l.title && l.title.includes(CONFIG.keyword)) {
        score += 10;
      }

      return {
        ...l,
        priceTotal,
        priceMonthly,
        score: Math.round(score * 10) / 10
      };
    })
    .filter(l => l.hasKitchen && l.hasFridge) // Required
    .filter(l => l.priceTotal <= stayBudgetMax) // Budget
    .sort((a, b) => b.score - a.score)
    .slice(0, CONFIG.topN);
}

/**
 * Make free text safe for a single Markdown table cell.
 * @param {string} text
 * @returns {string} Text with newlines collapsed and pipes escaped.
 */
function escapeTableCell(text) {
  return String(text).replace(/\r?\n/g, ' ').replace(/\|/g, '\\|');
}

/**
 * Render processed listings as a Markdown report.
 * @param {Array<object>} listings - Output of processListings().
 * @returns {string} Markdown document.
 */
function generateMarkdown(listings) {
  let md = '# Xiaozhu Search Results - Interactive Scraper\n\n';
  md += `**Date:** ${new Date().toLocaleDateString()}\n`;
  md += `**Location:** ${CONFIG.city} ${CONFIG.district}\n`;
  md += `**Dates:** ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)\n`;
  md += `**Budget:** ${CONFIG.budgetIdeal}-${CONFIG.budgetMax} RMB/month\n\n`;

  md += '| # | Title | Daily | Total | Kitchen | Fridge | Washer | Metro | Score | Link |\n';
  md += '|---|-------|-------|-------|---------|--------|--------|-------|-------|------|\n';

  listings.forEach((l, i) => {
    // Scraped titles may contain '|' or line breaks, which would split the row.
    const title = escapeTableCell((l.title || 'Untitled').substring(0, 40));
    md += `| ${i + 1} `;
    md += `| ${title} `;
    md += `| ¥${l.priceDaily} `;
    md += `| ¥${l.priceTotal} `;
    md += `| ${l.hasKitchen ? '✓' : '✗'} `;
    md += `| ${l.hasFridge ? '✓' : '✗'} `;
    md += `| ${l.hasWashingMachine ? '✓' : '✗'} `;
    md += `| ${l.hasMetro ? '✓' : '✗'} `;
    md += `| ${l.score} `;
    md += `| ${l.url ? `[View](${l.url})` : '-'} |\n`;
  });

  return md;
}

/**
 * Print the five best listings to the console.
 * @param {Array<object>} listings - Output of processListings().
 * @returns {void}
 */
function printTopResults(listings) {
  console.log('\n🏆 TOP RESULTS:\n');

  listings.slice(0, 5).forEach((l, i) => {
    console.log(`${i + 1}. ${l.title || 'Untitled'}`);
    console.log(` 💰 ¥${l.priceDaily}/day × ${CONFIG.days} days = ¥${l.priceTotal} total (~¥${l.priceMonthly}/month)`);
    if (l.location) console.log(` 📍 ${l.location}`);
    console.log(` ✓ Kitchen: ${l.hasKitchen ? '✓' : '✗'} | Fridge: ${l.hasFridge ? '✓' : '✗'} | Washer: ${l.hasWashingMachine ? '✓' : '✗'} | Metro: ${l.hasMetro ? '✓' : '✗'}`);
    console.log(` ⭐ Score: ${l.score}`);
    if (l.url) console.log(` 🔗 ${l.url}`);
    console.log('');
  });
}

// Run
scrapXiaozhu().catch(console.error);