const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Xiaozhu Fixed Scraper - With geolocation override and smart navigation
 * Fixes: Geolocation to Shanghai, better suggestion detection, city verification
 *
 * Flow: launch a mobile-emulated browser, try a handful of direct city URLs,
 * fall back to the homepage search box, verify the page shows Shanghai content,
 * scroll to collect listing cards, then score/filter them and write JSON and
 * Markdown reports.
 */

const CONFIG = {
  // Location - Search specifically for Jiaotong University area
  city: '上海',
  searchQuery: '交通大学', // Just the university name for better suggestions
  cityEnglish: 'shanghai',
  district: '徐汇区',
  keyword: '交通大学',

  // Shanghai Xujiahui coordinates
  latitude: 31.1880,
  longitude: 121.4367,

  // Dates
  checkIn: '2025-12-24',
  checkOut: '2026-01-22',
  days: 29,

  // Budget (logged at startup as RMB/month)
  budgetIdeal: 4000,
  budgetMax: 5000,
  // NOTE: despite the "daily" prefix, these getters return the budget for the
  // WHOLE stay: the monthly figure prorated over `days` using a 30-day month.
  // They are compared against `priceTotal` in processListings().
  get dailyBudgetIdeal() {
    return Math.ceil(this.budgetIdeal / 30 * this.days);
  },
  get dailyBudgetMax() {
    return Math.ceil(this.budgetMax / 30 * this.days);
  },

  // Equipment (these two lists are currently not referenced by the code in this file)
  required: ['厨房', '冰箱'],
  bonus: ['洗衣机', '地铁'],

  // Scraping - More aggressive to load everything
  maxScrolls: 50,
  scrollDelay: 3500, // Longer wait for lazy load
  interactionDelay: 1000, // currently not referenced by the code in this file
  noChangeThreshold: 7, // Wait 7 scrolls without change before stopping

  // Output
  outputFile: './xiaozhu_results.json',
  outputMarkdown: './xiaozhu_results.md',
  topN: 20,

  // Debug
  headless: true,
  screenshots: true
};

console.log('🚀 Xiaozhu FIXED Scraper - Jiaotong University Focus');
console.log(`📍 Search: ${CONFIG.searchQuery}`);
console.log(`🎯 Target: ${CONFIG.keyword} (${CONFIG.district})`);
console.log(`🌍 Geolocation: ${CONFIG.latitude}, ${CONFIG.longitude}`);
console.log(`📅 Dates: ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)`);
console.log(`💰 Budget: ${CONFIG.budgetIdeal}-${CONFIG.budgetMax} RMB/month\n`);

/**
 * Resolve after the given delay.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Save a full-page screenshot when CONFIG.screenshots is enabled.
 * The file name embeds `name` and the current timestamp.
 * @param {object} page - Puppeteer page.
 * @param {string} name - Label used in the file name.
 * @returns {Promise<void>}
 */
async function screenshot(page, name) {
  if (!CONFIG.screenshots) return;
  const filename = `./xiaozhu_${name}_${Date.now()}.png`;
  await page.screenshot({ path: filename, fullPage: true });
  console.log(`📸 ${filename}`);
}

/**
 * Read previously saved cookies from ./xiaozhu_cookies.json.
 * @returns {Promise<*>} The parsed JSON content, or null when the file is
 *   missing, unreadable, or not valid JSON.
 */
async function loadCookies() {
  let raw;
  try {
    raw = fs.readFileSync('./xiaozhu_cookies.json', 'utf8');
  } catch (err) {
    // A missing cookie file is the normal "no saved session" case; stay quiet.
    if (err.code !== 'ENOENT') {
      console.log('⚠️ Could not read cookie file:', err.message);
    }
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch (err) {
    console.log('⚠️ Cookie file is not valid JSON:', err.message);
    return null;
  }
}

/**
 * Browser-side helper (serialized by page.evaluate, so it must not reference
 * anything from the Node scope). Scores visible elements against the search
 * terms and clicks the best one.
 * @param {string} searchQuery - Text that was typed into the search box.
 * @param {string} keyword - Preferred keyword to match.
 * @returns {boolean} true when an element was clicked.
 */
function clickBestSuggestion(searchQuery, keyword) {
  // Look for suggestions containing our keyword (交通大学)
  const allElements = document.querySelectorAll('div, li, a, span');
  const matchingElements = [];

  for (const el of allElements) {
    const text = el.textContent.trim();
    const rect = el.getBoundingClientRect();

    // Must be visible
    if (rect.width > 0 && rect.height > 0) {
      if (text.includes(keyword)) {
        // Prioritize exact keyword match
        matchingElements.push({ el, text, score: 100 });
      } else if (text.includes(searchQuery)) {
        // Or search query match
        matchingElements.push({ el, text, score: 80 });
      } else if (text.includes('上海') && text.length < 15) {
        // Or contains Shanghai
        matchingElements.push({ el, text, score: 30 });
      }
    }
  }

  // Sort by score; on equal score prefer the shortest text. Every ancestor of
  // a matching suggestion also contains the keyword, so without this tie-break
  // the outermost container (first in document order) would be clicked.
  matchingElements.sort((a, b) => b.score - a.score || a.text.length - b.text.length);

  if (matchingElements.length > 0) {
    // This log goes to the browser console, not to the Node process output.
    console.log(`Found ${matchingElements.length} matching elements, clicking best: "${matchingElements[0].text}"`);
    matchingElements[0].el.click();
    return true;
  }
  return false;
}

/**
 * Browser-side helper (serialized by page.evaluate). Tries to switch the page
 * to the given city: first by clicking an element whose text is exactly the
 * city name (optionally suffixed with '市'), otherwise by filling the first
 * visible text/search input and dispatching a submit or Enter keydown event.
 * @param {string} city - City name to switch to.
 * @returns {boolean} true when a click or input submission was triggered.
 */
function switchCityInPage(city) {
  // Look for any clickable Shanghai element
  const elements = Array.from(document.querySelectorAll('a, div, span, button'));
  for (const el of elements) {
    const text = el.textContent.trim();
    if ((text === city || text === city + '市') && el.getBoundingClientRect().width > 0) {
      console.log(`Clicking: "${text}"`);
      el.click();
      return true;
    }
  }

  // Try searching in a visible input
  const inputs = document.querySelectorAll('input[type="text"], input[type="search"]');
  for (const input of inputs) {
    if (input.getBoundingClientRect().width > 0) {
      input.value = city;
      input.dispatchEvent(new Event('input', { bubbles: true }));

      // Try to submit
      const form = input.closest('form');
      if (form) {
        form.dispatchEvent(new Event('submit', { bubbles: true }));
        return true;
      }

      // Or press Enter
      const enterEvent = new KeyboardEvent('keydown', {
        key: 'Enter',
        code: 'Enter',
        keyCode: 13,
        bubbles: true
      });
      input.dispatchEvent(enterEvent);
      return true;
    }
  }
  return false;
}

/**
 * Browser-side helper (serialized by page.evaluate). Extracts up to 50 listing
 * cards from the current DOM using the first selector that matches between
 * 1 and 199 elements.
 * @returns {Array<object>} Listings with at least a title or a daily price.
 */
function extractVisibleListings() {
  const results = [];
  const selectors = [
    '.list-item', // PRIMARY - Found in HTML analysis
    '.house-item',
    '.room-item',
    '.van-card',
    '[class*="list-item"]',
    '[class*="house"]',
    '[class*="room"]'
  ];

  let items = [];
  for (const sel of selectors) {
    const elements = document.querySelectorAll(sel);
    if (elements.length > 0 && elements.length < 200) {
      items = Array.from(elements);
      console.log(`Using selector: ${sel} (${elements.length} items)`);
      break; // Use first working selector
    }
  }

  items.slice(0, 50).forEach((item, idx) => {
    const listing = { index: idx + 1 };

    // Debug: log all attributes of first item (browser console only)
    if (idx === 0) {
      console.log('DEBUG First item attributes:', {
        className: item.className,
        id: item.id,
        attributes: Array.from(item.attributes || []).map(a => `${a.name}=${a.value}`),
        innerHTML: item.innerHTML.substring(0, 200)
      });
    }

    // Title - Try specific Xiaozhu classes first
    const titleEl = item.querySelector('.list-title, h2, h3, h4, .title, .name, [class*="title"]');
    if (titleEl) listing.title = titleEl.textContent.trim();

    // Price - Try specific Xiaozhu classes first
    const priceEl = item.querySelector('.list-price, .price-left, .price, [class*="price"]');
    if (priceEl) {
      const match = priceEl.textContent.match(/(\d+)/);
      if (match) {
        listing.priceDaily = Number.parseInt(match[1], 10);
        listing.priceText = priceEl.textContent.trim();
      }
    }
    // Fallback: look for a number followed by 元, / or 晚 anywhere in the card
    if (!listing.priceDaily) {
      const match = item.textContent.match(/[¥¥]?\s*(\d+)\s*[元\/晚]/);
      if (match) listing.priceDaily = Number.parseInt(match[1], 10);
    }

    // Location - Extract from content or title
    const contentEl = item.querySelector('.list-content, .content, .location, .address');
    if (contentEl) listing.location = contentEl.textContent.trim();
    // Also check title for location keywords
    if (!listing.location && listing.title) {
      listing.location = listing.title;
    }

    // URL - Try multiple approaches
    // 1. Direct link
    const linkEl = item.querySelector('a');
    if (linkEl && linkEl.href && linkEl.href !== 'javascript:;') {
      listing.url = linkEl.href;
    }

    // 2. Data attributes (listing ID)
    if (!listing.url) {
      const dataId = item.getAttribute('data-id') ||
        item.getAttribute('data-house-id') ||
        item.getAttribute('data-fid');
      if (dataId) {
        listing.url = `https://minsu.xiaozhu.com/house/${dataId}`;
        listing.houseId = dataId;
      }
    }

    // 3. Look for ID in onclick or other attributes
    if (!listing.url) {
      const onclick = item.getAttribute('onclick') || item.getAttribute('@click');
      if (onclick) {
        const idMatch = onclick.match(/\d{6,}/);
        if (idMatch) {
          listing.url = `https://minsu.xiaozhu.com/house/${idMatch[0]}`;
          listing.houseId = idMatch[0];
        }
      }
    }

    // 4. Check child elements for router-link
    if (!listing.url) {
      const routerLink = item.querySelector('[to], [router-link]');
      if (routerLink) {
        const to = routerLink.getAttribute('to') || routerLink.getAttribute('router-link');
        if (to) {
          listing.url = `https://minsu.xiaozhu.com${to}`;
        }
      }
    }

    // Image
    const imgEl = item.querySelector('img');
    if (imgEl) listing.image = imgEl.src;

    // Equipment (check Chinese text, not lowercased)
    const fullText = item.textContent;
    listing.hasKitchen = fullText.includes('厨房') || fullText.includes('可做饭') || fullText.includes('可烧饭');
    listing.hasFridge = fullText.includes('冰箱') || fullText.includes('冷藏');
    listing.hasWashingMachine = fullText.includes('洗衣机');
    listing.hasMetro = fullText.includes('地铁') || fullText.includes('站');

    if (listing.title || listing.priceDaily) {
      results.push(listing);
    }
  });

  return results;
}

/**
 * Decide whether two extracted listings describe the same card.
 * @param {object} a - First listing.
 * @param {object} b - Second listing.
 * @returns {boolean} true when they should be treated as duplicates.
 */
function isSameListing(a, b) {
  // By URL if available
  if (a.url && b.url && a.url === b.url) return true;

  // By title + price combination
  if (a.title && b.title && a.priceDaily && b.priceDaily) {
    return a.title === b.title && a.priceDaily === b.priceDaily;
  }

  // Both have URLs and they differ: distinct listings.
  if (a.url && b.url) return false;

  // Incomplete records (missing title or price): the same card is re-extracted
  // with the same fields on every scroll, so compare what is available.
  // Without this, such cards were re-added on each scroll and the
  // "no new listings" stop condition could never trigger.
  return a.title === b.title && a.priceDaily === b.priceDaily;
}

/**
 * Strategy 1: probe a list of candidate city URLs and stop at the first one
 * that loads without an HTTP error and without 404-looking body text.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<string|null>} The URL that worked, or null.
 */
async function tryDirectUrls(page) {
  const directUrls = [
    `https://minsu.xiaozhu.com/${CONFIG.cityEnglish}`,
    `https://minsu.xiaozhu.com/city/${CONFIG.cityEnglish}`,
    `https://minsu.xiaozhu.com/search/${CONFIG.cityEnglish}`,
    `https://minsu.xiaozhu.com/${CONFIG.cityEnglish}/${CONFIG.district}`
  ];

  for (const url of directUrls) {
    try {
      console.log(` Trying: ${url}`);
      const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: 15000 });
      await wait(2000);

      // page.goto may resolve with null; treat that as "status unknown".
      const status = response ? response.status() : 0;
      if (status >= 400) {
        console.log(` ❌ HTTP ${status}`);
        continue;
      }

      // Text heuristic for client-rendered "not found" pages.
      const is404 = await page.evaluate(() => {
        return document.body.textContent.includes('404') ||
          document.body.textContent.includes('找不到');
      });

      if (!is404) {
        console.log(` ✅ Success!`);
        await screenshot(page, 'direct_url_success');
        return url;
      }
      console.log(` ❌ 404`);
    } catch (e) {
      console.log(` ❌ Failed: ${e.message}`);
    }
  }
  return null;
}

/**
 * Strategy 2: open the homepage, type the search query into the first search
 * input found, and click the best-matching suggestion (or press Enter).
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function searchFromHomepage(page) {
  console.log('\n🔍 Strategy 2: Homepage search with geolocation...');
  await page.goto('https://minsu.xiaozhu.com/', { waitUntil: 'networkidle2', timeout: 30000 });
  await wait(3000);
  await screenshot(page, 'homepage');

  // Check if geolocation worked and we see Shanghai content
  const cityDetected = await page.evaluate(() => {
    const bodyText = document.body.textContent;
    if (bodyText.includes('上海') || bodyText.includes('Shanghai')) {
      return '上海';
    } else if (bodyText.includes('北京') || bodyText.includes('Beijing')) {
      return '北京';
    }
    return 'unknown';
  });
  console.log(` Detected city: ${cityDetected}`);

  // Find search input
  console.log('\n⌨️ Using search...');
  const searchSelectors = [
    'input[placeholder*="目的地"]',
    'input[placeholder*="搜索"]',
    'input[type="search"]',
    'input[type="text"]'
  ];

  let searchInput = null;
  for (const selector of searchSelectors) {
    searchInput = await page.$(selector);
    if (searchInput) {
      console.log(` Found: ${selector}`);
      break;
    }
  }

  if (!searchInput) {
    console.log(' ⚠️ No search input found');
    return;
  }

  await searchInput.click();
  await wait(500);

  // ===== FIX 3: Clear any pre-filled text first =====
  await page.keyboard.down('Control');
  await page.keyboard.press('A');
  await page.keyboard.up('Control');
  await page.keyboard.press('Backspace');

  // Type specific search query for Jiaotong University
  await searchInput.type(CONFIG.searchQuery, { delay: 150 });
  await wait(2000); // Wait for suggestions
  await screenshot(page, 'search_typed');

  // ===== FIX 4: Smart suggestion detection =====
  console.log(`\n👆 Looking for suggestions matching "${CONFIG.searchQuery}"...`);
  const shanghaiClicked = await page.evaluate(clickBestSuggestion, CONFIG.searchQuery, CONFIG.keyword);

  if (shanghaiClicked) {
    console.log(' ✅ Clicked matching suggestion');
    await wait(4000);
    await screenshot(page, 'after_suggestion');
  } else {
    console.log(' ⚠️ No matching suggestion, pressing Enter...');
    await page.keyboard.press('Enter');
    await wait(3000);
  }
}

/**
 * FIX 5: report whether the current page shows Shanghai or Beijing content
 * and, when only Beijing content is present, try to switch to CONFIG.city.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function ensureShanghai(page) {
  let currentUrl = page.url();
  console.log(`\n📍 Current URL: ${currentUrl}`);

  const cityCheck = await page.evaluate(() => {
    const text = document.body.textContent;
    return {
      hasShanghai: text.includes('上海') || text.includes('Shanghai'),
      hasBeijing: text.includes('北京') || text.includes('Beijing') ||
        text.includes('天安门') || text.includes('朝阳'),
      bodyPreview: text.substring(0, 300)
    };
  });

  console.log(` Shanghai content: ${cityCheck.hasShanghai ? '✅' : '❌'}`);
  console.log(` Beijing content: ${cityCheck.hasBeijing ? '⚠️ YES' : '✅ No'}`);

  if (!(cityCheck.hasBeijing && !cityCheck.hasShanghai)) return;

  console.log('\n🔧 Detected Beijing, attempting to switch to Shanghai...');
  const switched = await page.evaluate(switchCityInPage, CONFIG.city);

  if (switched) {
    console.log(' ✅ Triggered Shanghai switch');
    await wait(4000);
    await screenshot(page, 'after_switch');
    currentUrl = page.url();
    console.log(` 📍 New URL: ${currentUrl}`);
  } else {
    console.log(' ❌ Could not find Shanghai option');
  }
}

/**
 * Scroll the page repeatedly and accumulate de-duplicated listings until
 * CONFIG.maxScrolls is reached or CONFIG.noChangeThreshold consecutive
 * scrolls yield nothing new.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<Array<object>>} All unique listings found.
 */
async function collectListings(page) {
  const allListings = [];
  let noChangeCount = 0;

  // FIRST: Scroll to bottom to trigger all lazy loading at once
  console.log('⏬ Scrolling to page bottom to trigger lazy load...');
  await page.evaluate(() => {
    window.scrollTo(0, document.body.scrollHeight);
  });
  await wait(5000); // Wait for initial load

  // SECOND: Progressive scrolling to load more
  console.log(`⏳ Progressive scrolling (max ${CONFIG.maxScrolls} scrolls, ${CONFIG.noChangeThreshold} patience)...\n`);

  for (let i = 0; i < CONFIG.maxScrolls; i++) {
    const progress = Math.round((i / CONFIG.maxScrolls) * 100);
    console.log(`🔄 Scroll ${i + 1}/${CONFIG.maxScrolls} (${progress}%)...`);

    const listings = await page.evaluate(extractVisibleListings);

    // Compare each candidate against everything kept so far, including items
    // accepted earlier in this same batch.
    let newCount = 0;
    for (const candidate of listings) {
      const isDuplicate = allListings.some(existing => isSameListing(candidate, existing));
      if (!isDuplicate) {
        allListings.push(candidate);
        newCount++;
      }
    }

    console.log(` Found ${listings.length} items, ${newCount} new, ${allListings.length} total`);

    if (newCount === 0) {
      noChangeCount++;
      if (noChangeCount >= CONFIG.noChangeThreshold) {
        console.log(` No new listings for ${CONFIG.noChangeThreshold} scrolls, stopping...`);
        break;
      }
    } else {
      noChangeCount = 0;
    }

    // Scroll down
    await page.evaluate(() => window.scrollBy(0, window.innerHeight));

    // Wait for loading indicators to disappear
    try {
      await page.waitForFunction(() => {
        // Check for common loading indicators
        const loadingEls = document.querySelectorAll('.loading, .spinner, [class*="loading"]');
        return loadingEls.length === 0 ||
          Array.from(loadingEls).every(el => el.style.display === 'none');
      }, { timeout: 2000 });
    } catch (e) {
      // Deliberate best-effort: a timeout here just means an indicator is
      // still present after 2s; the fixed delay below still applies.
    }

    // Additional wait for lazy load
    await wait(CONFIG.scrollDelay);
  }

  return allListings;
}

/**
 * Dump the page HTML and print a short diagnosis when no listings were found.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function reportEmptyResult(page) {
  console.log('❌ No listings found!');
  const html = await page.content();
  fs.writeFileSync('./xiaozhu_fixed_page.html', html);
  console.log('💾 Saved HTML to xiaozhu_fixed_page.html');

  const pageInfo = await page.evaluate(() => ({
    url: window.location.href,
    title: document.title,
    hasShanghai: document.body.textContent.includes('上海'),
    hasBeijing: document.body.textContent.includes('北京'),
    bodyPreview: document.body.textContent.substring(0, 500)
  }));

  console.log('\n📋 Page diagnosis:');
  console.log(` URL: ${pageInfo.url}`);
  console.log(` Title: ${pageInfo.title}`);
  console.log(` Has Shanghai: ${pageInfo.hasShanghai ? '✅' : '❌'}`);
  console.log(` Has Beijing: ${pageInfo.hasBeijing ? '⚠️' : '✅'}`);
  console.log(` Preview: ${pageInfo.bodyPreview.substring(0, 200)}...`);
}

/**
 * Entry point: run the whole scrape and write the result files.
 * Errors during the scrape are logged (with a best-effort screenshot) rather
 * than rethrown.
 * @returns {Promise<void>}
 */
async function scrapXiaozhu() {
  const cookies = await loadCookies();

  const browser = await puppeteer.launch({
    headless: CONFIG.headless ? "new" : false,
    defaultViewport: { width: 414, height: 896 },
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage'
    ]
  });

  let page = null;

  // Everything after launch runs inside try/finally so a failure during page
  // setup cannot leave the browser process running.
  try {
    page = await browser.newPage();

    // Mobile user agent
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1');

    // ===== FIX 1: Override geolocation to Shanghai =====
    console.log('🌍 Setting geolocation to Shanghai Xujiahui...');
    const context = browser.defaultBrowserContext();
    await context.overridePermissions('https://minsu.xiaozhu.com', ['geolocation']);
    await page.setGeolocation({
      latitude: CONFIG.latitude,
      longitude: CONFIG.longitude,
      accuracy: 100
    });
    console.log(`✅ Geolocation set to ${CONFIG.latitude}, ${CONFIG.longitude}\n`);

    // Load cookies
    if (cookies && cookies.length > 0) {
      try {
        await page.setCookie(...cookies);
        console.log(`🍪 Loaded ${cookies.length} cookies\n`);
      } catch (err) {
        console.log('⚠️ Cookie error:', err.message);
      }
    }

    // ===== FIX 2: Try direct URL first =====
    console.log('🔍 Strategy 1: Trying direct Shanghai URL...');
    const successUrl = await tryDirectUrls(page);

    // If direct URL failed, use homepage search
    if (!successUrl) {
      await searchFromHomepage(page);
    }

    // ===== FIX 5: Verify we're on Shanghai, if not, fix it =====
    await ensureShanghai(page);

    // Extract listings
    console.log('\n📊 Extracting listings...\n');
    const allListings = await collectListings(page);

    await screenshot(page, 'final');

    console.log(`\n✅ Total extracted: ${allListings.length} listings\n`);

    // Save raw listings for debug
    fs.writeFileSync('./xiaozhu_raw_listings.json', JSON.stringify(allListings, null, 2));
    console.log('💾 Raw listings saved to xiaozhu_raw_listings.json\n');

    if (allListings.length === 0) {
      await reportEmptyResult(page);
    } else {
      const processed = processListings(allListings);

      fs.writeFileSync(CONFIG.outputFile, JSON.stringify(processed, null, 2));
      console.log(`💾 ${CONFIG.outputFile}`);

      const markdown = generateMarkdown(processed);
      fs.writeFileSync(CONFIG.outputMarkdown, markdown);
      console.log(`📝 ${CONFIG.outputMarkdown}`);

      printTopResults(processed);
    }
  } catch (err) {
    console.error('❌ Error:', err.message);
    // The error screenshot is best-effort; it must not mask the error above.
    if (page) {
      try {
        await screenshot(page, 'error');
      } catch (shotErr) {
        console.error('⚠️ Could not capture error screenshot:', shotErr.message);
      }
    }
  } finally {
    // Only closed in headless mode. NOTE(review): in headed mode the browser
    // is left open, presumably for manual inspection - confirm this is intended.
    if (CONFIG.headless) {
      await browser.close();
    }
  }
}

/**
 * Score, filter and rank raw listings.
 *
 * Keeps listings with a positive daily price, adds `priceTotal` (daily price
 * times CONFIG.days), `priceMonthly` (daily price times 30) and `score`, drops
 * those more than 20% above the prorated max budget, sorts by score
 * descending and returns the top CONFIG.topN. Input objects are not modified.
 *
 * @param {Array<object>} listings - Raw listings as produced by the scraper.
 * @returns {Array<object>} Ranked copies of the qualifying listings.
 */
function processListings(listings) {
  // Budget for the whole stay (see the getter notes in CONFIG).
  const stayBudgetIdeal = CONFIG.dailyBudgetIdeal;
  const stayBudgetMax = CONFIG.dailyBudgetMax;

  return listings
    .filter(l => l.priceDaily && l.priceDaily > 0)
    .map(l => {
      const priceTotal = l.priceDaily * CONFIG.days;
      const priceMonthly = Math.ceil(l.priceDaily * 30);

      let score = 0;

      // Price: reward being under the ideal budget, penalise being over it.
      if (priceTotal <= stayBudgetIdeal) {
        score += (stayBudgetIdeal - priceTotal) / 100;
      } else if (priceTotal <= stayBudgetMax) {
        score -= (priceTotal - stayBudgetIdeal) / 50;
      } else {
        score -= 100;
      }

      // Equipment and transport bonuses.
      if (l.hasKitchen) score += 20;
      if (l.hasFridge) score += 15;
      if (l.hasWashingMachine) score += 10;
      if (l.hasMetro) score += 15;

      // Location bonuses.
      if (l.location) {
        if (l.location.includes(CONFIG.district)) score += 20;
        if (l.location.includes(CONFIG.keyword)) score += 10;
      }
      if (l.title && l.title.includes(CONFIG.keyword)) score += 10;

      // Copy instead of mutating the caller's objects; score keeps one decimal.
      return { ...l, priceTotal, priceMonthly, score: Math.round(score * 10) / 10 };
    })
    // Equipment is intentionally NOT a hard filter: it might only be shown as
    // icons, which the text-based detection cannot see.
    .filter(l => l.priceTotal <= stayBudgetMax * 1.2) // Allow 20% over budget
    .sort((a, b) => b.score - a.score)
    .slice(0, CONFIG.topN);
}

/**
 * Make a value safe for a single Markdown table cell: collapse whitespace
 * (including newlines), truncate, then escape pipe characters.
 * @param {string} text - Raw cell text.
 * @param {number} maxLength - Maximum number of characters kept before escaping.
 * @returns {string} Cell-safe text.
 */
function toMarkdownCell(text, maxLength) {
  return String(text)
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, maxLength)
    .replace(/\|/g, '\\|');
}

/**
 * Render processed listings as a Markdown report with a results table.
 * @param {Array<object>} listings - Output of processListings().
 * @returns {string} Markdown document.
 */
function generateMarkdown(listings) {
  let md = '# Xiaozhu Results - FIXED Scraper\n\n';
  md += `**Date:** ${new Date().toLocaleDateString()}\n`;
  md += `**Location:** ${CONFIG.city} ${CONFIG.district}\n`;
  md += `**Dates:** ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)\n\n`;

  md += '| # | Title | Daily | Total | Kitchen | Fridge | Washer | Metro | Score | Link |\n';
  md += '|---|-------|-------|-------|---------|--------|--------|-------|-------|------|\n';

  listings.forEach((l, i) => {
    md += `| ${i + 1} `;
    // Titles come from scraped text and may contain newlines or '|'.
    md += `| ${toMarkdownCell(l.title || 'Untitled', 40)} `;
    md += `| ¥${l.priceDaily} `;
    md += `| ¥${l.priceTotal} `;
    md += `| ${l.hasKitchen ? '✓' : '✗'} `;
    md += `| ${l.hasFridge ? '✓' : '✗'} `;
    md += `| ${l.hasWashingMachine ? '✓' : '✗'} `;
    md += `| ${l.hasMetro ? '✓' : '✗'} `;
    md += `| ${l.score} `;
    md += `| ${l.url ? `[View](${l.url})` : '-'} |\n`;
  });

  return md;
}

/**
 * Print the five best listings to the console.
 * @param {Array<object>} listings - Output of processListings().
 * @returns {void}
 */
function printTopResults(listings) {
  console.log('\n🏆 TOP RESULTS:\n');

  listings.slice(0, 5).forEach((l, i) => {
    console.log(`${i + 1}. ${l.title || 'Untitled'}`);
    console.log(` 💰 ¥${l.priceDaily}/day × ${CONFIG.days} days = ¥${l.priceTotal}`);
    if (l.location) console.log(` 📍 ${l.location}`);
    console.log(` ✓ Kitchen: ${l.hasKitchen ? '✓' : '✗'} | Fridge: ${l.hasFridge ? '✓' : '✗'} | Washer: ${l.hasWashingMachine ? '✓' : '✗'} | Metro: ${l.hasMetro ? '✓' : '✗'}`);
    console.log(` ⭐ ${l.score}`);
    if (l.url) console.log(` 🔗 ${l.url}`);
    console.log('');
  });
}

scrapXiaozhu().catch(console.error);