const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Xiaozhu URL Extractor - Click listings to get real URLs
 * Workaround for Vue.js router-link navigation
 */

const CONFIG = {
  // Was stored as mojibake (UTF-8 bytes decoded through a single-byte code
  // page), which made the search type garbage into the input.
  searchQuery: '交通大学',
  latitude: 31.1880,
  longitude: 121.4367,
  maxListings: 10
};

const SITE_ORIGIN = 'https://minsu.xiaozhu.com';
const PRIMARY_LISTING_SELECTOR = '.list-item';
const FALLBACK_LISTING_SELECTORS = [
  '[class*="list-item"]',
  '[class*="house"]',
  '[class*="room"]'
];
const OUTPUT_FILE = './xiaozhu_urls.json';

console.log('🔗 Xiaozhu URL Extractor');
console.log('🎯 Extracting real URLs by simulating clicks...\n');

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Click the visible element that best matches `keyword` and report its text.
 *
 * Every ancestor of a suggestion also "contains" the keyword, so the candidate
 * with the shortest text is chosen (an exact match is necessarily shortest).
 * On ties the later element in document order wins, i.e. the innermost one,
 * so the click bubbles through any handlers on its ancestors.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} keyword
 * @returns {Promise<string|null>} Text of the clicked element, or null if none matched.
 */
async function clickSuggestion(page, keyword) {
  return page.evaluate((needle) => {
    let best = null;
    for (const el of document.querySelectorAll('div, li, a, span')) {
      const text = el.textContent.trim();
      if (!text.includes(needle)) continue;
      const rect = el.getBoundingClientRect();
      if (!(rect.width > 0 && rect.height > 0)) continue;
      if (best === null || text.length <= best.text.length) {
        best = { el, text };
      }
    }
    if (best === null) return null;
    best.el.click();
    return best.text;
  }, keyword);
}

/**
 * Type CONFIG.searchQuery into the first text input and submit it, either by
 * clicking a matching suggestion or by pressing Enter.
 *
 * @param {import('puppeteer').Page} page
 * @returns {Promise<void>}
 */
async function runSearch(page) {
  const searchInput = await page.$('input[type="text"]');
  if (!searchInput) {
    console.log('⚠️ No search input found, skipping search\n');
    return;
  }

  console.log('⌨️ Found search input, typing...');
  await searchInput.click();
  await sleep(500);

  // Clear any pre-filled text
  await page.keyboard.down('Control');
  await page.keyboard.press('A');
  await page.keyboard.up('Control');
  await page.keyboard.press('Backspace');

  await searchInput.type(CONFIG.searchQuery, { delay: 150 });
  console.log(` Typed: ${CONFIG.searchQuery}`);
  await sleep(2000);
  await page.screenshot({ path: './xiaozhu_url_extract_typed.png' });

  // Look for suggestion with keyword
  const clickedText = await clickSuggestion(page, CONFIG.searchQuery);
  if (clickedText !== null) {
    // Logged here rather than inside page.evaluate, where console output
    // goes to the browser console and never reaches this process.
    console.log(` Clicking suggestion: "${clickedText}"`);
    console.log('✅ Clicked suggestion\n');
    await sleep(4000);
  } else {
    console.log('⚠️ No suggestion, pressing Enter\n');
    await page.keyboard.press('Enter');
    await sleep(3000);
  }
}

/**
 * Find the first selector that matches at least one element on the page.
 *
 * The selector is returned (not just the handles) because handles go stale
 * after navigation and the caller must re-query with the SAME selector to
 * keep listing indices stable.
 *
 * @param {import('puppeteer').Page} page
 * @returns {Promise<{selector: string, count: number}|null>}
 */
async function findListingSelector(page) {
  const primary = await page.$$(PRIMARY_LISTING_SELECTOR);
  console.log(`📊 Found ${primary.length} listings\n`);
  if (primary.length > 0) {
    return { selector: PRIMARY_LISTING_SELECTOR, count: primary.length };
  }

  console.log('⚠️ No .list-item found, trying alternative selectors...');
  for (const selector of FALLBACK_LISTING_SELECTORS) {
    const handles = await page.$$(selector);
    if (handles.length > 0) {
      console.log(` ✅ Found ${handles.length} with selector: ${selector}\n`);
      return { selector, count: handles.length };
    }
  }
  return null;
}

/**
 * Read title/price/image of the `index`-th element matching `selector`.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} selector
 * @param {number} index
 * @returns {Promise<{title: string, price: string, image: string|null}|null>}
 */
async function readListingInfo(page, selector, index) {
  return page.evaluate((sel, idx) => {
    const item = document.querySelectorAll(sel)[idx];
    if (!item) return null;
    const titleEl = item.querySelector('.list-title, [class*="title"]');
    const priceEl = item.querySelector('.list-price, [class*="price"]');
    const imgEl = item.querySelector('img');
    return {
      title: titleEl?.textContent.trim() || 'No title',
      price: priceEl?.textContent.trim() || 'No price',
      image: imgEl?.src || null
    };
  }, selector, index);
}

/**
 * Click the `index`-th listing and capture the URL the page navigates to.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} selector Selector that identified the listings.
 * @param {number} index Zero-based listing index.
 * @param {string} listUrl URL of the results page, used to detect navigation.
 * @returns {Promise<object|null>} Listing info plus `url`, or null when the
 *   listing was skipped (missing element, no info, or no navigation).
 */
async function openListing(page, selector, index, listUrl) {
  // Re-query elements each time (they become stale after navigation)
  const handles = await page.$$(selector);
  if (index >= handles.length) {
    console.log(' ⚠️ Element no longer available');
    return null;
  }

  // Get listing info before click
  const listingInfo = await readListingInfo(page, selector, index);
  if (!listingInfo) {
    console.log(' ⚠️ Could not extract info');
    return null;
  }
  console.log(` 📍 ${listingInfo.title.substring(0, 50)}...`);
  console.log(` 💰 ${listingInfo.price}`);

  await handles[index].click();
  console.log(' 👆 Clicked');

  // Wait for navigation
  await sleep(4000);

  const url = page.url();
  if (url === listUrl) {
    // Nothing navigated (e.g. the click opened another tab or was ignored).
    // Recording the list URL would be wrong, and going "back" would leave
    // the results page.
    console.log(' ⚠️ URL did not change after click, skipping');
    return null;
  }
  console.log(` 🔗 ${url}`);
  return { ...listingInfo, url };
}

/**
 * Launch a browser, search Xiaozhu for CONFIG.searchQuery, click up to
 * CONFIG.maxListings results to discover their real URLs, and write the
 * collected entries to ./xiaozhu_urls.json.
 *
 * Errors are logged rather than rethrown; on a fatal error the process exit
 * code is set to 1. The browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function extractURLs() {
  const browser = await puppeteer.launch({
    headless: "new",
    defaultViewport: { width: 414, height: 896 },
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    // Page setup lives inside the try so a failure here still closes the browser.
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15');

    const context = browser.defaultBrowserContext();
    await context.overridePermissions(SITE_ORIGIN, ['geolocation']);
    await page.setGeolocation({
      latitude: CONFIG.latitude,
      longitude: CONFIG.longitude,
      accuracy: 100
    });

    console.log('🌐 Loading Xiaozhu...');
    await page.goto(`${SITE_ORIGIN}/`, { waitUntil: 'networkidle2', timeout: 30000 });
    await sleep(3000);

    console.log('📸 Taking homepage screenshot...');
    await page.screenshot({ path: './xiaozhu_url_extract_home.png' });

    // Search
    await runSearch(page);
    await page.screenshot({ path: './xiaozhu_url_extract_search.png' });

    // Scroll to load all
    console.log('⏬ Scrolling to load listings...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(5000);
    await page.screenshot({ path: './xiaozhu_url_extract_scrolled.png' });

    // Get all listing elements
    const found = await findListingSelector(page);
    if (found === null) {
      console.log('❌ No listings found. Check screenshots for debugging.');
      return; // the finally block closes the browser exactly once
    }

    const listUrl = page.url();
    const total = Math.min(found.count, CONFIG.maxListings);
    const results = [];

    for (let i = 0; i < total; i++) {
      console.log(`🔍 ${i + 1}/${total}...`);
      try {
        const entry = await openListing(page, found.selector, i, listUrl);
        if (entry === null) continue;

        // Record before going back so a slow back-navigation cannot lose it.
        results.push(entry);

        await page.goBack({ waitUntil: 'networkidle2', timeout: 10000 });
        console.log(' ⬅️ Back to list');
        await sleep(2000);
        console.log('');
      } catch (err) {
        console.log(` ❌ Error: ${err.message}`);
        // Recover only if we are actually away from the list; an unconditional
        // goBack() here could step back past the results page.
        try {
          if (page.url() !== listUrl) {
            await page.goBack();
            await sleep(2000);
          }
        } catch (recoverErr) {
          console.log(` ⚠️ Could not go back, continuing... (${recoverErr.message})`);
        }
      }
    }

    // Save results
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(results, null, 2));
    console.log(`\n💾 Saved ${results.length} URLs to ${OUTPUT_FILE}`);

    // Print results
    console.log('\n📋 EXTRACTED URLS:\n');
    results.forEach((r, i) => {
      console.log(`${i + 1}. ${r.title?.substring(0, 60)}`);
      console.log(` 💰 ${r.price}`);
      console.log(` 🔗 ${r.url}\n`);
    });
  } catch (err) {
    console.error('❌ Error:', err.message);
    process.exitCode = 1; // surface the failure to the calling shell
  } finally {
    await browser.close();
  }
}

extractURLs().catch(console.error);