Major updates: - December 2025 crisis documentation and separation agreement - Daily check system v2 with multiple card categories - Xiaozhu rental search tools and results - Exit plan documentation - Message drafts for family communication - Confluent moved to CONSTANT - Updated profiles and promises 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
582 lines
18 KiB
JavaScript
582 lines
18 KiB
JavaScript
const puppeteer = require('puppeteer');
|
||
const fs = require('fs');
|
||
|
||
/**
 * Xiaozhu Interactive Scraper - Full navigation simulation
 * Simulates real user behavior to navigate and extract listings
 */
|
||
|
||
/**
 * Search, budget and scraper settings shared by every function in this script.
 *
 * `days` is derived from `checkIn` / `checkOut` so the stay length can never
 * drift out of sync with the dates.
 *
 * NOTE: despite their names, `dailyBudgetIdeal` and `dailyBudgetMax` are the
 * budget for the WHOLE stay (the monthly budget pro-rated over `days`), not a
 * per-day amount. The names are kept because other functions read them.
 */
const CONFIG = {
  // Search criteria
  city: '上海',
  district: '徐汇区',
  keyword: '交通大学',

  // Dates (ISO yyyy-mm-dd)
  checkIn: '2025-12-24',
  checkOut: '2026-01-22',

  /** Number of nights between checkIn and checkOut. */
  get days() {
    const MS_PER_DAY = 24 * 60 * 60 * 1000;
    // Date-only ISO strings are parsed as UTC midnight, so the difference is
    // an exact multiple of one day.
    const nights = Math.round((Date.parse(this.checkOut) - Date.parse(this.checkIn)) / MS_PER_DAY);
    if (!Number.isFinite(nights) || nights < 1) {
      throw new RangeError(`Invalid stay: checkIn=${this.checkIn}, checkOut=${this.checkOut}`);
    }
    return nights;
  },

  // Budget (RMB per 30-day month)
  budgetIdeal: 4000,
  budgetMax: 5000,

  /** Ideal budget for the whole stay (monthly budget pro-rated over `days`). */
  get dailyBudgetIdeal() {
    // Multiply before dividing so a 30-day stay yields exactly the monthly
    // budget instead of risking a floating-point overshoot that ceil() rounds up.
    return Math.ceil((this.budgetIdeal * this.days) / 30);
  },

  /** Maximum budget for the whole stay (monthly budget pro-rated over `days`). */
  get dailyBudgetMax() {
    return Math.ceil((this.budgetMax * this.days) / 30);
  },

  // Equipment
  required: ['厨房', '冰箱'],
  bonus: ['洗衣机', '地铁'],

  // Scraping config
  maxScrolls: 10,
  scrollDelay: 2000, // ms between scrolls
  interactionDelay: 1000, // ms base delay between UI interactions

  // Output
  outputFile: './xiaozhu_results.json',
  outputMarkdown: './xiaozhu_results.md',
  topN: 20,

  // Debug
  headless: true,
  screenshots: true,
};
|
||
|
||
// Startup banner: echo the active search parameters before any work begins.
{
  const {
    city,
    district,
    checkIn,
    checkOut,
    days,
    budgetIdeal,
    budgetMax,
    dailyBudgetIdeal,
    dailyBudgetMax,
  } = CONFIG;

  console.log('🚀 Xiaozhu Interactive Scraper');
  console.log(`📍 Target: ${city} ${district}`);
  console.log(`📅 Dates: ${checkIn} → ${checkOut} (${days} days)`);
  console.log(`💰 Budget: ${budgetIdeal}-${budgetMax} RMB/month (${dailyBudgetIdeal}-${dailyBudgetMax} RMB total)\n`);
}
|
||
|
||
/**
 * Pauses for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves after `ms` milliseconds.
 */
function wait(ms) {
  // Already returns a Promise, so no `async` wrapper is needed.
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
/**
 * Saves a full-page debug screenshot when `CONFIG.screenshots` is enabled.
 *
 * Screenshots are a debugging aid only, so a capture failure is logged and
 * swallowed rather than aborting the scrape or masking the error being handled
 * by the caller.
 *
 * @param {object} page - Puppeteer page (anything exposing `screenshot(options)`).
 * @param {string} name - Label embedded in the file name.
 * @returns {Promise<void>}
 */
async function screenshot(page, name) {
  if (!CONFIG.screenshots) {
    return;
  }

  // Timestamp keeps successive runs from overwriting each other.
  const filename = `./xiaozhu_${name}_${Date.now()}.png`;
  try {
    await page.screenshot({ path: filename, fullPage: true });
    console.log(`📸 Screenshot: ${filename}`);
  } catch (err) {
    console.log(`⚠️ Screenshot failed (${filename}): ${err.message}`);
  }
}
|
||
|
||
/**
 * Loads optional session cookies from `./xiaozhu_cookies.json`.
 *
 * A missing file is normal (cookies are optional). An unreadable file, invalid
 * JSON, or JSON that is not an array is reported with the actual reason so it
 * is not mistaken for "no cookies".
 *
 * @returns {Promise<Array<object>|null>} The parsed cookie array, or `null`
 *   when no usable cookies are available.
 */
async function loadCookies() {
  const cookieFile = './xiaozhu_cookies.json';

  let raw;
  try {
    raw = fs.readFileSync(cookieFile, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      console.log('⚠️ No cookies found (optional)');
    } else {
      console.log(`⚠️ Could not read ${cookieFile}: ${err.message}`);
    }
    return null;
  }

  try {
    const parsed = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      console.log(`⚠️ Ignoring ${cookieFile}: expected a JSON array of cookies`);
      return null;
    }
    return parsed;
  } catch (err) {
    console.log(`⚠️ Ignoring ${cookieFile}: invalid JSON (${err.message})`);
    return null;
  }
}
|
||
|
||
/**
 * Finds the first element that matches one of `selectors` and has a non-empty
 * bounding box.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors, tried in order.
 * @returns {Promise<{element: object, selector: string}|null>} The match, or
 *   `null` when nothing visible was found.
 */
async function findVisibleElement(page, selectors) {
  for (const selector of selectors) {
    try {
      const element = await page.$(selector);
      if (!element) {
        continue;
      }

      const isVisible = await page.evaluate((el) => {
        const rect = el.getBoundingClientRect();
        return rect.width > 0 && rect.height > 0;
      }, element);

      if (isVisible) {
        return { element, selector };
      }
    } catch (err) {
      // Best effort: a failing selector must not stop the remaining candidates.
      console.log(` Selector ${selector} failed: ${err.message}`);
    }
  }
  return null;
}

/**
 * Clicks the first element matching `selector` whose text contains `text`.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} selector - CSS selector list evaluated in the page.
 * @param {string} text - Substring the element's text must contain.
 * @returns {Promise<boolean>} `true` when an element was clicked.
 */
async function clickElementContaining(page, selector, text) {
  return page.evaluate((sel, needle) => {
    for (const node of document.querySelectorAll(sel)) {
      if (node.textContent.includes(needle)) {
        node.click();
        return true;
      }
    }
    return false;
  }, selector, text);
}

/**
 * Types the configured city into the search box and submits the search, trying
 * in order: a suggestion containing the city, the first suggestion, a submit
 * button, and finally the Enter key.
 *
 * @param {object} page - Puppeteer page.
 * @param {object} searchInput - Element handle of the search box.
 * @returns {Promise<void>}
 */
async function searchForCity(page, searchInput) {
  console.log('⌨️ Entering search query...');

  // Click to focus
  await searchInput.click();
  await wait(500);

  // Only the city is typed; district and keyword are used later for scoring.
  await searchInput.type(`${CONFIG.city}`, { delay: 150 });
  await wait(CONFIG.interactionDelay * 2); // Wait longer for suggestions to load

  await screenshot(page, 'search_typed');

  console.log('👆 Looking for search button or suggestions...');
  await wait(1500);

  let submitted = false;

  const cityClicked = await clickElementContaining(
    page,
    '.van-cell, .suggestion-item, [class*="suggest"], .city-item, div[class*="item"]',
    CONFIG.city
  );

  if (cityClicked) {
    console.log(` Clicked suggestion containing ${CONFIG.city}`);
    submitted = true;
    await wait(4000);
  } else {
    console.log(` No ${CONFIG.city} suggestion found, trying all suggestions...`);
    const suggestions = await page.$$('.van-cell, .suggestion-item, [class*="suggest"]');
    if (suggestions.length > 0) {
      console.log(` Found ${suggestions.length} suggestions, clicking first...`);
      await suggestions[0].click();
      submitted = true;
      await wait(3000);
    }
  }

  // If no suggestions, try submit button
  if (!submitted) {
    const submitSelectors = [
      'button[type="submit"]',
      '.search-button',
      '.van-button--primary',
      'button.submit',
      '.search-btn',
    ];

    for (const selector of submitSelectors) {
      try {
        const button = await page.$(selector);
        if (button) {
          console.log(` Clicking submit: ${selector}`);
          await button.click();
          submitted = true;
          await wait(3000);
          break;
        }
      } catch (err) {
        // Best effort: fall through to the next selector.
        console.log(` Submit via ${selector} failed: ${err.message}`);
      }
    }
  }

  // If still not submitted, try pressing Enter
  if (!submitted) {
    console.log(' Pressing Enter...');
    await page.keyboard.press('Enter');
    await wait(3000);
  }

  await screenshot(page, 'after_search');
}

/**
 * Fallback used when no search box exists: clicks a visible `a`, `div` or
 * `span` whose trimmed text is exactly the configured city.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<boolean>} `true` when such an element was clicked.
 */
async function clickCityLabel(page) {
  return page.evaluate((city) => {
    const candidates = Array.from(document.querySelectorAll('a, div, span'));
    const cityEl = candidates.find(
      (el) => el.textContent.trim() === city && el.getBoundingClientRect().width > 0
    );
    if (!cityEl) {
      return false;
    }
    cityEl.click();
    return true;
  }, CONFIG.city);
}

/**
 * Runs INSIDE the page (passed to `page.evaluate`), so it must not reference
 * anything from the Node side. Picks the card selector with the most matches
 * and extracts one plain object per card.
 *
 * @returns {Array<object>} Listings with `index`, `html`, `text`, the equipment
 *   flags, and, when found, `title`, `priceDaily`, `priceText`, `location`,
 *   `url`, `image`.
 */
function extractListingCards() {
  const cardSelectors = [
    '.house-item',
    '.room-item',
    '.van-card',
    '[class*="house"]',
    '[class*="room"]',
    '[class*="card"]',
  ];

  let cards = [];
  for (const sel of cardSelectors) {
    const matches = document.querySelectorAll(sel);
    if (matches.length > cards.length) {
      cards = Array.from(matches);
    }
  }

  // Only valid CSS here: jQuery-style `:contains()` makes querySelector throw,
  // which used to discard the whole listing before the text fallback could run.
  const priceSelectors = ['.price', '.van-card__price', '[class*="price"]'];

  const results = [];
  cards.forEach((card, idx) => {
    try {
      const listing = {
        index: idx + 1,
        html: card.innerHTML.substring(0, 500),
        text: card.textContent.trim().substring(0, 300),
      };

      // Extract title
      const titleEl = card.querySelector('h2, h3, h4, .title, .name, .van-card__title, [class*="title"]');
      if (titleEl) {
        listing.title = titleEl.textContent.trim();
      }

      // Extract price from a dedicated price element
      for (const sel of priceSelectors) {
        const priceEl = card.querySelector(sel);
        if (priceEl) {
          const priceText = priceEl.textContent;
          const match = priceText.match(/(\d+)/);
          if (match) {
            listing.priceDaily = Number.parseInt(match[1], 10);
            listing.priceText = priceText.trim();
            break;
          }
        }
      }

      // If no price found, search in all text
      if (!listing.priceDaily) {
        const priceMatch = card.textContent.match(/[¥¥]?\s*(\d+)\s*[元\/]/);
        if (priceMatch) {
          listing.priceDaily = Number.parseInt(priceMatch[1], 10);
        }
      }

      // Extract location
      const locationEl = card.querySelector('.location, .address, .area, [class*="location"]');
      if (locationEl) {
        listing.location = locationEl.textContent.trim();
      }

      // Extract URL
      const linkEl = card.querySelector('a');
      if (linkEl) {
        listing.url = linkEl.href;
      }

      // Extract image
      const imgEl = card.querySelector('img');
      if (imgEl) {
        listing.image = imgEl.src;
      }

      // Check equipment mentions in text
      const fullText = card.textContent.toLowerCase();
      listing.hasKitchen = fullText.includes('厨房') || fullText.includes('kitchen');
      listing.hasFridge = fullText.includes('冰箱') || fullText.includes('fridge');
      listing.hasWashingMachine = fullText.includes('洗衣机') || fullText.includes('washing');
      listing.hasMetro = fullText.includes('地铁') || fullText.includes('metro') || fullText.includes('站');

      results.push(listing);
    } catch (e) {
      console.error('Error extracting listing:', e);
    }
  });

  return results;
}

/**
 * Identity used for de-duplication: the URL when present, otherwise a
 * composite of title, price text and text excerpt.
 *
 * @param {object} listing - Extracted listing.
 * @returns {string} De-duplication key.
 */
function listingKey(listing) {
  if (listing.url) {
    return listing.url;
  }
  return [listing.title || '', listing.priceText || '', listing.text || ''].join('\u0000');
}

/**
 * Appends the listings from `batch` that have not been seen before.
 *
 * Unlike a URL-only comparison, this also de-duplicates listings without a
 * URL, so re-reading the same cards after a scroll does not inflate the total
 * or defeat the "no new listings" stop condition.
 *
 * @param {Array<object>} allListings - Accumulator, appended to in place.
 * @param {Set<string>} seenKeys - Keys of everything already accumulated.
 * @param {Array<object>} batch - Listings extracted from the current view.
 * @returns {number} How many listings were added.
 */
function appendNewListings(allListings, seenKeys, batch) {
  let added = 0;
  for (const listing of batch) {
    const key = listingKey(listing);
    if (seenKeys.has(key)) {
      continue;
    }
    seenKeys.add(key);
    allListings.push(listing);
    added += 1;
  }
  return added;
}

/**
 * Repeatedly extracts listings and scrolls one viewport down to trigger lazy
 * loading, up to `CONFIG.maxScrolls` times or until three consecutive passes
 * add nothing new.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<Array<object>>} All distinct listings found.
 */
async function collectListings(page) {
  const allListings = [];
  const seenKeys = new Set();
  let noChangeCount = 0;

  for (let i = 0; i < CONFIG.maxScrolls; i++) {
    console.log(`🔄 Scroll ${i + 1}/${CONFIG.maxScrolls}...`);

    const listings = await page.evaluate(extractListingCards);
    const addedCount = appendNewListings(allListings, seenKeys, listings);

    console.log(` Found ${listings.length} on page, ${addedCount} new, ${allListings.length} total`);

    if (addedCount === 0) {
      noChangeCount++;
      if (noChangeCount >= 3) {
        console.log(' No new listings for 3 scrolls, stopping...');
        break;
      }
    } else {
      noChangeCount = 0;
    }

    // Scroll down
    await page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });

    await wait(CONFIG.scrollDelay);
  }

  return allListings;
}

/**
 * Saves the page HTML and prints basic page facts for offline inspection when
 * no listings could be extracted.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<void>}
 */
async function dumpPageDiagnostics(page) {
  console.log('❌ No listings found!');
  console.log('💾 Saving page HTML for inspection...');

  const html = await page.content();
  fs.writeFileSync('./xiaozhu_interactive_page.html', html);

  console.log('\n📋 Page info:');
  const pageInfo = await page.evaluate(() => ({
    url: window.location.href,
    title: document.title,
    bodyText: document.body.textContent.substring(0, 500),
    elementCount: document.querySelectorAll('*').length,
  }));

  console.log(` URL: ${pageInfo.url}`);
  console.log(` Title: ${pageInfo.title}`);
  console.log(` Elements: ${pageInfo.elementCount}`);
  console.log(` Body preview: ${pageInfo.bodyText.substring(0, 200)}...`);
}

/**
 * Entry point: opens the Xiaozhu mobile site, navigates to the configured
 * city, scrolls through the result list, and writes the scored listings to
 * `CONFIG.outputFile` (JSON) and `CONFIG.outputMarkdown` (Markdown).
 *
 * Errors raised after the browser has launched are logged (with an error
 * screenshot when possible) rather than rethrown. The browser is closed in
 * headless mode and deliberately left open otherwise for manual inspection.
 *
 * @returns {Promise<void>}
 */
async function scrapXiaozhu() {
  const cookies = await loadCookies();

  const browser = await puppeteer.launch({
    headless: CONFIG.headless ? "new" : false,
    defaultViewport: { width: 414, height: 896 }, // Mobile viewport (Xiaozhu is mobile-first)
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
    ],
  });

  // Created inside the try block so a failure here still reaches `finally`
  // and the browser does not leak.
  let page = null;

  try {
    page = await browser.newPage();

    // Mobile user agent
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1');

    // Load cookies if available
    if (cookies && cookies.length > 0) {
      try {
        await page.setCookie(...cookies);
        console.log(`🍪 Loaded ${cookies.length} cookies\n`);
      } catch (err) {
        console.log('⚠️ Could not load cookies:', err.message);
      }
    }

    console.log('🌐 Loading homepage...');
    await page.goto('https://minsu.xiaozhu.com/', {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    await wait(3000); // Wait for Vue app to initialize
    await screenshot(page, 'homepage');

    console.log('✅ Homepage loaded\n');

    // Strategy 1: Look for search input
    console.log('🔍 Looking for search input...');

    const searchBox = await findVisibleElement(page, [
      'input[placeholder*="目的地"]',
      'input[placeholder*="搜索"]',
      'input[placeholder*="城市"]',
      '.search-input',
      '.van-search__content input',
      'input[type="search"]',
      'input[type="text"]',
    ]);

    if (searchBox) {
      console.log(`✅ Found search input: ${searchBox.selector}`);
      await searchForCity(page, searchBox.element);
    } else {
      // Strategy 2: Look for city/location selector
      console.log('❌ No search input found');
      console.log('🔍 Looking for city selector...');

      if (await clickCityLabel(page)) {
        console.log('✅ Clicked Shanghai');
        await wait(3000);
      } else {
        console.log('⚠️ Could not find city selector');
      }
    }

    // Current URL after navigation
    let currentUrl = page.url();
    console.log(`\n📍 Current URL: ${currentUrl}`);

    // If we're on /suggest page, try to find and click the city
    if (currentUrl.includes('/suggest')) {
      console.log('⚠️ On suggestions page, looking for Shanghai option...\n');

      // Look for the city in hot recommendations or administrative areas
      const cityFound = await clickElementContaining(
        page,
        '.city-hot-item, .city-hot-item2, .city-item, div[class*="item"]',
        CONFIG.city
      );

      if (cityFound) {
        console.log(` ✅ Clicked ${CONFIG.city} from suggestions`);
        await wait(4000);
        currentUrl = page.url();
        console.log(` 📍 New URL: ${currentUrl}`);
      } else {
        console.log(` ❌ ${CONFIG.city} not found in suggestions`);
        console.log(` 💡 Try searching for just the city name next time\n`);
      }
    }

    await screenshot(page, 'before_extraction');
    console.log('');

    // Extract listings
    console.log('📊 Extracting listings...\n');
    const allListings = await collectListings(page);

    await screenshot(page, 'final');

    console.log(`\n✅ Total extracted: ${allListings.length} listings\n`);

    if (allListings.length === 0) {
      await dumpPageDiagnostics(page);
    } else {
      // Process and filter listings
      const processed = processListings(allListings);

      // Save results
      fs.writeFileSync(CONFIG.outputFile, JSON.stringify(processed, null, 2));
      console.log(`💾 Results saved: ${CONFIG.outputFile}`);

      const markdown = generateMarkdown(processed);
      fs.writeFileSync(CONFIG.outputMarkdown, markdown);
      console.log(`📝 Markdown saved: ${CONFIG.outputMarkdown}`);

      // Print top results
      printTopResults(processed);
    }
  } catch (err) {
    console.error('❌ Error:', err.message);
    console.error(err.stack);

    if (page) {
      // The error screenshot is a debugging aid; never let it mask `err`.
      await screenshot(page, 'error').catch((shotErr) => {
        console.error('⚠️ Could not capture error screenshot:', shotErr.message);
      });
    }
  } finally {
    if (CONFIG.headless) {
      await browser.close();
    } else {
      console.log('\n⏸️ Browser kept open for inspection. Close manually when done.');
    }
  }
}
|
||
|
||
/**
 * Scores a listing that is already known to be within the maximum budget.
 *
 * @param {object} listing - Listing with `priceTotal` and the equipment flags.
 * @returns {number} Score rounded to one decimal place; higher is better.
 */
function scoreListing(listing) {
  const idealTotal = CONFIG.dailyBudgetIdeal;
  let score = 0;

  // Price: reward savings below the ideal stay total, penalise (twice as
  // steeply) anything above it.
  if (listing.priceTotal <= idealTotal) {
    score += (idealTotal - listing.priceTotal) / 100;
  } else {
    score -= (listing.priceTotal - idealTotal) / 50;
  }

  // Equipment bonuses
  if (listing.hasKitchen) score += 20;
  if (listing.hasFridge) score += 15;
  if (listing.hasWashingMachine) score += 10;
  if (listing.hasMetro) score += 15;

  // Location bonus
  if (listing.location) {
    if (listing.location.includes(CONFIG.district)) score += 20;
    if (listing.location.includes(CONFIG.keyword)) score += 10;
  }
  if (listing.title && listing.title.includes(CONFIG.keyword)) {
    score += 10;
  }

  return Math.round(score * 10) / 10;
}

/**
 * Filters, prices, scores and ranks raw listings.
 *
 * Keeps listings that have a positive daily price, mention both a kitchen and
 * a fridge, and whose total for the stay is within `CONFIG.dailyBudgetMax`
 * (which, despite its name, is compared against the whole-stay total). The
 * result is sorted by descending score and truncated to `CONFIG.topN`.
 *
 * The input objects are not modified; each result is a shallow copy with
 * `priceTotal`, `priceMonthly` and `score` added.
 *
 * @param {Array<object>} listings - Raw listings from the page.
 * @returns {Array<object>} Ranked listings.
 */
function processListings(listings) {
  const maxTotal = CONFIG.dailyBudgetMax;

  return listings
    // Required: a usable price plus kitchen and fridge.
    .filter((l) => l.priceDaily > 0 && l.hasKitchen && l.hasFridge)
    // Copy instead of mutating the caller's objects.
    .map((l) => ({
      ...l,
      priceTotal: l.priceDaily * CONFIG.days,
      priceMonthly: Math.ceil(l.priceDaily * 30),
    }))
    // Budget
    .filter((l) => l.priceTotal <= maxTotal)
    .map((l) => ({ ...l, score: scoreListing(l) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, CONFIG.topN);
}
|
||
|
||
/**
 * Renders the ranked listings as a Markdown report: a short header with the
 * search parameters followed by one table row per listing.
 *
 * Titles are collapsed to a single line, truncated to 40 characters and have
 * `|` escaped, so scraped text cannot break the table layout.
 *
 * @param {Array<object>} listings - Listings as returned by processListings().
 * @returns {string} Markdown document.
 */
function generateMarkdown(listings) {
  const tick = (flag) => (flag ? '✓' : '✗');

  // Keep a cell on one line and stop literal pipes from splitting it.
  const titleCell = (title) =>
    String(title || 'Untitled')
      .replace(/\s+/g, ' ')
      .trim()
      .substring(0, 40)
      .replace(/\|/g, '\\|');

  let md = '# Xiaozhu Search Results - Interactive Scraper\n\n';
  md += `**Date:** ${new Date().toLocaleDateString()}\n`;
  md += `**Location:** ${CONFIG.city} ${CONFIG.district}\n`;
  md += `**Dates:** ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)\n`;
  md += `**Budget:** ${CONFIG.budgetIdeal}-${CONFIG.budgetMax} RMB/month\n\n`;

  md += '| # | Title | Daily | Total | Kitchen | Fridge | Washer | Metro | Score | Link |\n';
  md += '|---|-------|-------|-------|---------|--------|--------|-------|-------|------|\n';

  listings.forEach((l, i) => {
    const cells = [
      i + 1,
      titleCell(l.title),
      `¥${l.priceDaily}`,
      `¥${l.priceTotal}`,
      tick(l.hasKitchen),
      tick(l.hasFridge),
      tick(l.hasWashingMachine),
      tick(l.hasMetro),
      l.score,
      l.url ? `[View](${l.url})` : '-',
    ];
    md += `| ${cells.join(' | ')} |\n`;
  });

  return md;
}
|
||
|
||
/**
 * Prints a human-readable summary of the best listings to the console.
 *
 * @param {Array<object>} listings - Ranked listings (best first), as returned
 *   by processListings().
 * @param {number} [limit=5] - Maximum number of listings to print.
 * @returns {void}
 */
function printTopResults(listings, limit = 5) {
  const tick = (flag) => (flag ? '✓' : '✗');

  console.log('\n🏆 TOP RESULTS:\n');

  listings.slice(0, limit).forEach((l, i) => {
    console.log(`${i + 1}. ${l.title || 'Untitled'}`);
    console.log(` 💰 ¥${l.priceDaily}/day × ${CONFIG.days} days = ¥${l.priceTotal} total (~¥${l.priceMonthly}/month)`);
    if (l.location) console.log(` 📍 ${l.location}`);
    console.log(` ✓ Kitchen: ${tick(l.hasKitchen)} | Fridge: ${tick(l.hasFridge)} | Washer: ${tick(l.hasWashingMachine)} | Metro: ${tick(l.hasMetro)}`);
    console.log(` ⭐ Score: ${l.score}`);
    if (l.url) console.log(` 🔗 ${l.url}`);
    console.log('');
  });
}
|
||
|
||
// Run. scrapXiaozhu() logs scraping errors itself; this handler fires only for
// failures outside its try block (for example, the browser failing to launch).
// Set a non-zero exit code so shell scripts and schedulers can detect it.
scrapXiaozhu().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|