personal-hub/tools/xiaozhu_fixed.js
StillHammer 3c8162c990 Sync couple_matters: December crisis, separation agreement, daily check v2, xiaozhu search
Major updates:
- December 2025 crisis documentation and separation agreement
- Daily check system v2 with multiple card categories
- Xiaozhu rental search tools and results
- Exit plan documentation
- Message drafts for family communication
- Confluent moved to CONSTANT
- Updated profiles and promises

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-23 07:04:02 +08:00

667 lines
23 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const puppeteer = require('puppeteer');
const fs = require('fs');
/**
* Xiaozhu Fixed Scraper - With geolocation override and smart navigation
* Fixes: Geolocation to Shanghai, better suggestion detection, city verification
*/
/**
 * Central settings for one scraping run: where to search, for which dates,
 * within what budget, and how aggressively to scroll.
 */
const CONFIG = {
  // --- Where: the Jiaotong University area in Shanghai ---
  city: '上海',
  searchQuery: '交通大学', // Just the university name for better suggestions
  cityEnglish: 'shanghai',
  district: '徐汇区',
  keyword: '交通大学',

  // Shanghai Xujiahui coordinates, used for the geolocation override
  latitude: 31.1880,
  longitude: 121.4367,

  // --- When ---
  checkIn: '2025-12-24',
  checkOut: '2026-01-22',
  days: 29,

  // --- Budget (RMB per 30-day month) ---
  budgetIdeal: 4000,
  budgetMax: 5000,

  // Despite the "daily" in their names, both getters return the monthly
  // budget pro-rated to the whole stay (`days` nights), rounded up.
  get dailyBudgetIdeal() {
    return Math.ceil((this.budgetIdeal / 30) * this.days);
  },
  get dailyBudgetMax() {
    return Math.ceil((this.budgetMax / 30) * this.days);
  },

  // --- Equipment keywords ---
  required: ['厨房', '冰箱'],
  bonus: ['洗衣机', '地铁'],

  // --- Scraping: deliberately aggressive so lazy-loaded items show up ---
  maxScrolls: 50,
  scrollDelay: 3500, // Longer wait for lazy load
  interactionDelay: 1000,
  noChangeThreshold: 7, // Stop after 7 consecutive scrolls without new listings

  // --- Output ---
  outputFile: './xiaozhu_results.json',
  outputMarkdown: './xiaozhu_results.md',
  topN: 20,

  // --- Debug ---
  headless: true,
  screenshots: true
};
// Startup banner: echo the effective search parameters so every run's log
// records what was searched for.
console.log('🚀 Xiaozhu FIXED Scraper - Jiaotong University Focus');
console.log(`📍 Search: ${CONFIG.searchQuery}`);
console.log(`🎯 Target: ${CONFIG.keyword} (${CONFIG.district})`);
console.log(`🌍 Geolocation: ${CONFIG.latitude}, ${CONFIG.longitude}`);
// The two dates need a visible separator; without one they print as a single
// run-together string.
console.log(`📅 Dates: ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)`);
console.log(`💰 Budget: ${CONFIG.budgetIdeal}-${CONFIG.budgetMax} RMB/month\n`);
/**
 * Pause the calling async flow.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
async function wait(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Save a full-page debug screenshot when `CONFIG.screenshots` is enabled.
 *
 * Screenshots are a debugging aid only, so a failure to capture one is
 * reported and then ignored instead of aborting the scrape. This matters most
 * for the call made from the scraper's error handler.
 *
 * @param {object} page - Page-like object exposing `screenshot(options)`.
 * @param {string} name - Label embedded in the file name.
 * @returns {Promise<void>}
 */
async function screenshot(page, name) {
  if (!CONFIG.screenshots) {
    return;
  }
  // The timestamp keeps successive captures of the same step from overwriting each other.
  const filename = `./xiaozhu_${name}_${Date.now()}.png`;
  try {
    await page.screenshot({ path: filename, fullPage: true });
    console.log(`📸 ${filename}`);
  } catch (err) {
    console.log(`⚠️ Screenshot failed (${filename}): ${err.message}`);
  }
}
/**
 * Read previously saved session cookies from `./xiaozhu_cookies.json`.
 *
 * A missing file is the normal "no saved session" case and stays silent.
 * Any other failure (unreadable file, invalid JSON) is logged so a broken
 * cookie file is not mistaken for an absent one. The scrape can still run
 * without cookies, so the function never throws.
 *
 * @returns {Promise<*|null>} The parsed JSON content, or null when the file
 *   is missing or cannot be used.
 */
async function loadCookies() {
  const cookieFile = './xiaozhu_cookies.json';
  try {
    return JSON.parse(fs.readFileSync(cookieFile, 'utf8'));
  } catch (err) {
    if (err.code !== 'ENOENT') {
      console.log(`⚠️ Ignoring unusable cookie file ${cookieFile}: ${err.message}`);
    }
    return null;
  }
}
/**
 * Run one complete scrape of minsu.xiaozhu.com.
 *
 * Steps, in order:
 *   1. Launch Puppeteer with a phone-sized viewport and an iPhone user agent,
 *      grant the geolocation permission and set the coordinates from CONFIG,
 *      then restore saved cookies if any were loaded.
 *   2. Try a list of direct city URLs; if none works, fall back to typing
 *      CONFIG.searchQuery into the homepage search box.
 *   3. Check whether the resulting page mentions Shanghai or Beijing and try
 *      to switch city when only Beijing content is seen.
 *   4. Scroll repeatedly, extracting listing cards on each pass and keeping
 *      the ones not seen before.
 *   5. Write the raw listings to disk, then either dump diagnostics (nothing
 *      found) or score the listings and write the JSON/Markdown reports.
 *
 * Errors raised inside the main try block are logged and not rethrown.
 * Failures before it (launching the browser, opening the page, setting
 * geolocation) reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function scrapXiaozhu() {
const cookies = await loadCookies();
const browser = await puppeteer.launch({
headless: CONFIG.headless ? "new" : false,
defaultViewport: { width: 414, height: 896 },
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage'
]
});
const page = await browser.newPage();
// Mobile user agent
await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1');
// ===== FIX 1: Override geolocation to Shanghai =====
console.log('🌍 Setting geolocation to Shanghai Xujiahui...');
const context = browser.defaultBrowserContext();
// The geolocation permission is granted for this one origin only.
await context.overridePermissions('https://minsu.xiaozhu.com', ['geolocation']);
await page.setGeolocation({
latitude: CONFIG.latitude,
longitude: CONFIG.longitude,
accuracy: 100
});
console.log(`✅ Geolocation set to ${CONFIG.latitude}, ${CONFIG.longitude}\n`);
// Load cookies (a cookie failure is logged and the run continues without them)
if (cookies && cookies.length > 0) {
try {
await page.setCookie(...cookies);
console.log(`🍪 Loaded ${cookies.length} cookies\n`);
} catch (err) {
console.log('⚠️ Cookie error:', err.message);
}
}
try {
// ===== FIX 2: Try direct URL first =====
console.log('🔍 Strategy 1: Trying direct Shanghai URL...');
// NOTE(review): the last candidate hard-codes "shanghai" instead of using
// CONFIG.cityEnglish like the other three.
const directUrls = [
`https://minsu.xiaozhu.com/${CONFIG.cityEnglish}`,
`https://minsu.xiaozhu.com/city/${CONFIG.cityEnglish}`,
`https://minsu.xiaozhu.com/search/${CONFIG.cityEnglish}`,
`https://minsu.xiaozhu.com/shanghai/${CONFIG.district}`
];
let successUrl = null;
for (const url of directUrls) {
try {
console.log(` Trying: ${url}`);
await page.goto(url, { waitUntil: 'networkidle2', timeout: 15000 });
await wait(2000);
// "Not found" heuristic: a plain substring check on the page text, so any
// page that contains "404" anywhere is also treated as missing.
const is404 = await page.evaluate(() => {
return document.body.textContent.includes('404') ||
document.body.textContent.includes('找不到');
});
if (!is404) {
console.log(` ✅ Success!`);
successUrl = url;
await screenshot(page, 'direct_url_success');
break;
} else {
console.log(` ❌ 404`);
}
} catch (e) {
// Navigation errors and timeouts just move on to the next candidate URL.
console.log(` ❌ Failed: ${e.message}`);
}
}
// If direct URL failed, use homepage search
if (!successUrl) {
console.log('\n🔍 Strategy 2: Homepage search with geolocation...');
await page.goto('https://minsu.xiaozhu.com/', {
waitUntil: 'networkidle2',
timeout: 30000
});
await wait(3000);
await screenshot(page, 'homepage');
// Check if geolocation worked and we see Shanghai content.
// The result is only logged; it does not change what happens next.
const cityDetected = await page.evaluate(() => {
const bodyText = document.body.textContent;
if (bodyText.includes('上海') || bodyText.includes('Shanghai')) {
return '上海';
} else if (bodyText.includes('北京') || bodyText.includes('Beijing')) {
return '北京';
}
return 'unknown';
});
console.log(` Detected city: ${cityDetected}`);
// Find search input: selectors are tried from most to least specific and
// the first one that matches an element wins.
console.log('\n⌨ Using search...');
const searchSelectors = [
'input[placeholder*="目的地"]',
'input[placeholder*="搜索"]',
'input[type="search"]',
'input[type="text"]'
];
let searchInput = null;
for (const selector of searchSelectors) {
searchInput = await page.$(selector);
if (searchInput) {
console.log(` Found: ${selector}`);
break;
}
}
if (searchInput) {
await searchInput.click();
await wait(500);
// ===== FIX 3: Clear any pre-filled text first =====
// NOTE(review): this assumes Control+A is "select all" in the launched
// browser; confirm for platforms where the modifier differs.
await page.keyboard.down('Control');
await page.keyboard.press('A');
await page.keyboard.up('Control');
await page.keyboard.press('Backspace');
// Type specific search query for Jiaotong University
await searchInput.type(CONFIG.searchQuery, { delay: 150 });
await wait(2000); // Wait for suggestions
await screenshot(page, 'search_typed');
// ===== FIX 4: Smart suggestion detection =====
console.log(`\n👆 Looking for suggestions matching "${CONFIG.searchQuery}"...`);
const shanghaiClicked = await page.evaluate((searchQuery, keyword) => {
// Look for suggestions containing our keyword (交通大学).
// Every visible div/li/a/span is a candidate, so container elements whose
// text merely contains the keyword are scored as well.
const allElements = document.querySelectorAll('div, li, a, span');
const matchingElements = [];
for (const el of allElements) {
const text = el.textContent.trim();
const rect = el.getBoundingClientRect();
// Must be visible
if (rect.width > 0 && rect.height > 0) {
// Prioritize exact keyword match
if (text.includes(keyword)) {
matchingElements.push({ el, text, score: 100 });
}
// Or search query match
else if (text.includes(searchQuery)) {
matchingElements.push({ el, text, score: 80 });
}
// Or contains Shanghai (short texts only)
else if (text.includes('上海') && text.length < 15) {
matchingElements.push({ el, text, score: 30 });
}
}
}
// Sort by score and click best match
matchingElements.sort((a, b) => b.score - a.score);
if (matchingElements.length > 0) {
// This console.log runs inside the page. No page 'console' listener is
// registered in this function, so it presumably never reaches the Node log.
console.log(`Found ${matchingElements.length} matching elements, clicking best: "${matchingElements[0].text}"`);
matchingElements[0].el.click();
return true;
}
return false;
}, CONFIG.searchQuery, CONFIG.keyword);
if (shanghaiClicked) {
console.log(' ✅ Clicked matching suggestion');
await wait(4000);
await screenshot(page, 'after_suggestion');
} else {
console.log(' ⚠️ No matching suggestion, pressing Enter...');
await page.keyboard.press('Enter');
await wait(3000);
}
}
// When no search input is found, nothing more is attempted here and the
// flow continues with whatever page is currently loaded.
}
// ===== FIX 5: Verify we're on Shanghai, if not, fix it =====
let currentUrl = page.url();
console.log(`\n📍 Current URL: ${currentUrl}`);
const cityCheck = await page.evaluate(() => {
const text = document.body.textContent;
return {
hasShanghai: text.includes('上海') || text.includes('Shanghai'),
hasBeijing: text.includes('北京') || text.includes('Beijing') ||
text.includes('天安门') || text.includes('朝阳'),
bodyPreview: text.substring(0, 300)
};
});
console.log(` Shanghai content: ${cityCheck.hasShanghai ? '✅' : '❌'}`);
console.log(` Beijing content: ${cityCheck.hasBeijing ? '⚠️ YES' : '✅ No'}`);
// The switch is attempted only when Beijing markers are present and no
// Shanghai marker is.
if (cityCheck.hasBeijing && !cityCheck.hasShanghai) {
console.log('\n🔧 Detected Beijing, attempting to switch to Shanghai...');
// Try to find Shanghai in the page
const switched = await page.evaluate((city) => {
// Look for any clickable Shanghai element (text exactly the city name,
// with or without the "市" suffix)
const elements = Array.from(document.querySelectorAll('a, div, span, button'));
for (const el of elements) {
const text = el.textContent.trim();
if ((text === city || text === city + '市') && el.getBoundingClientRect().width > 0) {
console.log(`Clicking: "${text}"`);
el.click();
return true;
}
}
// Try searching in a visible input. Only the first visible input is used:
// both branches below return.
const inputs = document.querySelectorAll('input[type="text"], input[type="search"]');
for (const input of inputs) {
if (input.getBoundingClientRect().width > 0) {
input.value = city;
input.dispatchEvent(new Event('input', { bubbles: true }));
// Try to submit
const form = input.closest('form');
if (form) {
form.dispatchEvent(new Event('submit', { bubbles: true }));
return true;
}
// Or press Enter (synthetic keydown event dispatched on the input)
const enterEvent = new KeyboardEvent('keydown', {
key: 'Enter',
code: 'Enter',
keyCode: 13,
bubbles: true
});
input.dispatchEvent(enterEvent);
return true;
}
}
return false;
}, CONFIG.city);
if (switched) {
console.log(' ✅ Triggered Shanghai switch');
await wait(4000);
await screenshot(page, 'after_switch');
currentUrl = page.url();
console.log(` 📍 New URL: ${currentUrl}`);
} else {
console.log(' ❌ Could not find Shanghai option');
}
}
// Extract listings
console.log('\n📊 Extracting listings...\n');
let allListings = [];
let previousCount = 0;
let noChangeCount = 0;
// FIRST: Scroll to bottom to trigger all lazy loading at once
console.log('⏬ Scrolling to page bottom to trigger lazy load...');
await page.evaluate(() => {
window.scrollTo(0, document.body.scrollHeight);
});
await wait(5000); // Wait for initial load
// SECOND: Progressive scrolling to load more
console.log(`⏳ Progressive scrolling (max ${CONFIG.maxScrolls} scrolls, ${CONFIG.noChangeThreshold} patience)...\n`);
for (let i = 0; i < CONFIG.maxScrolls; i++) {
const progress = Math.round((i / CONFIG.maxScrolls) * 100);
console.log(`🔄 Scroll ${i + 1}/${CONFIG.maxScrolls} (${progress}%)...`);
// Each pass re-reads the listing cards currently in the DOM.
const listings = await page.evaluate(() => {
const results = [];
const selectors = [
'.list-item', // PRIMARY - Found in HTML analysis
'.house-item', '.room-item', '.van-card',
'[class*="list-item"]', '[class*="house"]', '[class*="room"]'
];
let items = [];
for (const sel of selectors) {
const elements = document.querySelectorAll(sel);
// A selector is accepted only when it matches between 1 and 199 elements;
// larger matches are skipped in favour of the next selector.
if (elements.length > 0 && elements.length < 200) {
items = Array.from(elements);
console.log(`Using selector: ${sel} (${elements.length} items)`);
break; // Use first working selector
}
}
items.forEach((item, idx) => {
// Only the first 50 matched elements are read on each pass.
if (idx >= 50) return;
const listing = { index: idx + 1 };
// Debug: log all attributes of first item
if (idx === 0) {
console.log('DEBUG First item attributes:', {
className: item.className,
id: item.id,
attributes: Array.from(item.attributes || []).map(a => `${a.name}=${a.value}`),
innerHTML: item.innerHTML.substring(0, 200)
});
}
// Title - Try specific Xiaozhu classes first
const titleEl = item.querySelector('.list-title, h2, h3, h4, .title, .name, [class*="title"]');
if (titleEl) listing.title = titleEl.textContent.trim();
// Price - Try specific Xiaozhu classes first; the first run of digits in
// the price element's text is taken as the per-night price.
const priceEl = item.querySelector('.list-price, .price-left, .price, [class*="price"]');
if (priceEl) {
const match = priceEl.textContent.match(/(\d+)/);
if (match) {
listing.priceDaily = parseInt(match[1]);
listing.priceText = priceEl.textContent.trim();
}
}
// Fallback: look for a number followed by a price marker anywhere in the card text.
if (!listing.priceDaily) {
const match = item.textContent.match(/[¥¥]?\s*(\d+)\s*[元\/晚]/);
if (match) listing.priceDaily = parseInt(match[1]);
}
// Location - Extract from content or title
const contentEl = item.querySelector('.list-content, .content, .location, .address');
if (contentEl) listing.location = contentEl.textContent.trim();
// Also check title for location keywords
if (!listing.location && listing.title) {
listing.location = listing.title;
}
// URL - Try multiple approaches
// 1. Direct link
const linkEl = item.querySelector('a');
if (linkEl && linkEl.href && linkEl.href !== 'javascript:;') {
listing.url = linkEl.href;
}
// 2. Data attributes (listing ID)
if (!listing.url) {
const dataId = item.getAttribute('data-id') ||
item.getAttribute('data-house-id') ||
item.getAttribute('data-fid');
if (dataId) {
listing.url = `https://minsu.xiaozhu.com/house/${dataId}`;
listing.houseId = dataId;
}
}
// 3. Look for ID in onclick or other attributes (first run of 6+ digits)
if (!listing.url) {
const onclick = item.getAttribute('onclick') || item.getAttribute('@click');
if (onclick) {
const idMatch = onclick.match(/\d{6,}/);
if (idMatch) {
listing.url = `https://minsu.xiaozhu.com/house/${idMatch[0]}`;
listing.houseId = idMatch[0];
}
}
}
// 4. Check child elements for router-link
if (!listing.url) {
const routerLink = item.querySelector('[to], [router-link]');
if (routerLink) {
const to = routerLink.getAttribute('to') || routerLink.getAttribute('router-link');
if (to) {
listing.url = `https://minsu.xiaozhu.com${to}`;
}
}
}
// Image
const imgEl = item.querySelector('img');
if (imgEl) listing.image = imgEl.src;
// Equipment (check Chinese text, not lowercased)
const fullText = item.textContent;
listing.hasKitchen = fullText.includes('厨房') || fullText.includes('可做饭') || fullText.includes('可烧饭');
listing.hasFridge = fullText.includes('冰箱') || fullText.includes('冷藏');
listing.hasWashingMachine = fullText.includes('洗衣机');
// NOTE(review): the single character '站' also matches station names that
// are not metro stations, so this flag may be over-reported.
listing.hasMetro = fullText.includes('地铁') || fullText.includes('站');
// Keep only cards that yielded at least a title or a price.
if (listing.title || listing.priceDaily) {
results.push(listing);
}
});
return results;
});
// Better duplicate detection (URL or title+price)
const newListings = listings.filter(l => {
const isDuplicate = allListings.some(existing => {
// By URL if available
if (l.url && existing.url && l.url === existing.url) return true;
// By title + price combination (this also applies when the two URLs differ)
if (l.title && existing.title && l.priceDaily && existing.priceDaily) {
return l.title === existing.title && l.priceDaily === existing.priceDaily;
}
return false;
});
return !isDuplicate;
});
allListings = [...allListings, ...newListings];
console.log(` Found ${listings.length} items, ${newListings.length} new, ${allListings.length} total`);
// Stop early after CONFIG.noChangeThreshold consecutive passes that add nothing new.
if (allListings.length === previousCount) {
noChangeCount++;
if (noChangeCount >= CONFIG.noChangeThreshold) {
console.log(` No new listings for ${CONFIG.noChangeThreshold} scrolls, stopping...`);
break;
}
} else {
noChangeCount = 0;
}
previousCount = allListings.length;
// Scroll down by one viewport height
await page.evaluate(() => window.scrollBy(0, window.innerHeight));
// Wait for loading indicators to disappear
try {
await page.waitForFunction(() => {
// Check for common loading indicators; only an inline `display: none`
// style counts as hidden here.
const loadingEls = document.querySelectorAll('.loading, .spinner, [class*="loading"]');
return loadingEls.length === 0 || Array.from(loadingEls).every(el => el.style.display === 'none');
}, { timeout: 2000 });
} catch (e) {
// The wait timed out (an indicator was still considered visible) or
// failed; either way the loop carries on after the fixed delay below.
}
// Additional wait for lazy load
await wait(CONFIG.scrollDelay);
}
await screenshot(page, 'final');
console.log(`\n✅ Total extracted: ${allListings.length} listings\n`);
// Save raw listings for debug (written before any scoring or filtering)
fs.writeFileSync('./xiaozhu_raw_listings.json', JSON.stringify(allListings, null, 2));
console.log('💾 Raw listings saved to xiaozhu_raw_listings.json\n');
if (allListings.length === 0) {
// Nothing extracted: dump the page HTML and a short diagnosis to help
// work out which page the browser ended up on.
console.log('❌ No listings found!');
const html = await page.content();
fs.writeFileSync('./xiaozhu_fixed_page.html', html);
console.log('💾 Saved HTML to xiaozhu_fixed_page.html');
const pageInfo = await page.evaluate(() => ({
url: window.location.href,
title: document.title,
hasShanghai: document.body.textContent.includes('上海'),
hasBeijing: document.body.textContent.includes('北京'),
bodyPreview: document.body.textContent.substring(0, 500)
}));
console.log('\n📋 Page diagnosis:');
console.log(` URL: ${pageInfo.url}`);
console.log(` Title: ${pageInfo.title}`);
console.log(` Has Shanghai: ${pageInfo.hasShanghai ? '✅' : '❌'}`);
console.log(` Has Beijing: ${pageInfo.hasBeijing ? '⚠️' : '✅'}`);
console.log(` Preview: ${pageInfo.bodyPreview.substring(0, 200)}...`);
} else {
// Score and rank, then write the JSON and Markdown reports and print a summary.
const processed = processListings(allListings);
fs.writeFileSync(CONFIG.outputFile, JSON.stringify(processed, null, 2));
console.log(`💾 ${CONFIG.outputFile}`);
const markdown = generateMarkdown(processed);
fs.writeFileSync(CONFIG.outputMarkdown, markdown);
console.log(`📝 ${CONFIG.outputMarkdown}`);
printTopResults(processed);
}
} catch (err) {
// Errors from the scraping steps are reported here and not rethrown.
console.error('❌ Error:', err.message);
await screenshot(page, 'error');
} finally {
// The browser is closed only in headless mode; in headed mode it is left
// open and this function never closes it.
if (CONFIG.headless) {
await browser.close();
}
}
}
/**
 * Score, filter and rank scraped listings.
 *
 * Listings without a positive nightly price are dropped. Each remaining one
 * gets `priceTotal` (nightly price × CONFIG.days), `priceMonthly` (nightly
 * price × 30, rounded up) and a `score`:
 *   - price: a bonus when the stay total is within the ideal budget, a
 *     growing penalty between ideal and max, and a flat -100 above max;
 *   - equipment: kitchen +20, fridge +15, washing machine +10, metro +15;
 *   - location: district match +20, keyword match +10;
 *   - title: keyword match +10.
 * Listings costing more than 120% of the max stay budget are removed, the
 * rest are sorted best-first and cut to CONFIG.topN.
 *
 * The input objects are not modified; enriched copies are returned.
 *
 * @param {Array<object>} listings - Raw listings as produced by the scraper.
 * @returns {Array<object>} New listing objects with price and score fields added.
 */
function processListings(listings) {
  // Read the budget getters once instead of recomputing them per listing.
  const stayBudgetIdeal = CONFIG.dailyBudgetIdeal;
  const stayBudgetMax = CONFIG.dailyBudgetMax;
  const stayBudgetCap = stayBudgetMax * 1.2; // Allow 20% over budget

  return listings
    .filter((l) => l.priceDaily && l.priceDaily > 0)
    .map((l) => {
      const priceTotal = l.priceDaily * CONFIG.days;
      const priceMonthly = Math.ceil(l.priceDaily * 30);

      let score = 0;
      if (priceTotal <= stayBudgetIdeal) {
        score += (stayBudgetIdeal - priceTotal) / 100;
      } else if (priceTotal <= stayBudgetMax) {
        score -= (priceTotal - stayBudgetIdeal) / 50;
      } else {
        score -= 100;
      }

      if (l.hasKitchen) score += 20;
      if (l.hasFridge) score += 15;
      if (l.hasWashingMachine) score += 10;
      if (l.hasMetro) score += 15;

      if (l.location) {
        if (l.location.includes(CONFIG.district)) score += 20;
        if (l.location.includes(CONFIG.keyword)) score += 10;
      }
      if (l.title && l.title.includes(CONFIG.keyword)) score += 10;

      // Copy instead of mutating the caller's objects; the score keeps one decimal.
      return {
        ...l,
        priceTotal,
        priceMonthly,
        score: Math.round(score * 10) / 10
      };
    })
    // Kitchen/fridge are deliberately not required here: equipment may be
    // shown as icons that the text scan cannot detect.
    .filter((l) => l.priceTotal <= stayBudgetCap)
    .sort((a, b) => b.score - a.score)
    .slice(0, CONFIG.topN);
}
/**
 * Render ranked listings as a Markdown report: a short header followed by
 * one table row per listing.
 *
 * Titles are scraped text, so they are flattened to a single line, cut to 40
 * characters and have `|` escaped; otherwise a stray pipe or newline would
 * break the table layout.
 *
 * @param {Array<object>} listings - Processed listings (with priceTotal and score).
 * @returns {string} The Markdown document, ending with a newline.
 */
function generateMarkdown(listings) {
  const TITLE_LIMIT = 40;
  const mark = (flag) => (flag ? '✓' : '✗');

  const titleCell = (rawTitle) => {
    const flat = (rawTitle || '').replace(/\s+/g, ' ').trim() || 'Untitled';
    // Cut by code point so an emoji at the boundary is not split in half,
    // and escape pipes only after cutting so an escape is never truncated.
    const clipped = [...flat].slice(0, TITLE_LIMIT).join('');
    return clipped.replace(/\|/g, '\\|');
  };

  const header = [
    '# Xiaozhu Results - FIXED Scraper',
    '',
    `**Date:** ${new Date().toLocaleDateString()}`,
    `**Location:** ${CONFIG.city} ${CONFIG.district}`,
    `**Dates:** ${CONFIG.checkIn} → ${CONFIG.checkOut} (${CONFIG.days} days)`,
    '',
    '| # | Title | Daily | Total | Kitchen | Fridge | Washer | Metro | Score | Link |',
    '|---|-------|-------|-------|---------|--------|--------|-------|-------|------|'
  ];

  const rows = listings.map((l, i) => {
    const cells = [
      String(i + 1),
      titleCell(l.title),
      `¥${l.priceDaily}`,
      `¥${l.priceTotal}`,
      mark(l.hasKitchen),
      mark(l.hasFridge),
      mark(l.hasWashingMachine),
      mark(l.hasMetro),
      String(l.score),
      l.url ? `[View](${l.url})` : '-'
    ];
    return `| ${cells.join(' | ')} |`;
  });

  return `${[...header, ...rows].join('\n')}\n`;
}
/**
 * Print a console summary of the five best listings.
 *
 * @param {Array<object>} listings - Processed listings, best first.
 * @returns {void}
 */
function printTopResults(listings) {
  const mark = (flag) => (flag ? '✓' : '✗');

  console.log('\n🏆 TOP RESULTS:\n');
  for (const [i, l] of listings.slice(0, 5).entries()) {
    console.log(`${i + 1}. ${l.title || 'Untitled'}`);
    console.log(` 💰 ¥${l.priceDaily}/day × ${CONFIG.days} days = ¥${l.priceTotal}`);
    if (l.location) console.log(` 📍 ${l.location}`);
    console.log(` ✓ Kitchen: ${mark(l.hasKitchen)} | Fridge: ${mark(l.hasFridge)} | Washer: ${mark(l.hasWashingMachine)} | Metro: ${mark(l.hasMetro)}`);
    // Label the score; a bare number on its own line is unreadable in the summary.
    console.log(` ⭐ Score: ${l.score}`);
    if (l.url) console.log(` 🔗 ${l.url}`);
    console.log('');
  }
}
// Entry point: run the scraper and report any rejection that escapes it
// (for example a failure to launch the browser).
scrapXiaozhu().catch((err) => {
  console.error(err);
});