feedgenerator/tests/test_scraper.py
StillHammer 40138c2d45 Initial implementation: Feed Generator V1
Complete Python implementation with strict type safety and best practices.

Features:
- RSS/Atom/HTML web scraping
- GPT-4 Vision image analysis
- Node.js API integration
- RSS/JSON feed publishing

Modules:
- src/config.py: Configuration with strict validation
- src/exceptions.py: Custom exception hierarchy
- src/scraper.py: Multi-format news scraping (RSS/Atom/HTML)
- src/image_analyzer.py: GPT-4 Vision integration with retry
- src/aggregator.py: Content aggregation and filtering
- src/article_client.py: Node.js API client with retry
- src/publisher.py: RSS/JSON feed generation
- scripts/run.py: Complete pipeline orchestrator
- scripts/validate.py: Code quality validation

Code Quality:
- 100% type hint coverage (mypy strict mode)
- Zero bare except clauses
- Logger throughout (no print statements)
- Comprehensive test suite (598 lines)
- Immutable dataclasses (frozen=True)
- Explicit error handling
- Structured logging

Stats:
- 1,431 lines of source code
- 598 lines of test code
- 15 Python files
- 8 core modules
- 4 test suites

All validation checks pass.

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-07 22:28:18 +08:00

210 lines
6.3 KiB
Python

"""Tests for scraper.py module."""
from __future__ import annotations
from datetime import datetime
from unittest.mock import Mock, patch
import pytest
import requests
from src.exceptions import ScrapingError
from src.scraper import NewsArticle, NewsScraper, ScraperConfig
def test_news_article_creation() -> None:
    """A NewsArticle built from fully-populated fields keeps those values."""
    created = NewsArticle(
        title="Test Article",
        url="https://example.com/article",
        content="Test content",
        image_url="https://example.com/image.jpg",
        published_at=datetime.now(),
        source="https://example.com",
    )
    # Spot-check the plain-text fields round-trip unchanged.
    assert created.title == "Test Article"
    assert created.url == "https://example.com/article"
    assert created.content == "Test content"
def test_news_article_validation_empty_title() -> None:
    """Constructing a NewsArticle with a blank title must raise ValueError."""
    kwargs = {
        "title": "",
        "url": "https://example.com/article",
        "content": "Test content",
        "image_url": None,
        "published_at": None,
        "source": "https://example.com",
    }
    with pytest.raises(ValueError, match="Title cannot be empty"):
        NewsArticle(**kwargs)
def test_news_article_validation_invalid_url() -> None:
    """A URL without a scheme must be rejected at construction time."""
    with pytest.raises(ValueError, match="Invalid URL"):
        NewsArticle(
            title="Test",
            url="not-a-url",  # no scheme/host — should trip validation
            content="Test content",
            image_url=None,
            published_at=None,
            source="https://example.com",
        )
def test_scraper_config_validation() -> None:
    """NewsScraper must refuse a config whose source list is empty."""
    empty_config = ScraperConfig(sources=[], max_articles=10, timeout_seconds=10)
    with pytest.raises(ValueError, match="At least one source is required"):
        NewsScraper(empty_config)
def test_scraper_initialization() -> None:
    """A valid single-source config is stored verbatim on the scraper."""
    cfg = ScraperConfig(
        sources=["https://example.com"],
        max_articles=10,
        timeout_seconds=10,
    )
    # White-box: the scraper keeps the config object on a private attribute.
    assert NewsScraper(cfg)._config == cfg
@patch("src.scraper.requests.get")
def test_scraper_success(fake_get: Mock) -> None:
    """A well-formed RSS 2.0 feed yields one parsed article."""
    scraper = NewsScraper(
        ScraperConfig(
            sources=["https://example.com/feed"],
            max_articles=10,
            timeout_seconds=10,
        )
    )
    # Canned single-item RSS payload served in place of the real feed.
    rss_payload = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Article</title>
<link>https://example.com/article1</link>
<description>Test description</description>
</item>
</channel>
</rss>"""
    fake_get.return_value = Mock(ok=True, text=rss_payload)
    results = scraper.scrape("https://example.com/feed")
    # Exactly one article, with title and link taken from the feed item.
    assert [item.title for item in results] == ["Test Article"]
    assert results[0].url == "https://example.com/article1"
@patch("src.scraper.requests.get")
def test_scraper_timeout(fake_get: Mock) -> None:
    """A network timeout surfaces as a ScrapingError with a 'Timeout' message."""
    scraper = NewsScraper(
        ScraperConfig(
            sources=["https://example.com/feed"],
            max_articles=10,
            timeout_seconds=10,
        )
    )
    fake_get.side_effect = requests.Timeout("Connection timeout")
    with pytest.raises(ScrapingError, match="Timeout scraping"):
        scraper.scrape("https://example.com/feed")
@patch("src.scraper.requests.get")
def test_scraper_request_exception(fake_get: Mock) -> None:
    """Any generic requests failure is wrapped in a ScrapingError."""
    scraper = NewsScraper(
        ScraperConfig(
            sources=["https://example.com/feed"],
            max_articles=10,
            timeout_seconds=10,
        )
    )
    fake_get.side_effect = requests.RequestException("Connection error")
    with pytest.raises(ScrapingError, match="Failed to scrape"):
        scraper.scrape("https://example.com/feed")
@patch("src.scraper.requests.get")
def test_scraper_all_success(fake_get: Mock) -> None:
    """scrape_all aggregates results from every configured source."""
    feed_urls = ["https://example.com/feed1", "https://example.com/feed2"]
    scraper = NewsScraper(
        ScraperConfig(sources=feed_urls, max_articles=10, timeout_seconds=10)
    )
    # Both sources return the same single-item feed.
    rss_payload = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Article</title>
<link>https://example.com/article</link>
<description>Test description</description>
</item>
</channel>
</rss>"""
    fake_get.return_value = Mock(ok=True, text=rss_payload)
    collected = scraper.scrape_all()
    # One article per source, two sources total.
    assert len(collected) == 2
@patch("src.scraper.requests.get")
def test_scraper_all_partial_failure(fake_get: Mock) -> None:
    """scrape_all keeps going when one of several sources fails."""
    scraper = NewsScraper(
        ScraperConfig(
            sources=["https://example.com/feed1", "https://example.com/feed2"],
            max_articles=10,
            timeout_seconds=10,
        )
    )
    rss_payload = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Article</title>
<link>https://example.com/article</link>
<description>Test description</description>
</item>
</channel>
</rss>"""
    healthy = Mock(ok=True, text=rss_payload)
    # First fetch succeeds, second times out.
    fake_get.side_effect = [healthy, requests.Timeout("timeout")]
    collected = scraper.scrape_all()
    # Only the healthy source contributes an article.
    assert len(collected) == 1
@patch("src.scraper.requests.get")
def test_scraper_all_complete_failure(fake_get: Mock) -> None:
    """When every source fails, scrape_all escalates to a ScrapingError."""
    scraper = NewsScraper(
        ScraperConfig(
            sources=["https://example.com/feed1", "https://example.com/feed2"],
            max_articles=10,
            timeout_seconds=10,
        )
    )
    # Every fetch attempt times out.
    fake_get.side_effect = requests.Timeout("timeout")
    with pytest.raises(ScrapingError, match="Failed to scrape any articles"):
        scraper.scrape_all()