Complete Python implementation with strict type safety and best practices.
Features:
- RSS/Atom/HTML web scraping
- GPT-4 Vision image analysis
- Node.js API integration
- RSS/JSON feed publishing
Modules:
- src/config.py: Configuration with strict validation
- src/exceptions.py: Custom exception hierarchy
- src/scraper.py: Multi-format news scraping (RSS/Atom/HTML)
- src/image_analyzer.py: GPT-4 Vision integration with retry
- src/aggregator.py: Content aggregation and filtering
- src/article_client.py: Node.js API client with retry
- src/publisher.py: RSS/JSON feed generation
- scripts/run.py: Complete pipeline orchestrator (see the sketch after this list)
- scripts/validate.py: Code quality validation
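
The module list implies a linear scrape -> analyze -> aggregate -> publish pipeline. Below is a minimal sketch of how scripts/run.py might wire the modules together. NewsScraper, ScraperConfig, and scrape_all() are confirmed by the test file further down; ContentAggregator and FeedPublisher are hypothetical names used only for illustration.

from __future__ import annotations

from src.scraper import NewsScraper, ScraperConfig
from src.aggregator import ContentAggregator  # hypothetical class name
from src.publisher import FeedPublisher  # hypothetical class name


def run_pipeline(sources: list[str]) -> None:
    """Sketch of the run.py orchestration; see lead-in for assumptions."""
    scraper = NewsScraper(
        ScraperConfig(sources=sources, max_articles=10, timeout_seconds=10)
    )
    # Per the tests, scrape_all() raises ScrapingError only if every source fails.
    articles = scraper.scrape_all()
    # Aggregation/filtering and feed-generation steps (class/method names assumed).
    curated = ContentAggregator().aggregate(articles)
    FeedPublisher().publish(curated)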
Code Quality:
- 100% type hint coverage (mypy strict mode)
- Zero bare except clauses
- Logger throughout (no print statements)
- Comprehensive test suite (598 lines)
- Immutable dataclasses (frozen=True; see the sketch after this list)
- Explicit error handling
- Structured logging
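
A minimal sketch of the frozen-dataclass validation pattern these points describe, reconstructed from the test file below. The field names and error messages match the tests; the URL check shown is an assumption, not necessarily the exact logic in src/scraper.py.

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime


@dataclass(frozen=True)
class NewsArticle:
    """Immutable article record; mirrors the shape exercised by the tests."""

    title: str
    url: str
    content: str
    image_url: str | None
    published_at: datetime | None
    source: str

    def __post_init__(self) -> None:
        # Error messages match those asserted in the test file below.
        if not self.title:
            raise ValueError("Title cannot be empty")
        if not self.url.startswith(("http://", "https://")):  # assumed check
            raise ValueError(f"Invalid URL: {self.url}")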
Stats:
- 1,431 lines of source code
- 598 lines of test code
- 15 Python files
- 8 core modules
- 4 test suites
All validation checks pass.
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
210 lines · 6.3 KiB · Python
"""Tests for scraper.py module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
import requests
|
|
|
|
from src.exceptions import ScrapingError
|
|
from src.scraper import NewsArticle, NewsScraper, ScraperConfig
|
|
|
|
|
|
def test_news_article_creation() -> None:
|
|
"""Test NewsArticle creation with valid data."""
|
|
article = NewsArticle(
|
|
title="Test Article",
|
|
url="https://example.com/article",
|
|
content="Test content",
|
|
image_url="https://example.com/image.jpg",
|
|
published_at=datetime.now(),
|
|
source="https://example.com",
|
|
)
|
|
|
|
assert article.title == "Test Article"
|
|
assert article.url == "https://example.com/article"
|
|
assert article.content == "Test content"
|
|
|
|
|
|
def test_news_article_validation_empty_title() -> None:
|
|
"""Test NewsArticle validation fails with empty title."""
|
|
with pytest.raises(ValueError, match="Title cannot be empty"):
|
|
NewsArticle(
|
|
title="",
|
|
url="https://example.com/article",
|
|
content="Test content",
|
|
image_url=None,
|
|
published_at=None,
|
|
source="https://example.com",
|
|
)
|
|
|
|
|
|
def test_news_article_validation_invalid_url() -> None:
|
|
"""Test NewsArticle validation fails with invalid URL."""
|
|
with pytest.raises(ValueError, match="Invalid URL"):
|
|
NewsArticle(
|
|
title="Test",
|
|
url="not-a-url",
|
|
content="Test content",
|
|
image_url=None,
|
|
published_at=None,
|
|
source="https://example.com",
|
|
)
|
|
|
|
|
|
def test_scraper_config_validation() -> None:
|
|
"""Test NewsScraper validates configuration."""
|
|
config = ScraperConfig(sources=[], max_articles=10, timeout_seconds=10)
|
|
|
|
with pytest.raises(ValueError, match="At least one source is required"):
|
|
NewsScraper(config)
|
|
|
|
|
|
def test_scraper_initialization() -> None:
|
|
"""Test NewsScraper initialization with valid config."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com"], max_articles=10, timeout_seconds=10
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
assert scraper._config == config
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_success(mock_get: Mock) -> None:
|
|
"""Test successful scraping."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
# Mock RSS response
|
|
mock_response = Mock()
|
|
mock_response.ok = True
|
|
mock_response.raise_for_status = Mock()
|
|
mock_response.text = """<?xml version="1.0"?>
|
|
<rss version="2.0">
|
|
<channel>
|
|
<item>
|
|
<title>Test Article</title>
|
|
<link>https://example.com/article1</link>
|
|
<description>Test description</description>
|
|
</item>
|
|
</channel>
|
|
</rss>"""
|
|
mock_get.return_value = mock_response
|
|
|
|
articles = scraper.scrape("https://example.com/feed")
|
|
|
|
assert len(articles) == 1
|
|
assert articles[0].title == "Test Article"
|
|
assert articles[0].url == "https://example.com/article1"
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_timeout(mock_get: Mock) -> None:
|
|
"""Test scraping handles timeout."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
mock_get.side_effect = requests.Timeout("Connection timeout")
|
|
|
|
with pytest.raises(ScrapingError, match="Timeout scraping"):
|
|
scraper.scrape("https://example.com/feed")
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_request_exception(mock_get: Mock) -> None:
|
|
"""Test scraping handles request exceptions."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
mock_get.side_effect = requests.RequestException("Connection error")
|
|
|
|
with pytest.raises(ScrapingError, match="Failed to scrape"):
|
|
scraper.scrape("https://example.com/feed")
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_all_success(mock_get: Mock) -> None:
|
|
"""Test scrape_all with multiple sources."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed1", "https://example.com/feed2"],
|
|
max_articles=10,
|
|
timeout_seconds=10,
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
mock_response = Mock()
|
|
mock_response.ok = True
|
|
mock_response.raise_for_status = Mock()
|
|
mock_response.text = """<?xml version="1.0"?>
|
|
<rss version="2.0">
|
|
<channel>
|
|
<item>
|
|
<title>Test Article</title>
|
|
<link>https://example.com/article</link>
|
|
<description>Test description</description>
|
|
</item>
|
|
</channel>
|
|
</rss>"""
|
|
mock_get.return_value = mock_response
|
|
|
|
articles = scraper.scrape_all()
|
|
|
|
assert len(articles) == 2 # 1 article from each source
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_all_partial_failure(mock_get: Mock) -> None:
|
|
"""Test scrape_all continues on partial failures."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed1", "https://example.com/feed2"],
|
|
max_articles=10,
|
|
timeout_seconds=10,
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
# First call succeeds, second fails
|
|
mock_success = Mock()
|
|
mock_success.ok = True
|
|
mock_success.raise_for_status = Mock()
|
|
mock_success.text = """<?xml version="1.0"?>
|
|
<rss version="2.0">
|
|
<channel>
|
|
<item>
|
|
<title>Test Article</title>
|
|
<link>https://example.com/article</link>
|
|
<description>Test description</description>
|
|
</item>
|
|
</channel>
|
|
</rss>"""
|
|
|
|
mock_get.side_effect = [mock_success, requests.Timeout("timeout")]
|
|
|
|
articles = scraper.scrape_all()
|
|
|
|
assert len(articles) == 1 # Only first source succeeded
|
|
|
|
|
|
@patch("src.scraper.requests.get")
|
|
def test_scraper_all_complete_failure(mock_get: Mock) -> None:
|
|
"""Test scrape_all raises when all sources fail."""
|
|
config = ScraperConfig(
|
|
sources=["https://example.com/feed1", "https://example.com/feed2"],
|
|
max_articles=10,
|
|
timeout_seconds=10,
|
|
)
|
|
scraper = NewsScraper(config)
|
|
|
|
mock_get.side_effect = requests.Timeout("timeout")
|
|
|
|
with pytest.raises(ScrapingError, match="Failed to scrape any articles"):
|
|
scraper.scrape_all()
|
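
Assuming the file lives under tests/ (the path is not shown in this listing), the suite runs with pytest:

python -m pytest tests/test_scraper.py -v

The @patch("src.scraper.requests.get") decorators replace the requests.get reference inside src.scraper for the duration of each test, so no network access is needed.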