Complete Python implementation with strict type safety and best practices.
Features:
- RSS/Atom/HTML web scraping
- GPT-4 Vision image analysis
- Node.js API integration
- RSS/JSON feed publishing
Modules:
- src/config.py: Configuration with strict validation
- src/exceptions.py: Custom exception hierarchy
- src/scraper.py: Multi-format news scraping (RSS/Atom/HTML)
- src/image_analyzer.py: GPT-4 Vision integration with retry
- src/aggregator.py: Content aggregation and filtering
- src/article_client.py: Node.js API client with retry
- src/publisher.py: RSS/JSON feed generation
- scripts/run.py: Complete pipeline orchestrator
- scripts/validate.py: Code quality validation
Code Quality:
- 100% type hint coverage (mypy strict mode)
- Zero bare except clauses
- Logger throughout (no print statements)
- Comprehensive test suite (598 lines)
- Immutable dataclasses (frozen=True)
- Explicit error handling
- Structured logging
Stats:
- 1,431 lines of source code
- 598 lines of test code
- 15 Python files
- 8 core modules
- 4 test suites
All validation checks pass.
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
234 lines
6.7 KiB
Python
234 lines
6.7 KiB
Python
"""Tests for aggregator.py module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
|
|
import pytest
|
|
|
|
from src.aggregator import AggregatedContent, ContentAggregator
|
|
from src.image_analyzer import ImageAnalysis
|
|
from src.scraper import NewsArticle
|
|
|
|
|
|
def test_aggregated_content_creation() -> None:
    """Check that AggregatedContent exposes the objects it was built from.

    An article and an image analysis are wrapped together; the wrapper's
    ``news`` and ``image_analysis`` attributes must compare equal to them.
    """
    # Arrange: literals shared by the article and its image analysis.
    site_url = "https://example.com"
    picture_url = "https://example.com/img.jpg"

    story = NewsArticle(
        title="Test",
        url=site_url,
        content="Content",
        image_url=picture_url,
        published_at=None,
        source=site_url,
    )
    vision_result = ImageAnalysis(
        image_url=picture_url,
        description="Test description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )

    # Act
    bundle = AggregatedContent(news=story, image_analysis=vision_result)

    # Assert
    assert bundle.news == story
    assert bundle.image_analysis == vision_result
|
|
|
|
|
|
def test_aggregated_content_to_prompt() -> None:
    """Check the generation prompt built from an article plus image analysis.

    The prompt returned by ``to_generation_prompt()`` must carry the article
    title under ``"topic"``, the article content under ``"context"`` and the
    analysis description under ``"image_description"``.
    """
    # Arrange: name the values the prompt is expected to echo back.
    expected_topic = "Test Title"
    expected_context = "Test Content"
    expected_image_text = "Image description"
    site_url = "https://example.com"
    picture_url = "https://example.com/img.jpg"

    story = NewsArticle(
        title=expected_topic,
        url=site_url,
        content=expected_context,
        image_url=picture_url,
        published_at=None,
        source=site_url,
    )
    vision_result = ImageAnalysis(
        image_url=picture_url,
        description=expected_image_text,
        confidence=0.9,
        analysis_time=datetime.now(),
    )
    bundle = AggregatedContent(news=story, image_analysis=vision_result)

    # Act
    generated = bundle.to_generation_prompt()

    # Assert
    assert generated["topic"] == expected_topic
    assert generated["context"] == expected_context
    assert generated["image_description"] == expected_image_text
|
|
|
|
|
|
def test_aggregated_content_to_prompt_no_image() -> None:
    """Check the generation prompt when no image analysis is attached.

    ``"topic"`` and ``"context"`` must still be present, while the
    ``"image_description"`` key must be absent from the prompt.
    """
    # Arrange: an article without an image, wrapped without an analysis.
    expected_topic = "Test Title"
    expected_context = "Test Content"
    site_url = "https://example.com"

    story = NewsArticle(
        title=expected_topic,
        url=site_url,
        content=expected_context,
        image_url=None,
        published_at=None,
        source=site_url,
    )
    bundle = AggregatedContent(news=story, image_analysis=None)

    # Act
    generated = bundle.to_generation_prompt()

    # Assert
    assert generated["topic"] == expected_topic
    assert generated["context"] == expected_context
    assert "image_description" not in generated
|
|
|
|
|
|
def test_aggregator_initialization() -> None:
    """Check that ContentAggregator keeps the ``min_confidence`` it is given."""
    threshold = 0.5

    instance = ContentAggregator(min_confidence=threshold)

    # The test deliberately inspects the private attribute holding the value.
    assert instance._min_confidence == threshold
|
|
|
|
|
|
def test_aggregator_invalid_confidence() -> None:
    """Check that constructing ContentAggregator with 1.5 raises ValueError.

    The error message must match ``"min_confidence must be between"``.
    """
    rejected_value = 1.5
    message_fragment = "min_confidence must be between"

    with pytest.raises(ValueError, match=message_fragment):
        ContentAggregator(min_confidence=rejected_value)
|
|
|
|
|
|
def test_aggregator_aggregate_with_matching_analysis() -> None:
    """Check that ``aggregate`` pairs an article with its image analysis.

    The analysis is supplied in a mapping keyed by the article's image URL and
    has confidence 0.9 against a ``min_confidence`` of 0.5; the single result
    must carry both the article and that analysis.
    """
    # Arrange
    site_url = "https://example.com"
    picture_url = "https://example.com/img.jpg"

    combiner = ContentAggregator(min_confidence=0.5)
    story = NewsArticle(
        title="Test",
        url=site_url,
        content="Content",
        image_url=picture_url,
        published_at=None,
        source=site_url,
    )
    vision_result = ImageAnalysis(
        image_url=picture_url,
        description="Description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )
    analyses_by_url = {picture_url: vision_result}

    # Act
    results = combiner.aggregate([story], analyses_by_url)

    # Assert
    assert len(results) == 1
    first = results[0]
    assert first.news == story
    assert first.image_analysis == vision_result
|
|
|
|
|
|
def test_aggregator_aggregate_low_confidence() -> None:
    """Check that ``aggregate`` drops an analysis under the confidence floor.

    With ``min_confidence`` 0.8 and an analysis confidence of 0.5, the article
    must still be returned but with ``image_analysis`` set to ``None``.
    """
    # Arrange
    site_url = "https://example.com"
    picture_url = "https://example.com/img.jpg"

    combiner = ContentAggregator(min_confidence=0.8)
    story = NewsArticle(
        title="Test",
        url=site_url,
        content="Content",
        image_url=picture_url,
        published_at=None,
        source=site_url,
    )
    weak_result = ImageAnalysis(
        image_url=picture_url,
        description="Description",
        confidence=0.5,  # lower than the 0.8 floor configured above
        analysis_time=datetime.now(),
    )
    analyses_by_url = {picture_url: weak_result}

    # Act
    results = combiner.aggregate([story], analyses_by_url)

    # Assert: the article survives, its analysis does not.
    assert len(results) == 1
    assert results[0].image_analysis is None
|
|
|
|
|
|
def test_aggregator_aggregate_no_image() -> None:
    """Check that ``aggregate`` keeps an article that has no image.

    The article has ``image_url=None`` and the analyses mapping is empty; one
    result is expected, with ``image_analysis`` left as ``None``.
    """
    # Arrange
    site_url = "https://example.com"
    story = NewsArticle(
        title="Test",
        url=site_url,
        content="Content",
        image_url=None,
        published_at=None,
        source=site_url,
    )
    combiner = ContentAggregator()

    # Act
    results = combiner.aggregate([story], {})

    # Assert
    assert len(results) == 1
    assert results[0].image_analysis is None
|
|
|
|
|
|
def test_aggregator_aggregate_empty_articles() -> None:
    """Check that ``aggregate`` raises ValueError for an empty article list.

    The error message must match ``"At least one article is required"``.
    """
    combiner = ContentAggregator()
    message_fragment = "At least one article is required"

    with pytest.raises(ValueError, match=message_fragment):
        combiner.aggregate([], {})
|
|
|
|
|
|
def test_aggregator_filter_by_image_required() -> None:
    """Check that ``filter_by_image_required`` keeps only analysed items.

    Two aggregated items are passed in, one with an image analysis and one
    without; exactly one item must come back and it must have an analysis.
    """
    # Arrange
    site_url = "https://example.com"
    picture_url = "https://example.com/img1.jpg"

    combiner = ContentAggregator()

    illustrated_story = NewsArticle(
        title="Test1",
        url="https://example.com/1",
        content="Content1",
        image_url=picture_url,
        published_at=None,
        source=site_url,
    )
    plain_story = NewsArticle(
        title="Test2",
        url="https://example.com/2",
        content="Content2",
        image_url=None,
        published_at=None,
        source=site_url,
    )
    vision_result = ImageAnalysis(
        image_url=picture_url,
        description="Description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )

    with_analysis = AggregatedContent(
        news=illustrated_story, image_analysis=vision_result
    )
    without_analysis = AggregatedContent(news=plain_story, image_analysis=None)
    candidates = [with_analysis, without_analysis]

    # Act
    kept = combiner.filter_by_image_required(candidates)

    # Assert
    assert len(kept) == 1
    assert kept[0].image_analysis is not None
|
|
|
|
|
|
def test_aggregator_limit_content_length() -> None:
    """Check that ``limit_content_length`` shortens over-long article content.

    A 1000-character body limited with ``max_length=100`` must come back as
    103 characters ending in ``"..."``.
    """
    # Arrange
    max_chars = 100
    suffix = "..."
    site_url = "https://example.com"
    oversized_body = "A" * 1000

    story = NewsArticle(
        title="Test",
        url=site_url,
        content=oversized_body,
        image_url=None,
        published_at=None,
        source=site_url,
    )
    bundle = AggregatedContent(news=story, image_analysis=None)
    combiner = ContentAggregator()

    # Act
    shortened = combiner.limit_content_length([bundle], max_length=max_chars)

    # Assert
    assert len(shortened) == 1
    shortened_body = shortened[0].news.content
    assert len(shortened_body) == max_chars + len(suffix)  # 100 + "..."
    assert shortened_body.endswith(suffix)
|