feedgenerator/tests/test_aggregator.py
StillHammer 40138c2d45 Initial implementation: Feed Generator V1
Complete Python implementation with strict type safety and best practices.

Features:
- RSS/Atom/HTML web scraping
- GPT-4 Vision image analysis
- Node.js API integration
- RSS/JSON feed publishing

Modules:
- src/config.py: Configuration with strict validation
- src/exceptions.py: Custom exception hierarchy
- src/scraper.py: Multi-format news scraping (RSS/Atom/HTML)
- src/image_analyzer.py: GPT-4 Vision integration with retry
- src/aggregator.py: Content aggregation and filtering
- src/article_client.py: Node.js API client with retry
- src/publisher.py: RSS/JSON feed generation
- scripts/run.py: Complete pipeline orchestrator
- scripts/validate.py: Code quality validation

Code Quality:
- 100% type hint coverage (mypy strict mode)
- Zero bare except clauses
- Logger throughout (no print statements)
- Comprehensive test suite (598 lines)
- Immutable dataclasses (frozen=True)
- Explicit error handling
- Structured logging

Stats:
- 1,431 lines of source code
- 598 lines of test code
- 15 Python files
- 8 core modules
- 4 test suites

All validation checks pass.

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-07 22:28:18 +08:00

234 lines
6.7 KiB
Python

"""Tests for aggregator.py module."""
from __future__ import annotations
from datetime import datetime
import pytest
from src.aggregator import AggregatedContent, ContentAggregator
from src.image_analyzer import ImageAnalysis
from src.scraper import NewsArticle
def test_aggregated_content_creation() -> None:
    """Verify AggregatedContent keeps the article and analysis it is given."""
    image_url = "https://example.com/img.jpg"
    news_item = NewsArticle(
        title="Test",
        url="https://example.com",
        content="Content",
        image_url=image_url,
        published_at=None,
        source="https://example.com",
    )
    image_info = ImageAnalysis(
        image_url=image_url,
        description="Test description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )

    bundle = AggregatedContent(news=news_item, image_analysis=image_info)

    # Both fields must compare equal to the objects passed to the constructor.
    assert bundle.news == news_item
    assert bundle.image_analysis == image_info
def test_aggregated_content_to_prompt() -> None:
    """Verify to_generation_prompt() maps title, content and image description."""
    image_url = "https://example.com/img.jpg"
    news_item = NewsArticle(
        title="Test Title",
        url="https://example.com",
        content="Test Content",
        image_url=image_url,
        published_at=None,
        source="https://example.com",
    )
    image_info = ImageAnalysis(
        image_url=image_url,
        description="Image description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )

    prompt = AggregatedContent(
        news=news_item,
        image_analysis=image_info,
    ).to_generation_prompt()

    # Check each expected key individually, in the original assertion order.
    expected_entries = {
        "topic": "Test Title",
        "context": "Test Content",
        "image_description": "Image description",
    }
    for key, expected_value in expected_entries.items():
        assert prompt[key] == expected_value
def test_aggregated_content_to_prompt_no_image() -> None:
    """Verify the prompt omits "image_description" when there is no analysis."""
    imageless_article = NewsArticle(
        title="Test Title",
        url="https://example.com",
        content="Test Content",
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    bundle = AggregatedContent(news=imageless_article, image_analysis=None)

    prompt = bundle.to_generation_prompt()

    assert prompt["topic"] == "Test Title"
    assert prompt["context"] == "Test Content"
    # The key must be absent altogether, not merely empty.
    assert "image_description" not in prompt
def test_aggregator_initialization() -> None:
    """Verify ContentAggregator stores the min_confidence it is built with."""
    threshold = 0.5

    aggregator = ContentAggregator(min_confidence=threshold)

    # Reads the private attribute directly; no public accessor is used here.
    assert aggregator._min_confidence == threshold
def test_aggregator_invalid_confidence() -> None:
    """Verify constructing ContentAggregator with min_confidence=1.5 raises."""
    expected_message = "min_confidence must be between"

    # pytest treats `match` as a regex searched within the exception text.
    with pytest.raises(ValueError, match=expected_message):
        ContentAggregator(min_confidence=1.5)
def test_aggregator_aggregate_with_matching_analysis() -> None:
    """Verify aggregate() pairs an article with the analysis for its image URL."""
    image_url = "https://example.com/img.jpg"
    aggregator = ContentAggregator(min_confidence=0.5)
    news_item = NewsArticle(
        title="Test",
        url="https://example.com",
        content="Content",
        image_url=image_url,
        published_at=None,
        source="https://example.com",
    )
    image_info = ImageAnalysis(
        image_url=image_url,
        description="Description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )
    # The analyses mapping is keyed by the same URL the article carries.
    analyses_by_url = {image_url: image_info}

    results = aggregator.aggregate([news_item], analyses_by_url)

    assert len(results) == 1
    first = results[0]
    assert first.news == news_item
    assert first.image_analysis == image_info
def test_aggregator_aggregate_low_confidence() -> None:
    """Verify aggregate() drops an analysis whose confidence is below the minimum."""
    image_url = "https://example.com/img.jpg"
    aggregator = ContentAggregator(min_confidence=0.8)
    news_item = NewsArticle(
        title="Test",
        url="https://example.com",
        content="Content",
        image_url=image_url,
        published_at=None,
        source="https://example.com",
    )
    weak_analysis = ImageAnalysis(
        image_url=image_url,
        description="Description",
        confidence=0.5,  # lower than the aggregator's 0.8 minimum
        analysis_time=datetime.now(),
    )
    analyses_by_url = {image_url: weak_analysis}

    results = aggregator.aggregate([news_item], analyses_by_url)

    # The article is still returned; only its analysis is expected to be None.
    assert len(results) == 1
    assert results[0].image_analysis is None
def test_aggregator_aggregate_no_image() -> None:
    """Verify aggregate() returns an image-less article with no analysis."""
    aggregator = ContentAggregator()
    imageless_article = NewsArticle(
        title="Test",
        url="https://example.com",
        content="Content",
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    no_analyses = {}

    results = aggregator.aggregate([imageless_article], no_analyses)

    assert len(results) == 1
    assert results[0].image_analysis is None
def test_aggregator_aggregate_empty_articles() -> None:
    """Verify aggregate() raises ValueError when given no articles."""
    aggregator = ContentAggregator()
    expected_message = "At least one article is required"

    with pytest.raises(ValueError, match=expected_message):
        aggregator.aggregate([], {})
def test_aggregator_filter_by_image_required() -> None:
    """Verify filter_by_image_required() keeps only items that have an analysis."""
    aggregator = ContentAggregator()
    analysed_image_url = "https://example.com/img1.jpg"

    article_with_image = NewsArticle(
        title="Test1",
        url="https://example.com/1",
        content="Content1",
        image_url=analysed_image_url,
        published_at=None,
        source="https://example.com",
    )
    article_without_image = NewsArticle(
        title="Test2",
        url="https://example.com/2",
        content="Content2",
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    image_info = ImageAnalysis(
        image_url=analysed_image_url,
        description="Description",
        confidence=0.9,
        analysis_time=datetime.now(),
    )

    # One candidate carries an analysis, the other does not.
    candidates = [
        AggregatedContent(news=article_with_image, image_analysis=image_info),
        AggregatedContent(news=article_without_image, image_analysis=None),
    ]

    kept = aggregator.filter_by_image_required(candidates)

    assert len(kept) == 1
    assert kept[0].image_analysis is not None
def test_aggregator_limit_content_length() -> None:
    """Verify limit_content_length() shortens content and appends "..."."""
    aggregator = ContentAggregator()
    limit = 100
    oversized_article = NewsArticle(
        title="Test",
        url="https://example.com",
        content="A" * 1000,
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    bundle = AggregatedContent(news=oversized_article, image_analysis=None)

    results = aggregator.limit_content_length([bundle], max_length=limit)

    assert len(results) == 1
    shortened = results[0].news.content
    # Expected length is the 100-character limit plus the 3-character "...".
    assert len(shortened) == 103
    assert shortened.endswith("...")