"""Tests for scraper.py module."""
from __future__ import annotations
from datetime import datetime
from unittest.mock import Mock, patch
import pytest
import requests
from src.exceptions import ScrapingError
from src.scraper import NewsArticle, NewsScraper, ScraperConfig
def test_news_article_creation() -> None:
    """A fully-populated NewsArticle exposes the values it was built with."""
    created = datetime.now()
    article = NewsArticle(
        title="Test Article",
        url="https://example.com/article",
        content="Test content",
        image_url="https://example.com/image.jpg",
        published_at=created,
        source="https://example.com",
    )
    # Spot-check the attributes the rest of the suite relies on.
    expected = {
        "title": "Test Article",
        "url": "https://example.com/article",
        "content": "Test content",
    }
    for attr, value in expected.items():
        assert getattr(article, attr) == value
def test_news_article_validation_empty_title() -> None:
    """Constructing a NewsArticle with an empty title raises ValueError."""
    kwargs = dict(
        title="",
        url="https://example.com/article",
        content="Test content",
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    with pytest.raises(ValueError, match="Title cannot be empty"):
        NewsArticle(**kwargs)
def test_news_article_validation_invalid_url() -> None:
    """Constructing a NewsArticle with a malformed URL raises ValueError."""
    kwargs = dict(
        title="Test",
        url="not-a-url",
        content="Test content",
        image_url=None,
        published_at=None,
        source="https://example.com",
    )
    with pytest.raises(ValueError, match="Invalid URL"):
        NewsArticle(**kwargs)
def test_scraper_config_validation() -> None:
    """NewsScraper rejects a config whose source list is empty."""
    empty_sources = ScraperConfig(sources=[], max_articles=10, timeout_seconds=10)
    with pytest.raises(ValueError, match="At least one source is required"):
        NewsScraper(empty_sources)
def test_scraper_initialization() -> None:
    """A valid config is stored unchanged on the scraper instance."""
    cfg = ScraperConfig(
        sources=["https://example.com"],
        max_articles=10,
        timeout_seconds=10,
    )
    # NOTE: reaches into the private attribute; acceptable in a unit test.
    assert NewsScraper(cfg)._config == cfg
@patch("src.scraper.requests.get")
def test_scraper_success(mock_get: Mock) -> None:
"""Test successful scraping."""
config = ScraperConfig(
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
)
scraper = NewsScraper(config)
# Mock RSS response
mock_response = Mock()
mock_response.ok = True
mock_response.raise_for_status = Mock()
mock_response.text = """
-
Test Article
https://example.com/article1
Test description
"""
mock_get.return_value = mock_response
articles = scraper.scrape("https://example.com/feed")
assert len(articles) == 1
assert articles[0].title == "Test Article"
assert articles[0].url == "https://example.com/article1"
@patch("src.scraper.requests.get")
def test_scraper_timeout(mock_get: Mock) -> None:
"""Test scraping handles timeout."""
config = ScraperConfig(
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
)
scraper = NewsScraper(config)
mock_get.side_effect = requests.Timeout("Connection timeout")
with pytest.raises(ScrapingError, match="Timeout scraping"):
scraper.scrape("https://example.com/feed")
@patch("src.scraper.requests.get")
def test_scraper_request_exception(mock_get: Mock) -> None:
"""Test scraping handles request exceptions."""
config = ScraperConfig(
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10
)
scraper = NewsScraper(config)
mock_get.side_effect = requests.RequestException("Connection error")
with pytest.raises(ScrapingError, match="Failed to scrape"):
scraper.scrape("https://example.com/feed")
@patch("src.scraper.requests.get")
def test_scraper_all_success(mock_get: Mock) -> None:
"""Test scrape_all with multiple sources."""
config = ScraperConfig(
sources=["https://example.com/feed1", "https://example.com/feed2"],
max_articles=10,
timeout_seconds=10,
)
scraper = NewsScraper(config)
mock_response = Mock()
mock_response.ok = True
mock_response.raise_for_status = Mock()
mock_response.text = """
-
Test Article
https://example.com/article
Test description
"""
mock_get.return_value = mock_response
articles = scraper.scrape_all()
assert len(articles) == 2 # 1 article from each source
@patch("src.scraper.requests.get")
def test_scraper_all_partial_failure(mock_get: Mock) -> None:
"""Test scrape_all continues on partial failures."""
config = ScraperConfig(
sources=["https://example.com/feed1", "https://example.com/feed2"],
max_articles=10,
timeout_seconds=10,
)
scraper = NewsScraper(config)
# First call succeeds, second fails
mock_success = Mock()
mock_success.ok = True
mock_success.raise_for_status = Mock()
mock_success.text = """
-
Test Article
https://example.com/article
Test description
"""
mock_get.side_effect = [mock_success, requests.Timeout("timeout")]
articles = scraper.scrape_all()
assert len(articles) == 1 # Only first source succeeded
@patch("src.scraper.requests.get")
def test_scraper_all_complete_failure(mock_get: Mock) -> None:
"""Test scrape_all raises when all sources fail."""
config = ScraperConfig(
sources=["https://example.com/feed1", "https://example.com/feed2"],
max_articles=10,
timeout_seconds=10,
)
scraper = NewsScraper(config)
mock_get.side_effect = requests.Timeout("timeout")
with pytest.raises(ScrapingError, match="Failed to scrape any articles"):
scraper.scrape_all()