"""Tests for scraper.py module.""" from __future__ import annotations from datetime import datetime from unittest.mock import Mock, patch import pytest import requests from src.exceptions import ScrapingError from src.scraper import NewsArticle, NewsScraper, ScraperConfig def test_news_article_creation() -> None: """Test NewsArticle creation with valid data.""" article = NewsArticle( title="Test Article", url="https://example.com/article", content="Test content", image_url="https://example.com/image.jpg", published_at=datetime.now(), source="https://example.com", ) assert article.title == "Test Article" assert article.url == "https://example.com/article" assert article.content == "Test content" def test_news_article_validation_empty_title() -> None: """Test NewsArticle validation fails with empty title.""" with pytest.raises(ValueError, match="Title cannot be empty"): NewsArticle( title="", url="https://example.com/article", content="Test content", image_url=None, published_at=None, source="https://example.com", ) def test_news_article_validation_invalid_url() -> None: """Test NewsArticle validation fails with invalid URL.""" with pytest.raises(ValueError, match="Invalid URL"): NewsArticle( title="Test", url="not-a-url", content="Test content", image_url=None, published_at=None, source="https://example.com", ) def test_scraper_config_validation() -> None: """Test NewsScraper validates configuration.""" config = ScraperConfig(sources=[], max_articles=10, timeout_seconds=10) with pytest.raises(ValueError, match="At least one source is required"): NewsScraper(config) def test_scraper_initialization() -> None: """Test NewsScraper initialization with valid config.""" config = ScraperConfig( sources=["https://example.com"], max_articles=10, timeout_seconds=10 ) scraper = NewsScraper(config) assert scraper._config == config @patch("src.scraper.requests.get") def test_scraper_success(mock_get: Mock) -> None: """Test successful scraping.""" config = ScraperConfig( 
sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10 ) scraper = NewsScraper(config) # Mock RSS response mock_response = Mock() mock_response.ok = True mock_response.raise_for_status = Mock() mock_response.text = """ Test Article https://example.com/article1 Test description """ mock_get.return_value = mock_response articles = scraper.scrape("https://example.com/feed") assert len(articles) == 1 assert articles[0].title == "Test Article" assert articles[0].url == "https://example.com/article1" @patch("src.scraper.requests.get") def test_scraper_timeout(mock_get: Mock) -> None: """Test scraping handles timeout.""" config = ScraperConfig( sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10 ) scraper = NewsScraper(config) mock_get.side_effect = requests.Timeout("Connection timeout") with pytest.raises(ScrapingError, match="Timeout scraping"): scraper.scrape("https://example.com/feed") @patch("src.scraper.requests.get") def test_scraper_request_exception(mock_get: Mock) -> None: """Test scraping handles request exceptions.""" config = ScraperConfig( sources=["https://example.com/feed"], max_articles=10, timeout_seconds=10 ) scraper = NewsScraper(config) mock_get.side_effect = requests.RequestException("Connection error") with pytest.raises(ScrapingError, match="Failed to scrape"): scraper.scrape("https://example.com/feed") @patch("src.scraper.requests.get") def test_scraper_all_success(mock_get: Mock) -> None: """Test scrape_all with multiple sources.""" config = ScraperConfig( sources=["https://example.com/feed1", "https://example.com/feed2"], max_articles=10, timeout_seconds=10, ) scraper = NewsScraper(config) mock_response = Mock() mock_response.ok = True mock_response.raise_for_status = Mock() mock_response.text = """ Test Article https://example.com/article Test description """ mock_get.return_value = mock_response articles = scraper.scrape_all() assert len(articles) == 2 # 1 article from each source 
@patch("src.scraper.requests.get")
def test_scraper_all_partial_failure(mock_get: Mock) -> None:
    """Test scrape_all continues on partial failures."""
    config = ScraperConfig(
        sources=["https://example.com/feed1", "https://example.com/feed2"],
        max_articles=10,
        timeout_seconds=10,
    )
    scraper = NewsScraper(config)

    # First call succeeds, second fails.  NOTE(review): the success payload had
    # its XML markup stripped; reconstructed as minimal well-formed RSS 2.0
    # around the surviving title/link/description values — confirm against the
    # parser.
    mock_success = Mock()
    mock_success.ok = True
    mock_success.raise_for_status = Mock()
    mock_success.text = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <item>
      <title>Test Article</title>
      <link>https://example.com/article</link>
      <description>Test description</description>
    </item>
  </channel>
</rss>"""
    mock_get.side_effect = [mock_success, requests.Timeout("timeout")]

    articles = scraper.scrape_all()

    assert len(articles) == 1  # Only first source succeeded


@patch("src.scraper.requests.get")
def test_scraper_all_complete_failure(mock_get: Mock) -> None:
    """Test scrape_all raises when all sources fail."""
    config = ScraperConfig(
        sources=["https://example.com/feed1", "https://example.com/feed2"],
        max_articles=10,
        timeout_seconds=10,
    )
    scraper = NewsScraper(config)

    # Every request times out, so no source yields any articles.
    mock_get.side_effect = requests.Timeout("timeout")

    with pytest.raises(ScrapingError, match="Failed to scrape any articles"):
        scraper.scrape_all()