Files
bookshelf/tests/test_archives.py
Petr Polezhaev fd32be729f Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00

203 lines
7.9 KiB
Python

"""Network integration tests for archive searcher plugins.
Each test queries a live external service for "War and Peace" by Tolstoy,
a book universally catalogued in all supported archives.
Run with: pytest tests/ -m network
Skip with: pytest tests/ -m "not network" (default in presubmit)
"""
import re
import pytest
from models import CandidateRecord
from plugins.archives.alib import AlibPlugin
from plugins.archives.openlibrary import OpenLibraryPlugin
from plugins.archives.rsl import RSLPlugin
from plugins.archives.rusneb import RusnebPlugin
from plugins.archives.shpl import ShplPlugin
from plugins.archives.sru_catalog import SRUCatalogPlugin
from plugins.rate_limiter import RateLimiter
# Tag every test in this module with the "network" marker so the whole file
# can be selected with `-m network` or excluded with `-m "not network"`.
pytestmark = pytest.mark.network
# One RateLimiter instance shared by every plugin constructed in this module.
_RL = RateLimiter()
# Timeout handed to every plugin constructor.
# NOTE(review): presumably seconds — confirm against the plugin implementation.
_TIMEOUT = 15
# Matches a string made of exactly four ASCII digits and nothing else.
# ``\Z`` is used instead of ``$`` because ``$`` also matches just before a
# trailing newline, which would let a value such as "1869\n" count as a valid
# year. ``re.ASCII`` keeps ``\d`` from accepting non-ASCII digit characters.
_YEAR_PAT = re.compile(r"^\d{4}\Z", re.ASCII)
def _titles(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``title`` field of every record, preserving order."""
    collected = []
    for record in results:
        collected.append(record["title"])
    return collected
def _authors(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``author`` field of every record, preserving order."""
    collected = []
    for record in results:
        collected.append(record["author"])
    return collected
def _years(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``year`` field of every record, preserving order."""
    collected = []
    for record in results:
        collected.append(record["year"])
    return collected
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
    """Tell whether at least one record's title contains *fragment*.

    Both sides are lower-cased before the substring test, so the comparison
    ignores case.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["title"].lower():
            return True
    return False
def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
    """Tell whether at least one record's author contains *fragment*.

    Both sides are lower-cased before the substring test, so the comparison
    ignores case.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["author"].lower():
            return True
    return False
def _valid_year(year: str) -> bool:
    """Accept an empty year or one that matches ``_YEAR_PAT``.

    An empty string is how a record signals "no year available", so it is
    treated as valid without consulting the pattern.
    """
    if year == "":
        return True
    return _YEAR_PAT.match(year) is not None
# ── OpenLibrary ───────────────────────────────────────────────────────────────
def test_openlibrary_war_and_peace() -> None:
    """Search OpenLibrary for "War and Peace Tolstoy" and validate the records.

    Verifies that results come back, that every record carries the
    ``openlibrary`` source, that a matching title and author are present, and
    that year, isbn and publisher have the expected shape.
    """
    options = {
        "plugin_id": "openlibrary",
        "name": "OpenLibrary",
        "rate_limiter": _RL,
        "rate_limit_seconds": 0,
        "auto_queue": True,
        "timeout": _TIMEOUT,
        "config": {},
    }
    records = OpenLibraryPlugin(**options).search("War and Peace Tolstoy")

    assert records, "OpenLibrary returned no results"
    for rec in records:
        assert rec["source"] == "openlibrary"
    assert _has_title(records, "war and peace"), f"titles={_titles(records)}"

    # Authors may be stored in their original language, so either the Latin
    # or the Cyrillic spelling counts as a hit.
    author_found = _has_author(records, "tolstoy") or _has_author(records, "толст")
    assert author_found, f"authors={_authors(records)}"

    for rec in records:
        assert _valid_year(rec["year"]), f"years={_years(records)}"

    # isbn and publisher come from the JSON API; only their type is checked.
    for rec in records:
        assert isinstance(rec["isbn"], str)
    for rec in records:
        assert isinstance(rec["publisher"], str)
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
def test_rsl_voina_i_mir() -> None:
    """Search the RSL catalogue for the Russian title and validate the records.

    Verifies that results come back, that every record carries the ``rsl``
    source, that a matching title is present, that years are well-formed, and
    that isbn and publisher are left empty.
    """
    searcher = RSLPlugin(
        plugin_id="rsl",
        name="РГБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    found = searcher.search("Толстой Война и мир")

    assert found, "RSL returned no results"
    assert not any(item["source"] != "rsl" for item in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    for item in found:
        assert _valid_year(item["year"]), f"years={_years(found)}"
    # This plugin is expected to leave isbn and publisher blank.
    assert not any(item["isbn"] != "" for item in found)
    assert not any(item["publisher"] != "" for item in found)
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
def test_rusneb_voina_i_mir() -> None:
    """Search NEB (rusneb) for the Russian title and validate the records.

    Verifies that results come back, that every record carries the ``rusneb``
    source, that a matching title and author are present, that years are
    well-formed, and that isbn and publisher are left empty.
    """
    options = {
        "plugin_id": "rusneb",
        "name": "НЭБ",
        "rate_limiter": _RL,
        "rate_limit_seconds": 0,
        "auto_queue": True,
        "timeout": _TIMEOUT,
        "config": {},
    }
    hits = RusnebPlugin(**options).search("Война и мир Толстой")

    assert hits, "НЭБ returned no results"
    for hit in hits:
        assert hit["source"] == "rusneb"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert _has_author(hits, "толст"), f"authors={_authors(hits)}"
    assert not any(not _valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    # This plugin is expected to leave isbn and publisher blank.
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""
# ── Alib ─────────────────────────────────────────────────────────────────────
def test_alib_voina_i_mir() -> None:
    """Search Alib for the Russian title and validate the records.

    Verifies that results come back, that every record carries the
    ``alib_web`` source, that a matching title and author are present, that
    every record has a year matching ``_YEAR_PAT`` (an empty year is not
    accepted here), and that isbn and publisher are left empty.
    """
    searcher = AlibPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config={},
    )
    entries = searcher.search("Война и мир Толстой")

    assert entries, "Alib returned no results"
    assert not any(entry["source"] != "alib_web" for entry in entries)
    assert _has_title(entries, "война"), f"titles={_titles(entries)}"
    assert _has_author(entries, "толст"), f"authors={_authors(entries)}"

    # Alib's bibliographic text always carries a publication year, so the
    # pattern is applied directly rather than through _valid_year.
    for entry in entries:
        assert _YEAR_PAT.match(entry["year"]), f"years={_years(entries)}"

    for entry in entries:
        assert entry["isbn"] == ""
    for entry in entries:
        assert entry["publisher"] == ""
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
@pytest.mark.xfail(
    reason="nlr.ru SRU endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_nlr_voina_i_mir() -> None:
    """Search the NLR SRU catalogue for the Russian title and validate records.

    Marked as a non-strict expected failure because the endpoint is reported
    as gone (HTTP 404); the assertions describe what a working endpoint
    should return.
    """
    sru_settings = {
        "url": "http://www.nlr.ru/search/query",
        "query_prefix": "title=",
    }
    searcher = SRUCatalogPlugin(
        plugin_id="nlr",
        name="НЛР",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=sru_settings,
    )
    found = searcher.search("Война и мир")

    assert found, "НЛР returned no results"
    for item in found:
        assert item["source"] == "nlr"
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    for item in found:
        assert _valid_year(item["year"]), f"years={_years(found)}"
    assert not any(item["isbn"] != "" for item in found)
    assert not any(item["publisher"] != "" for item in found)
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
@pytest.mark.xfail(
    reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_shpl_voina_i_mir() -> None:
    """Search the SHPL catalogue for the Russian title and validate records.

    Marked as a non-strict expected failure because the endpoint is reported
    as gone (HTTP 404); the assertions describe what a working endpoint
    should return.
    """
    options = {
        "plugin_id": "shpl",
        "name": "ШПИЛ",
        "rate_limiter": _RL,
        "rate_limit_seconds": 0,
        "auto_queue": False,
        "timeout": _TIMEOUT,
        "config": {},
    }
    hits = ShplPlugin(**options).search("Война и мир")

    assert hits, "ШПИЛ returned no results"
    assert not any(hit["source"] != "shpl" for hit in hits)
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert not any(not _valid_year(hit["year"]) for hit in hits), f"years={_years(hits)}"
    for hit in hits:
        assert hit["isbn"] == ""
    for hit in hits:
        assert hit["publisher"] == ""