links (link strategy)
+ brief_class — CSS class for brief record rows (brief strategy)
+ img_alt — truthy: extract titles from <img alt="…"> attributes (rusneb strategy)
+ bold_text — truthy: extract author/title from <b>…</b> blocks (alib strategy)
"""
category = "archive_searchers"
@@ -51,30 +108,118 @@ class HtmlScraperPlugin:
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
+ """Search for books matching query.
+
+ Args:
+ query: Free-text search string (author, title, keywords).
+
+ Returns:
+ Up to three CandidateRecord dicts with source, title, author, year,
+ isbn, and publisher fields.
+ """
cfg = self.config
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
- params: dict[str, Any] = dict(cfg.get("extra_params") or {})
- params[cfg["search_param"]] = query
- r = httpx.get(
- cfg["url"],
- params=params,
- timeout=self.timeout,
- headers={"User-Agent": "Mozilla/5.0"},
- )
- html = r.text
+
+ encoding = str(cfg.get("encoding") or "")
+ if encoding:
+ # Encode query and extra params in the site's native encoding.
+ q_enc = quote(query.encode(encoding, "replace"))
+ ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
+ ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
+ raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
+ r = httpx.get(
+ f'{cfg["url"]}?{raw_qs}',
+ timeout=self.timeout,
+ headers={"User-Agent": "Mozilla/5.0"},
+ )
+ html = r.content.decode(encoding, errors="replace")
+ else:
+ params: dict[str, Any] = dict(cfg.get("extra_params") or {})
+ params[cfg["search_param"]] = query
+ r = httpx.get(
+ cfg["url"],
+ params=params,
+ timeout=self.timeout,
+ headers={"User-Agent": "Mozilla/5.0"},
+ )
+ html = r.text
+
years = _YEAR_RE.findall(html)
- # Strategy: link_href_pattern (alib-style)
+ if cfg.get("bold_text"):
+ return self._parse_bold_text(html, years)
+ if cfg.get("img_alt"):
+ return self._parse_img_alt(html, years, cfg)
if "link_href_pattern" in cfg:
return self._parse_link(html, years, cfg)
-
- # Strategy: brief_class (shpl-style)
if "brief_class" in cfg:
return self._parse_brief(html, years, cfg)
-
- # Strategy: title_class + author_class (rusneb-style)
return self._parse_class(html, years, cfg)
+ def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
+        """Extract records from ``<b>text</b>`` entries (Alib-style).
+
+ The bold text is expected to begin with ``Surname I.N. Title…``; the
+ author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
+
+ Args:
+ html: Decoded HTML response.
+ years: Year strings found in the full HTML (used positionally).
+
+ Returns:
+ Up to three CandidateRecord dicts.
+ """
+        entries = re.findall(r"<b>([^<]{5,200})</b>", html)[:3]
+ out: list[CandidateRecord] = []
+ for i, entry in enumerate(entries):
+ text = entry.strip()
+ m = _AUTHOR_PREFIX_PAT.match(text)
+ if m:
+ author = m.group(1).strip()
+ title = m.group(2).strip()
+ else:
+ author = ""
+ title = text
+ out.append(
+ CandidateRecord(
+ source=self.plugin_id,
+ title=title,
+ author=author,
+ year=years[i] if i < len(years) else "",
+ isbn="",
+ publisher="",
+ )
+ )
+ return out
+
+ def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
+        """Extract records using ``<img alt="…">`` for titles and a CSS class for authors.
+
+ Used for sites like rusneb.ru where thumbnail alt attributes carry the
+ book title and a separate span contains the author.
+
+ Args:
+ html: Decoded HTML response.
+ years: Year strings found in the full HTML (used positionally).
+ cfg: Plugin config dict (reads ``author_class``).
+
+ Returns:
+ Up to three CandidateRecord dicts.
+ """
+ titles = _img_alts(html)
+ authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
+ return [
+ CandidateRecord(
+ source=self.plugin_id,
+ title=title,
+ author=authors[i] if i < len(authors) else "",
+ year=years[i] if i < len(years) else "",
+ isbn="",
+ publisher="",
+ )
+ for i, title in enumerate(titles)
+ ]
+
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
diff --git a/src/plugins/archives/rsl.py b/src/plugins/archives/rsl.py
index 18edaab..2a2d5fd 100644
--- a/src/plugins/archives/rsl.py
+++ b/src/plugins/archives/rsl.py
@@ -1,5 +1,17 @@
-"""RSL (Russian State Library) AJAX JSON search API plugin (search.rsl.ru)."""
+"""RSL (Russian State Library) search plugin (search.rsl.ru).
+The search API requires a POST to ``/site/ajax-search?language=ru`` with
+form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
+obtained from the main search page. Query syntax is CQL:
+``title:(<title_words>) AND author:(<author_word>)``.
+
+Results come back as an HTML fragment in the ``content`` key of a JSON
+envelope; individual records are identified by the CSS classes
+``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
+Both fields contain ``<b>`` highlight tags that are stripped before returning.
+"""
+
+import re
from typing import Any
import httpx
@@ -9,9 +21,27 @@ from models import CandidateRecord
from ..rate_limiter import RateLimiter
_DOMAIN = "search.rsl.ru"
+_SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
+_BASE_URL = "https://search.rsl.ru/ru/search"
+_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+
+
+def _strip_tags(html_frag: str) -> str:
+ """Strip HTML tags and decode basic entities from a fragment."""
+ text = re.sub(r"<[^>]+>", "", html_frag)
+    text = text.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
+ return re.sub(r"\s+", " ", text).strip()
class RSLPlugin:
+ """Archive searcher for search.rsl.ru.
+
+ Formats the query as CQL ``title:(title_words) AND author:(author_word)``
+ by treating the first whitespace-delimited token as the author surname and
+ the remainder as title keywords. When only one token is present, a plain
+ ``title:(token) OR author:(token)`` query is used instead.
+ """
+
category = "archive_searchers"
def __init__(
@@ -32,28 +62,79 @@ class RSLPlugin:
self.timeout = timeout
def search(self, query: str) -> list[CandidateRecord]:
+ """Search RSL for books matching query.
+
+ Args:
+ query: Free-text string; the first token is treated as the author
+ surname and remaining tokens as title keywords.
+
+ Returns:
+ Up to three CandidateRecord dicts extracted from the RSL HTML
+            response, with ``<b>`` highlight tags stripped.
+ """
self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
- r = httpx.get(
- "https://search.rsl.ru/site/ajax-search",
- params={"language": "ru", "q": query, "page": 1, "perPage": 5},
+
+ cql = self._build_cql(query)
+ client = httpx.Client()
+
+ # Fetch the main page to obtain a valid CSRF token.
+ r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+ csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text)
+ csrf = csrf_match.group(1) if csrf_match else ""
+
+ r = client.post(
+ _SEARCH_URL,
+ params={"language": "ru"},
+ data={"SearchFilterForm[search]": cql, "_csrf": csrf},
timeout=self.timeout,
- headers={"Accept": "application/json"},
+ headers={
+ "Accept": "application/json",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": _BASE_URL,
+ "User-Agent": "Mozilla/5.0",
+ },
)
data: dict[str, Any] = r.json()
- records: list[dict[str, Any]] = data.get("records") or data.get("items") or data.get("data") or []
+ content = str(data.get("content") or "")
+
+        raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3]
+        raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3]
+ years = _YEAR_RE.findall(content)[:3]
+
out: list[CandidateRecord] = []
- for rec in records[:3]:
- title = (str(rec.get("title") or rec.get("name") or "")).strip()
+ for i, raw_title in enumerate(raw_titles):
+ title = _strip_tags(raw_title)
if not title:
continue
+ author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
out.append(
CandidateRecord(
source=self.plugin_id,
title=title,
- author=(str(rec.get("author") or rec.get("authors") or "")).strip(),
- year=str(rec.get("year") or rec.get("pubyear") or "").strip(),
- isbn=(str(rec.get("isbn") or "")).strip(),
- publisher=(str(rec.get("publisher") or "")).strip(),
+ author=author,
+ year=years[i] if i < len(years) else "",
+ isbn="",
+ publisher="",
)
)
return out
+
+ @staticmethod
+ def _build_cql(query: str) -> str:
+ """Build a CQL query string for the RSL search API.
+
+ Args:
+ query: Raw query string, typically ``"Author Title keywords"``.
+
+ Returns:
+ CQL string in the form ``title:(…) AND author:(…)`` when the query
+ contains multiple tokens, or ``title:(…) OR author:(…)`` for a
+ single token.
+ """
+ tokens = query.split()
+ if len(tokens) > 1:
+ author_part = tokens[0]
+ title_part = " ".join(tokens[1:])
+ return f"title:({title_part}) AND author:({author_part})"
+ token = tokens[0] if tokens else query
+ return f"title:({token}) OR author:({token})"
diff --git a/tests/test_archives.py b/tests/test_archives.py
new file mode 100644
index 0000000..c380ead
--- /dev/null
+++ b/tests/test_archives.py
@@ -0,0 +1,189 @@
+"""Network integration tests for archive searcher plugins.
+
+Each test queries a live external service for "War and Peace" by Tolstoy,
+a book universally catalogued in all supported archives.
+
+Run with: pytest tests/ -m network
+Skip with: pytest tests/ -m "not network" (default in presubmit)
+"""
+
+import pytest
+
+from models import CandidateRecord
+from plugins.archives.html_scraper import HtmlScraperPlugin
+from plugins.archives.openlibrary import OpenLibraryPlugin
+from plugins.archives.rsl import RSLPlugin
+from plugins.archives.sru_catalog import SRUCatalogPlugin
+from plugins.rate_limiter import RateLimiter
+
+pytestmark = pytest.mark.network
+
+_RL = RateLimiter()
+_TIMEOUT = 15
+
+
+def _titles(results: list[CandidateRecord]) -> list[str]:
+ return [r["title"] for r in results]
+
+
+def _authors(results: list[CandidateRecord]) -> list[str]:
+ return [r["author"] for r in results]
+
+
+def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
+ """Return True if any result title contains fragment (case-insensitive)."""
+ low = fragment.lower()
+ return any(low in r["title"].lower() for r in results)
+
+
+def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
+ """Return True if any result author contains fragment (case-insensitive)."""
+ low = fragment.lower()
+ return any(low in r["author"].lower() for r in results)
+
+
+# ── OpenLibrary ───────────────────────────────────────────────────────────────
+
+
+def test_openlibrary_war_and_peace() -> None:
+ plugin = OpenLibraryPlugin(
+ plugin_id="openlibrary",
+ name="OpenLibrary",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=True,
+ timeout=_TIMEOUT,
+ config={},
+ )
+ results = plugin.search("War and Peace Tolstoy")
+ assert results, "OpenLibrary returned no results"
+ assert all(r["source"] == "openlibrary" for r in results)
+ assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
+ # OpenLibrary stores authors in their original language; accept both forms.
+ assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
+
+
+# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
+
+
+def test_rsl_voina_i_mir() -> None:
+ plugin = RSLPlugin(
+ plugin_id="rsl",
+ name="РГБ",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=True,
+ timeout=_TIMEOUT,
+ config={},
+ )
+ results = plugin.search("Толстой Война и мир")
+ assert results, "RSL returned no results"
+ assert all(r["source"] == "rsl" for r in results)
+ assert _has_title(results, "война"), f"titles={_titles(results)}"
+
+
+# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
+
+
+def test_rusneb_voina_i_mir() -> None:
+ plugin = HtmlScraperPlugin(
+ plugin_id="rusneb",
+ name="НЭБ",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=True,
+ timeout=_TIMEOUT,
+ config={
+ "url": "https://rusneb.ru/search/",
+ "search_param": "q",
+ "img_alt": True,
+ "author_class": "search-list__item_subtext",
+ },
+ )
+ results = plugin.search("Война и мир Толстой")
+ assert results, "НЭБ returned no results"
+ assert all(r["source"] == "rusneb" for r in results)
+ assert _has_title(results, "война"), f"titles={_titles(results)}"
+ assert _has_author(results, "толст"), f"authors={_authors(results)}"
+
+
+# ── Alib ─────────────────────────────────────────────────────────────────────
+
+
+def test_alib_voina_i_mir() -> None:
+ plugin = HtmlScraperPlugin(
+ plugin_id="alib_web",
+ name="Alib (web)",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=False,
+ timeout=_TIMEOUT,
+ config={
+ "url": "https://www.alib.ru/find3.php4",
+ "search_param": "tfind",
+ "extra_params": {"f": "5", "s": "0"},
+ "encoding": "cp1251",
+ "bold_text": True,
+ },
+ )
+ results = plugin.search("Война и мир Толстой")
+ assert results, "Alib returned no results"
+ assert all(r["source"] == "alib_web" for r in results)
+ assert _has_title(results, "война"), f"titles={_titles(results)}"
+ assert _has_author(results, "толст"), f"authors={_authors(results)}"
+
+
+# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
+# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
+
+
+@pytest.mark.xfail(reason="nlr.ru SRU endpoint no longer available (HTTP 404)", strict=False)
+def test_nlr_voina_i_mir() -> None:
+ plugin = SRUCatalogPlugin(
+ plugin_id="nlr",
+ name="НЛР",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=False,
+ timeout=_TIMEOUT,
+ config={
+ "url": "http://www.nlr.ru/search/query",
+ "query_prefix": "title=",
+ },
+ )
+ results = plugin.search("Война и мир")
+ assert results, "НЛР returned no results"
+ assert all(r["source"] == "nlr" for r in results)
+ assert _has_title(results, "война"), f"titles={_titles(results)}"
+
+
+# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
+# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
+
+
+@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
+def test_shpl_voina_i_mir() -> None:
+ plugin = HtmlScraperPlugin(
+ plugin_id="shpl",
+ name="ШПИЛ",
+ rate_limiter=_RL,
+ rate_limit_seconds=0,
+ auto_queue=False,
+ timeout=_TIMEOUT,
+ config={
+ "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
+ "search_param": "S21ALL",
+ "extra_params": {
+ "C21COM": "S",
+ "I21DBN": "BIBL",
+ "P21DBN": "BIBL",
+ "S21FMT": "briefWebRus",
+ "Z21ID": "",
+ },
+ "brief_class": "brief",
+ },
+ )
+ results = plugin.search("Война и мир")
+ assert results, "ШПИЛ returned no results"
+ assert all(r["source"] == "shpl" for r in results)
+ assert _has_title(results, "война"), f"titles={_titles(results)}"