From fd32be729f531eba12fc36f3338aac1eddd22929 Mon Sep 17 00:00:00 2001
From: Petr Polezhaev <petr.polezhaev@ratigorsk-12.ru>
Date: Tue, 10 Mar 2026 00:03:17 +0300
Subject: [PATCH] Replace config-driven HtmlScraperPlugin with specific archive
 classes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 config/functions.default.yaml        |  23 +---
 src/plugins/__init__.py              |   8 +-
 src/plugins/archives/alib.py         |  70 ++++++++++
 src/plugins/archives/html_scraper.py | 189 ++-------------------------
 src/plugins/archives/rusneb.py       |  64 +++++++++
 src/plugins/archives/shpl.py         |  63 +++++++++
 tests/test_archives.py               |  71 ++++++----
 7 files changed, 261 insertions(+), 227 deletions(-)
 create mode 100644 src/plugins/archives/alib.py
 create mode 100644 src/plugins/archives/rusneb.py
 create mode 100644 src/plugins/archives/shpl.py
diff --git a/config/functions.default.yaml b/config/functions.default.yaml
index 8b2530a..3f35a3a 100644
--- a/config/functions.default.yaml
+++ b/config/functions.default.yaml
@@ -57,28 +57,17 @@ functions:
 
     rusneb:
       name: "НЭБ"
-      type: html_scraper
+      type: rusneb
       auto_queue: true
       rate_limit_seconds: 5
       timeout: 8
-      config:
-        url: "https://rusneb.ru/search/"
-        search_param: q
-        img_alt: true
-        author_class: "search-list__item_subtext"
 
     alib_web:
       name: "Alib (web)"
-      type: html_scraper
+      type: alib_web
       auto_queue: false
       rate_limit_seconds: 5
       timeout: 8
-      config:
-        url: "https://www.alib.ru/find3.php4"
-        search_param: tfind
-        extra_params: {f: "5", s: "0"}
-        encoding: "cp1251"
-        bold_text: true
 
     nlr:
       name: "НЛР"
@@ -91,13 +80,9 @@ functions:
         query_prefix: "title="
 
     shpl:
+      # Endpoint currently returns HTTP 404; retained for future re-enablement.
       name: "ШПИЛ"
-      type: html_scraper
+      type: shpl
       auto_queue: false
       rate_limit_seconds: 5
       timeout: 8
-      config:
-        url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
-        search_param: S21ALL
-        extra_params: {C21COM: S, I21DBN: BIBL, P21DBN: BIBL, S21FMT: briefWebRus, Z21ID: ""}
-        brief_class: "brief"
diff --git a/src/plugins/__init__.py b/src/plugins/__init__.py
index d56d72e..8c8da2a 100644
--- a/src/plugins/__init__.py
+++ b/src/plugins/__init__.py
@@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {}  # populated lazily on first call
 
 def _archive_classes() -> dict[str, Any]:
     if not _type_to_class:
-        from .archives.html_scraper import HtmlScraperPlugin
+        from .archives.alib import AlibPlugin
         from .archives.openlibrary import OpenLibraryPlugin
         from .archives.rsl import RSLPlugin
+        from .archives.rusneb import RusnebPlugin
+        from .archives.shpl import ShplPlugin
         from .archives.sru_catalog import SRUCatalogPlugin
 
         _type_to_class.update(
             {
                 "openlibrary": OpenLibraryPlugin,
                 "rsl": RSLPlugin,
-                "html_scraper": HtmlScraperPlugin,
+                "rusneb": RusnebPlugin,
+                "alib_web": AlibPlugin,
+                "shpl": ShplPlugin,
                 "sru_catalog": SRUCatalogPlugin,
             }
         )
diff --git a/src/plugins/archives/alib.py b/src/plugins/archives/alib.py
new file mode 100644
index 0000000..5360a1f
--- /dev/null
+++ b/src/plugins/archives/alib.py
@@ -0,0 +1,70 @@
+"""Alib (alib.ru) archive search plugin."""
+
+import re
+from urllib.parse import quote
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin
+
+_URL = "https://www.alib.ru/find3.php4"
+_DOMAIN = "www.alib.ru"  # key passed to the rate limiter before each request
+_ENCODING = "cp1251"  # used to encode the query string and to decode the response
+_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"}
+
+# Captures the tag-free, 5-200 character text of each <p><b>Author Title Year Publisher…</b> entry.
+_ENTRY_RE = re.compile(r"<p><b>([^<]{5,200})</b>")
+
+
+class AlibPlugin(HtmlScraperPlugin):
+    """Scraper plugin for alib.ru search results.
+
+    The query and the fixed extra parameters are percent-encoded from cp1251
+    bytes, and the response is decoded with the same encoding.  Each
+    ``<p><b>…</b>`` entry yields one record: a leading ``Surname I.N.``
+    prefix becomes the author, the remainder the title, and the year is
+    the first year-like number inside that entry's own text.
+    """
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Query alib.ru and convert the first three entries to records.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            At most three CandidateRecord dicts; ``isbn`` and ``publisher``
+            are always empty and ``year`` is empty when the entry has none.
+        """
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        # Build the query string by hand so values are escaped from cp1251 bytes.
+        pairs = [f"tfind={quote(query.encode(_ENCODING, 'replace'))}"]
+        for key, value in _EXTRA_PARAMS.items():
+            pairs.append(f"{key}={quote(str(value).encode(_ENCODING, 'replace'))}")
+        url = f"{_URL}?{'&'.join(pairs)}"
+        response = httpx.get(
+            url, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}
+        )
+        page = response.content.decode(_ENCODING, errors="replace")
+
+        records: list[CandidateRecord] = []
+        # Only the first three matched entries are converted.
+        for raw in _ENTRY_RE.findall(page)[:3]:
+            entry = raw.strip()
+            year_match = YEAR_RE.search(entry)
+            prefix_match = AUTHOR_PREFIX_PAT.match(entry)
+            # No "Surname I.N." prefix: the whole entry becomes the title.
+            author, title = prefix_match.groups() if prefix_match else ("", entry)
+            records.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=title.strip(),
+                    author=author.strip(),
+                    year=year_match.group(0) if year_match else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+        return records
diff --git a/src/plugins/archives/html_scraper.py b/src/plugins/archives/html_scraper.py
index a3b17d0..b3e1aec 100644
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -1,27 +1,17 @@
-"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
+"""Base class and shared HTML parsing utilities for archive scraper plugins."""
 
 import re
 from typing import Any
-from urllib.parse import quote, urlparse
-
-import httpx
 
 from models import CandidateRecord
 
 from ..rate_limiter import RateLimiter
 
-_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
-
-# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
-_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
+YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
 
 
-def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
-    # Support both single and double-quoted class attributes.
-    return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
-
-
-def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
+def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
     """Extract text content from elements whose class contains cls_frag.
 
     Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
     return out
 
 
-def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
+def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
     """Extract non-empty alt attributes from <img> tags, normalising whitespace.
 
     Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
 
 
 class HtmlScraperPlugin:
-    """Config-driven HTML scraper.
+    """Base class for HTML-scraping archive plugins.
 
-    Supported config keys:
-      url               — search URL
-      search_param      — query param name
-      extra_params      — dict of fixed extra query parameters
-      encoding          — character encoding for query and response (e.g. "cp1251")
-      title_class       — CSS class fragment for title elements (class-based strategy)
-      author_class      — CSS class fragment for author elements
-      link_href_pattern — href regex to find title <a> links (link strategy)
-      brief_class       — CSS class for brief record rows (brief strategy)
-      img_alt           — truthy: extract titles from <img alt> attributes (rusneb strategy)
-      bold_text         — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
+    Handles common initialisation; subclasses implement search() with
+    site-specific hardcoded logic.  The config dict is accepted for
+    registry compatibility but is not used by the base class; all scraping
+    details are hardcoded in the subclass.
     """
 
     category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
         self.rate_limit_seconds = rate_limit_seconds
         self.auto_queue = auto_queue
         self.timeout = timeout
-        self.config = config
-        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
 
     def search(self, query: str) -> list[CandidateRecord]:
         """Search for books matching query.
 
         Args:
-            query: Free-text search string (author, title, keywords).
+            query: Free-text search string.
 
         Returns:
             Up to three CandidateRecord dicts with source, title, author, year,
             isbn, and publisher fields.
         """
-        cfg = self.config
-        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-
-        encoding = str(cfg.get("encoding") or "")
-        if encoding:
-            # Encode query and extra params in the site's native encoding.
-            q_enc = quote(query.encode(encoding, "replace"))
-            ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
-            raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
-            r = httpx.get(
-                f'{cfg["url"]}?{raw_qs}',
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.content.decode(encoding, errors="replace")
-        else:
-            params: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            params[cfg["search_param"]] = query
-            r = httpx.get(
-                cfg["url"],
-                params=params,
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.text
-
-        years = _YEAR_RE.findall(html)
-
-        if cfg.get("bold_text"):
-            return self._parse_bold_text(html, years)
-        if cfg.get("img_alt"):
-            return self._parse_img_alt(html, years, cfg)
-        if "link_href_pattern" in cfg:
-            return self._parse_link(html, years, cfg)
-        if "brief_class" in cfg:
-            return self._parse_brief(html, years, cfg)
-        return self._parse_class(html, years, cfg)
-
-    def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
-        """Extract records from ``<p><b>text</b>`` entries (Alib-style).
-
-        The bold text is expected to begin with ``Surname I.N. Title…``; the
-        author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
-        out: list[CandidateRecord] = []
-        for i, entry in enumerate(entries):
-            text = entry.strip()
-            m = _AUTHOR_PREFIX_PAT.match(text)
-            if m:
-                author = m.group(1).strip()
-                title = m.group(2).strip()
-            else:
-                author = ""
-                title = text
-            out.append(
-                CandidateRecord(
-                    source=self.plugin_id,
-                    title=title,
-                    author=author,
-                    year=years[i] if i < len(years) else "",
-                    isbn="",
-                    publisher="",
-                )
-            )
-        return out
-
-    def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        """Extract records using ``<img alt>`` for titles and a CSS class for authors.
-
-        Used for sites like rusneb.ru where thumbnail alt attributes carry the
-        book title and a separate span contains the author.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-            cfg: Plugin config dict (reads ``author_class``).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        titles = _img_alts(html)
-        authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title,
-                author=authors[i] if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        href_pat = cfg.get("link_href_pattern", r"")
-        titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=t.strip(),
-                author="",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, t in enumerate(titles)
-        ]
+        raise NotImplementedError
diff --git a/src/plugins/archives/rusneb.py b/src/plugins/archives/rusneb.py
new file mode 100644
index 0000000..4829a36
--- /dev/null
+++ b/src/plugins/archives/rusneb.py
@@ -0,0 +1,64 @@
+"""НЭБ (rusneb.ru) archive search plugin."""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
+
+_URL = "https://rusneb.ru/search/"
+_DOMAIN = "rusneb.ru"  # key passed to the rate limiter before each request
+_AUTHOR_CLASS = "search-list__item_subtext"  # class fragment of the per-item author text
+
+# Each search result is a <li> whose class attribute contains search-list__item that is
+# not immediately followed by an underscore (so search-list__item_subtext alone never matches).
+_ITEM_RE = re.compile(
+    r'<li[^>]*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)</li>',
+    re.DOTALL,  # let "." cross newlines inside an item; the lazy match ends at the first </li>
+)
+
+
+class RusnebPlugin(HtmlScraperPlugin):
+    """Scraper plugin for rusneb.ru (НЭБ, the National Electronic Library).
+
+    Every ``<li>`` search result is parsed on its own: the first ``<img alt>`` text
+    is the title (items without one are skipped), the first ``search-list__item_subtext``
+    text is the author, and the first year-like number in the item is the year.
+    """
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Query rusneb.ru and convert the first three usable list items to records.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            At most three CandidateRecord dicts; ``isbn`` and ``publisher`` are
+            always empty, ``author`` and ``year`` are empty when not found.
+        """
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        response = httpx.get(
+            _URL, params={"q": query}, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}
+        )
+
+        records: list[CandidateRecord] = []
+        for fragment in _ITEM_RE.findall(response.text):
+            if len(records) >= 3:
+                break
+            titles = img_alts(fragment)
+            if titles:
+                names = cls_inner_texts(fragment, _AUTHOR_CLASS, 3, 80)
+                year_match = YEAR_RE.search(fragment)
+                records.append(
+                    CandidateRecord(
+                        source=self.plugin_id,
+                        title=titles[0],
+                        author=names[0] if names else "",
+                        year=year_match.group(0) if year_match else "",
+                        isbn="",
+                        publisher="",
+                    )
+                )
+        return records
diff --git a/src/plugins/archives/shpl.py b/src/plugins/archives/shpl.py
new file mode 100644
index 0000000..9ff2d58
--- /dev/null
+++ b/src/plugins/archives/shpl.py
@@ -0,0 +1,63 @@
+"""ШПИЛ archive search plugin.
+
+Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
+produces no results.  The class is retained so the configuration entry can
+be re-enabled if the endpoint is restored.
+"""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import YEAR_RE, HtmlScraperPlugin
+
+_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
+_DOMAIN = "www.shpl.ru"  # key passed to the rate limiter before each request
+_EXTRA_PARAMS: dict[str, str] = {  # fixed query parameters sent with every search
+    "C21COM": "S",
+    "I21DBN": "BIBL",
+    "P21DBN": "BIBL",
+    "S21FMT": "briefWebRus",
+    "Z21ID": "",
+}
+
+_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')  # class value must be exactly "brief"
+
+
+class ShplPlugin(HtmlScraperPlugin):
+    """Archive searcher for shpl.ru (ШПИЛ, the State Public Historical Library).
+
+    Extracts brief record entries from elements with class ``brief``; each year
+    is read from its own entry's text.  The remote IRBIS64 CGI endpoint is
+    currently offline (HTTP 404).
+    """
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Search ШПИЛ for books matching query.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            Up to three CandidateRecord dicts; only ``source``, ``title`` and (when found) ``year`` are set.
+        """
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        params: dict[str, str] = {**_EXTRA_PARAMS, "S21ALL": query}
+        r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+        out: list[CandidateRecord] = []
+        for raw in _BRIEF_RE.findall(r.text)[:3]:
+            # Use the entry's own year; page-wide years paired by index are unrelated.
+            year_m = YEAR_RE.search(raw)
+            out.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=raw.strip(),
+                    author="",
+                    year=year_m.group(0) if year_m else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+        return out
diff --git a/tests/test_archives.py b/tests/test_archives.py
index c380ead..e932c6c 100644
--- a/tests/test_archives.py
+++ b/tests/test_archives.py
@@ -7,12 +7,16 @@ Run with:  pytest tests/ -m network
 Skip with: pytest tests/ -m "not network"  (default in presubmit)
 """
 
+import re
+
 import pytest
 
 from models import CandidateRecord
-from plugins.archives.html_scraper import HtmlScraperPlugin
+from plugins.archives.alib import AlibPlugin
 from plugins.archives.openlibrary import OpenLibraryPlugin
 from plugins.archives.rsl import RSLPlugin
+from plugins.archives.rusneb import RusnebPlugin
+from plugins.archives.shpl import ShplPlugin
 from plugins.archives.sru_catalog import SRUCatalogPlugin
 from plugins.rate_limiter import RateLimiter
 
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
 _RL = RateLimiter()
 _TIMEOUT = 15
 
+_YEAR_PAT = re.compile(r"^\d{4}$")
+
 
 def _titles(results: list[CandidateRecord]) -> list[str]:
     return [r["title"] for r in results]
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
     return [r["author"] for r in results]
 
 
+def _years(results: list[CandidateRecord]) -> list[str]:
+    return [record["year"] for record in results]
+
+
 def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
     """Return True if any result title contains fragment (case-insensitive)."""
     low = fragment.lower()
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
     return any(low in r["author"].lower() for r in results)
 
 
+def _valid_year(year: str) -> bool:
+    """Return True if year is empty or exactly four digits (a trailing newline is rejected)."""
+    return year == "" or _YEAR_PAT.fullmatch(year) is not None
+
+
 # ── OpenLibrary ───────────────────────────────────────────────────────────────
 
 
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
     assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
     # OpenLibrary stores authors in their original language; accept both forms.
     assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    # OpenLibrary returns isbn and publisher from its JSON API.
+    assert all(isinstance(r["isbn"], str) for r in results)
+    assert all(isinstance(r["publisher"], str) for r in results)
 
 
 # ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
     assert results, "RSL returned no results"
     assert all(r["source"] == "rsl" for r in results)
     assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)
 
 
 # ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
 
 
 def test_rusneb_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = RusnebPlugin(
         plugin_id="rusneb",
         name="НЭБ",
         rate_limiter=_RL,
         rate_limit_seconds=0,
         auto_queue=True,
         timeout=_TIMEOUT,
-        config={
-            "url": "https://rusneb.ru/search/",
-            "search_param": "q",
-            "img_alt": True,
-            "author_class": "search-list__item_subtext",
-        },
+        config={},
     )
     results = plugin.search("Война и мир Толстой")
     assert results, "НЭБ returned no results"
     assert all(r["source"] == "rusneb" for r in results)
     assert _has_title(results, "война"), f"titles={_titles(results)}"
     assert _has_author(results, "толст"), f"authors={_authors(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)
 
 
 # ── Alib ─────────────────────────────────────────────────────────────────────
 
 
 def test_alib_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = AlibPlugin(
         plugin_id="alib_web",
         name="Alib (web)",
         rate_limiter=_RL,
         rate_limit_seconds=0,
         auto_queue=False,
         timeout=_TIMEOUT,
-        config={
-            "url": "https://www.alib.ru/find3.php4",
-            "search_param": "tfind",
-            "extra_params": {"f": "5", "s": "0"},
-            "encoding": "cp1251",
-            "bold_text": True,
-        },
+        config={},
     )
     results = plugin.search("Война и мир Толстой")
     assert results, "Alib returned no results"
     assert all(r["source"] == "alib_web" for r in results)
     assert _has_title(results, "война"), f"titles={_titles(results)}"
     assert _has_author(results, "толст"), f"authors={_authors(results)}"
+    # Alib entries always include a publication year in the bibliographic text.
+    assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)
 
 
 # ── НЛР (SRU) ────────────────────────────────────────────────────────────────
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
     assert results, "НЛР returned no results"
     assert all(r["source"] == "nlr" for r in results)
     assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)
 
 
 # ── ШПИЛ ─────────────────────────────────────────────────────────────────────
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
 
 @pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
 def test_shpl_voina_i_mir() -> None:
-    plugin = HtmlScraperPlugin(
+    plugin = ShplPlugin(
         plugin_id="shpl",
         name="ШПИЛ",
         rate_limiter=_RL,
         rate_limit_seconds=0,
         auto_queue=False,
         timeout=_TIMEOUT,
-        config={
-            "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
-            "search_param": "S21ALL",
-            "extra_params": {
-                "C21COM": "S",
-                "I21DBN": "BIBL",
-                "P21DBN": "BIBL",
-                "S21FMT": "briefWebRus",
-                "Z21ID": "",
-            },
-            "brief_class": "brief",
-        },
+        config={},
     )
     results = plugin.search("Война и мир")
     assert results, "ШПИЛ returned no results"
     assert all(r["source"] == "shpl" for r in results)
     assert _has_title(results, "война"), f"titles={_titles(results)}"
+    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+    assert all(r["isbn"] == "" for r in results)
+    assert all(r["publisher"] == "" for r in results)