From fd32be729f531eba12fc36f3338aac1eddd22929 Mon Sep 17 00:00:00 2001 From: Petr Polezhaev Date: Tue, 10 Mar 2026 00:03:17 +0300 Subject: [PATCH] Replace config-driven HtmlScraperPlugin with specific archive classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds. - html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts) - rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates - alib.py (new): AlibPlugin extracts year from within each

entry rather than globally, fixing nonsensical year values - shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl - config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively - plugins/__init__.py: register new specific types, remove html_scraper - tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints Co-Authored-By: Claude Sonnet 4.6 --- config/functions.default.yaml | 23 +--- src/plugins/__init__.py | 8 +- src/plugins/archives/alib.py | 70 ++++++++++ src/plugins/archives/html_scraper.py | 189 ++------------------------- src/plugins/archives/rusneb.py | 64 +++++++++ src/plugins/archives/shpl.py | 63 +++++++++ tests/test_archives.py | 71 ++++++---- 7 files changed, 261 insertions(+), 227 deletions(-) create mode 100644 src/plugins/archives/alib.py create mode 100644 src/plugins/archives/rusneb.py create mode 100644 src/plugins/archives/shpl.py diff --git a/config/functions.default.yaml b/config/functions.default.yaml index 8b2530a..3f35a3a 100644 --- a/config/functions.default.yaml +++ b/config/functions.default.yaml @@ -57,28 +57,17 @@ functions: rusneb: name: "НЭБ" - type: html_scraper + type: rusneb auto_queue: true rate_limit_seconds: 5 timeout: 8 - config: - url: "https://rusneb.ru/search/" - search_param: q - img_alt: true - author_class: "search-list__item_subtext" alib_web: name: "Alib (web)" - type: html_scraper + type: alib_web auto_queue: false rate_limit_seconds: 5 timeout: 8 - config: - url: "https://www.alib.ru/find3.php4" - search_param: tfind - extra_params: {f: "5", s: "0"} - encoding: "cp1251" - bold_text: true nlr: name: "НЛР" @@ -91,13 +80,9 @@ functions: query_prefix: "title=" shpl: + # Endpoint currently returns HTTP 404; retained for future re-enablement. name: "ШПИЛ" - type: html_scraper + type: shpl auto_queue: false rate_limit_seconds: 5 timeout: 8 - config: - url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe" - search_param: S21ALL - extra_params: {C21COM: S, I21DBN: BIBL, P21DBN: BIBL, S21FMT: briefWebRus, Z21ID: ""} - brief_class: "brief" diff --git a/src/plugins/__init__.py b/src/plugins/__init__.py index d56d72e..8c8da2a 100644 --- a/src/plugins/__init__.py +++ b/src/plugins/__init__.py @@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {} # populated lazily on first call def _archive_classes() -> dict[str, Any]: if not _type_to_class: - from .archives.html_scraper import HtmlScraperPlugin + from .archives.alib import AlibPlugin from .archives.openlibrary import OpenLibraryPlugin from .archives.rsl import RSLPlugin + from .archives.rusneb import RusnebPlugin + from .archives.shpl import ShplPlugin from .archives.sru_catalog import SRUCatalogPlugin _type_to_class.update( { "openlibrary": OpenLibraryPlugin, "rsl": RSLPlugin, - "html_scraper": HtmlScraperPlugin, + "rusneb": RusnebPlugin, + "alib_web": AlibPlugin, + "shpl": ShplPlugin, "sru_catalog": SRUCatalogPlugin, } ) diff --git a/src/plugins/archives/alib.py b/src/plugins/archives/alib.py new file mode 100644 index 0000000..5360a1f --- /dev/null +++ b/src/plugins/archives/alib.py @@ -0,0 +1,70 @@ +"""Alib (alib.ru) archive search plugin.""" + +import re +from urllib.parse import quote + +import httpx + +from models import CandidateRecord + +from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin + +_URL = "https://www.alib.ru/find3.php4" +_DOMAIN = "www.alib.ru" +_ENCODING = "cp1251" +_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"} + +# Book entries appear as

Author Title Year Publisher… +_ENTRY_RE = re.compile(r"

([^<]{5,200})") + + +class AlibPlugin(HtmlScraperPlugin): + """Archive searcher for alib.ru. + + Fetches search results with Windows-1251 encoding and extracts book records + from ``

Author Title Year...`` entries. Author surname and initials + are split from the remaining text using a Cyrillic/Latin initial pattern. + Year is extracted from within each entry rather than from the page globally. + """ + + def search(self, query: str) -> list[CandidateRecord]: + """Search Alib for books matching query. + + Args: + query: Free-text search string. + + Returns: + Up to three CandidateRecord dicts with source, title, author, year, + isbn, and publisher fields. + """ + self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) + q_enc = quote(query.encode(_ENCODING, "replace")) + ep: dict[str, str] = dict(_EXTRA_PARAMS) + ep_parts = [f"{k}={quote(str(v).encode(_ENCODING, 'replace'))}" for k, v in ep.items()] + raw_qs = "&".join([f"tfind={q_enc}"] + ep_parts) + r = httpx.get(f"{_URL}?{raw_qs}", timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) + html = r.content.decode(_ENCODING, errors="replace") + + out: list[CandidateRecord] = [] + for entry in _ENTRY_RE.findall(html)[:3]: + text = entry.strip() + year_m = YEAR_RE.search(text) + year = year_m.group(0) if year_m else "" + m = AUTHOR_PREFIX_PAT.match(text) + if m: + author = m.group(1).strip() + title = m.group(2).strip() + else: + author = "" + title = text + out.append( + CandidateRecord( + source=self.plugin_id, + title=title, + author=author, + year=year, + isbn="", + publisher="", + ) + ) + return out diff --git a/src/plugins/archives/html_scraper.py b/src/plugins/archives/html_scraper.py index a3b17d0..b3e1aec 100644 --- a/src/plugins/archives/html_scraper.py +++ b/src/plugins/archives/html_scraper.py @@ -1,27 +1,17 @@ -"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.).""" +"""Base class and shared HTML parsing utilities for archive scraper plugins.""" import re from typing import Any -from urllib.parse import quote, urlparse - -import httpx from models import CandidateRecord from ..rate_limiter import RateLimiter -_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") - -# Matches "Surname I.N. " or "Surname I. " at the start of an entry. -_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL) +YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") +AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL) -def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]: - # Support both single and double-quoted class attributes. - return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<') - - -def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]: +def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]: """Extract text content from elements whose class contains cls_frag. Strips inner HTML tags and normalises whitespace, so elements like @@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = return out -def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]: +def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]: """Extract non-empty alt attributes from tags, normalising whitespace. Args: @@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]: class HtmlScraperPlugin: - """Config-driven HTML scraper. + """Base class for HTML-scraping archive plugins. - Supported config keys: - url — search URL - search_param — query param name - extra_params — dict of fixed extra query parameters - encoding — character encoding for query and response (e.g. "cp1251") - title_class — CSS class fragment for title elements (class-based strategy) - author_class — CSS class fragment for author elements - link_href_pattern — href regex to find title links (link strategy) - brief_class — CSS class for brief record rows (brief strategy) - img_alt — truthy: extract titles from attributes (rusneb strategy) - bold_text — truthy: extract author/title from

blocks (alib strategy) + Handles common initialisation; subclasses implement search() with + site-specific hardcoded logic. The config dict is accepted for + registry compatibility but is not used by the base class; all scraping + details are hardcoded in the subclass. """ category = "archive_searchers" @@ -104,163 +87,15 @@ class HtmlScraperPlugin: self.rate_limit_seconds = rate_limit_seconds self.auto_queue = auto_queue self.timeout = timeout - self.config = config - self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id def search(self, query: str) -> list[CandidateRecord]: """Search for books matching query. Args: - query: Free-text search string (author, title, keywords). + query: Free-text search string. Returns: Up to three CandidateRecord dicts with source, title, author, year, isbn, and publisher fields. """ - cfg = self.config - self._rl.wait_and_record(self._domain, self.rate_limit_seconds) - - encoding = str(cfg.get("encoding") or "") - if encoding: - # Encode query and extra params in the site's native encoding. - q_enc = quote(query.encode(encoding, "replace")) - ep: dict[str, Any] = dict(cfg.get("extra_params") or {}) - ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()] - raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts) - r = httpx.get( - f'{cfg["url"]}?{raw_qs}', - timeout=self.timeout, - headers={"User-Agent": "Mozilla/5.0"}, - ) - html = r.content.decode(encoding, errors="replace") - else: - params: dict[str, Any] = dict(cfg.get("extra_params") or {}) - params[cfg["search_param"]] = query - r = httpx.get( - cfg["url"], - params=params, - timeout=self.timeout, - headers={"User-Agent": "Mozilla/5.0"}, - ) - html = r.text - - years = _YEAR_RE.findall(html) - - if cfg.get("bold_text"): - return self._parse_bold_text(html, years) - if cfg.get("img_alt"): - return self._parse_img_alt(html, years, cfg) - if "link_href_pattern" in cfg: - return self._parse_link(html, years, cfg) - if "brief_class" in cfg: - return self._parse_brief(html, years, cfg) - return self._parse_class(html, years, cfg) - - def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]: - """Extract records from ``

text`` entries (Alib-style). - - The bold text is expected to begin with ``Surname I.N. Title…``; the - author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible. - - Args: - html: Decoded HTML response. - years: Year strings found in the full HTML (used positionally). - - Returns: - Up to three CandidateRecord dicts. - """ - entries = re.findall(r"

([^<]{5,200})", html)[:3] - out: list[CandidateRecord] = [] - for i, entry in enumerate(entries): - text = entry.strip() - m = _AUTHOR_PREFIX_PAT.match(text) - if m: - author = m.group(1).strip() - title = m.group(2).strip() - else: - author = "" - title = text - out.append( - CandidateRecord( - source=self.plugin_id, - title=title, - author=author, - year=years[i] if i < len(years) else "", - isbn="", - publisher="", - ) - ) - return out - - def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: - """Extract records using ```` for titles and a CSS class for authors. - - Used for sites like rusneb.ru where thumbnail alt attributes carry the - book title and a separate span contains the author. - - Args: - html: Decoded HTML response. - years: Year strings found in the full HTML (used positionally). - cfg: Plugin config dict (reads ``author_class``). - - Returns: - Up to three CandidateRecord dicts. - """ - titles = _img_alts(html) - authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80) - return [ - CandidateRecord( - source=self.plugin_id, - title=title, - author=authors[i] if i < len(authors) else "", - year=years[i] if i < len(years) else "", - isbn="", - publisher="", - ) - for i, title in enumerate(titles) - ] - - def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: - titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3] - authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3] - return [ - CandidateRecord( - source=self.plugin_id, - title=title.strip(), - author=authors[i].strip() if i < len(authors) else "", - year=years[i] if i < len(years) else "", - isbn="", - publisher="", - ) - for i, title in enumerate(titles) - ] - - def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: - href_pat = cfg.get("link_href_pattern", r"") - titles = re.findall(rf']+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})', html)[:3] - authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3] - return [ - CandidateRecord( - source=self.plugin_id, - title=title.strip(), - author=authors[i].strip() if i < len(authors) else "", - year=years[i] if i < len(years) else "", - isbn="", - publisher="", - ) - for i, title in enumerate(titles) - ] - - def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: - titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3] - return [ - CandidateRecord( - source=self.plugin_id, - title=t.strip(), - author="", - year=years[i] if i < len(years) else "", - isbn="", - publisher="", - ) - for i, t in enumerate(titles) - ] + raise NotImplementedError diff --git a/src/plugins/archives/rusneb.py b/src/plugins/archives/rusneb.py new file mode 100644 index 0000000..4829a36 --- /dev/null +++ b/src/plugins/archives/rusneb.py @@ -0,0 +1,64 @@ +"""НЭБ (rusneb.ru) archive search plugin.""" + +import re + +import httpx + +from models import CandidateRecord + +from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts + +_URL = "https://rusneb.ru/search/" +_DOMAIN = "rusneb.ru" +_AUTHOR_CLASS = "search-list__item_subtext" + +# Each search result is a

  • whose class contains search-list__item but not a BEM +# child element suffix (which would begin with underscore, e.g. __item_subtext). +_ITEM_RE = re.compile( + r']*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)
  • ', + re.DOTALL, +) + + +class RusnebPlugin(HtmlScraperPlugin): + """Archive searcher for rusneb.ru (НЭБ — Национальная электронная библиотека). + + Extracts book titles from ```` attributes within search result list + items and authors from ``.search-list__item_subtext`` spans. Years are + extracted per list item to avoid picking up unrelated page-level dates. + """ + + def search(self, query: str) -> list[CandidateRecord]: + """Search НЭБ for books matching query. + + Args: + query: Free-text search string. + + Returns: + Up to three CandidateRecord dicts with source, title, author, year, + isbn, and publisher fields. + """ + self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) + r = httpx.get(_URL, params={"q": query}, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) + html = r.text + + out: list[CandidateRecord] = [] + for item_html in _ITEM_RE.findall(html): + alts = img_alts(item_html) + if not alts: + continue + authors = cls_inner_texts(item_html, _AUTHOR_CLASS, 3, 80) + year_m = YEAR_RE.search(item_html) + out.append( + CandidateRecord( + source=self.plugin_id, + title=alts[0], + author=authors[0] if authors else "", + year=year_m.group(0) if year_m else "", + isbn="", + publisher="", + ) + ) + if len(out) == 3: + break + return out diff --git a/src/plugins/archives/shpl.py b/src/plugins/archives/shpl.py new file mode 100644 index 0000000..9ff2d58 --- /dev/null +++ b/src/plugins/archives/shpl.py @@ -0,0 +1,63 @@ +"""ШПИЛ archive search plugin. + +Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin +produces no results. The class is retained so the configuration entry can +be re-enabled if the endpoint is restored. +""" + +import re + +import httpx + +from models import CandidateRecord + +from .html_scraper import YEAR_RE, HtmlScraperPlugin + +_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe" +_DOMAIN = "www.shpl.ru" +_EXTRA_PARAMS: dict[str, str] = { + "C21COM": "S", + "I21DBN": "BIBL", + "P21DBN": "BIBL", + "S21FMT": "briefWebRus", + "Z21ID": "", +} + +_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<') + + +class ShplPlugin(HtmlScraperPlugin): + """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека). + + Extracts brief record entries from elements with class ``brief``. + The remote IRBIS64 CGI endpoint is currently offline (HTTP 404). + """ + + def search(self, query: str) -> list[CandidateRecord]: + """Search ШПИЛ for books matching query. + + Args: + query: Free-text search string. + + Returns: + Up to three CandidateRecord dicts with source, title, author, year, + isbn, and publisher fields. + """ + self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) + params: dict[str, str] = dict(_EXTRA_PARAMS) + params["S21ALL"] = query + r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) + html = r.text + years = YEAR_RE.findall(html) + titles = _BRIEF_RE.findall(html)[:3] + return [ + CandidateRecord( + source=self.plugin_id, + title=t.strip(), + author="", + year=years[i] if i < len(years) else "", + isbn="", + publisher="", + ) + for i, t in enumerate(titles) + ] diff --git a/tests/test_archives.py b/tests/test_archives.py index c380ead..e932c6c 100644 --- a/tests/test_archives.py +++ b/tests/test_archives.py @@ -7,12 +7,16 @@ Run with: pytest tests/ -m network Skip with: pytest tests/ -m "not network" (default in presubmit) """ +import re + import pytest from models import CandidateRecord -from plugins.archives.html_scraper import HtmlScraperPlugin +from plugins.archives.alib import AlibPlugin from plugins.archives.openlibrary import OpenLibraryPlugin from plugins.archives.rsl import RSLPlugin +from plugins.archives.rusneb import RusnebPlugin +from plugins.archives.shpl import ShplPlugin from plugins.archives.sru_catalog import SRUCatalogPlugin from plugins.rate_limiter import RateLimiter @@ -21,6 +25,8 @@ pytestmark = pytest.mark.network _RL = RateLimiter() _TIMEOUT = 15 +_YEAR_PAT = re.compile(r"^\d{4}$") + def _titles(results: list[CandidateRecord]) -> list[str]: return [r["title"] for r in results] @@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]: return [r["author"] for r in results] +def _years(results: list[CandidateRecord]) -> list[str]: + return [r["year"] for r in results] + + def _has_title(results: list[CandidateRecord], fragment: str) -> bool: """Return True if any result title contains fragment (case-insensitive).""" low = fragment.lower() @@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool: return any(low in r["author"].lower() for r in results) +def _valid_year(year: str) -> bool: + """Return True if year is a 4-digit string or empty.""" + return year == "" or bool(_YEAR_PAT.match(year)) + + # ── OpenLibrary ─────────────────────────────────────────────────────────────── @@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None: assert _has_title(results, "war and peace"), f"titles={_titles(results)}" # OpenLibrary stores authors in their original language; accept both forms. assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}" + assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}" + # OpenLibrary returns isbn and publisher from its JSON API. + assert all(isinstance(r["isbn"], str) for r in results) + assert all(isinstance(r["publisher"], str) for r in results) # ── RSL (РГБ) ───────────────────────────────────────────────────────────────── @@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None: assert results, "RSL returned no results" assert all(r["source"] == "rsl" for r in results) assert _has_title(results, "война"), f"titles={_titles(results)}" + assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}" + assert all(r["isbn"] == "" for r in results) + assert all(r["publisher"] == "" for r in results) # ── НЭБ (rusneb) ───────────────────────────────────────────────────────────── def test_rusneb_voina_i_mir() -> None: - plugin = HtmlScraperPlugin( + plugin = RusnebPlugin( plugin_id="rusneb", name="НЭБ", rate_limiter=_RL, rate_limit_seconds=0, auto_queue=True, timeout=_TIMEOUT, - config={ - "url": "https://rusneb.ru/search/", - "search_param": "q", - "img_alt": True, - "author_class": "search-list__item_subtext", - }, + config={}, ) results = plugin.search("Война и мир Толстой") assert results, "НЭБ returned no results" assert all(r["source"] == "rusneb" for r in results) assert _has_title(results, "война"), f"titles={_titles(results)}" assert _has_author(results, "толст"), f"authors={_authors(results)}" + assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}" + assert all(r["isbn"] == "" for r in results) + assert all(r["publisher"] == "" for r in results) # ── Alib ───────────────────────────────────────────────────────────────────── def test_alib_voina_i_mir() -> None: - plugin = HtmlScraperPlugin( + plugin = AlibPlugin( plugin_id="alib_web", name="Alib (web)", rate_limiter=_RL, rate_limit_seconds=0, auto_queue=False, timeout=_TIMEOUT, - config={ - "url": "https://www.alib.ru/find3.php4", - "search_param": "tfind", - "extra_params": {"f": "5", "s": "0"}, - "encoding": "cp1251", - "bold_text": True, - }, + config={}, ) results = plugin.search("Война и мир Толстой") assert results, "Alib returned no results" assert all(r["source"] == "alib_web" for r in results) assert _has_title(results, "война"), f"titles={_titles(results)}" assert _has_author(results, "толст"), f"authors={_authors(results)}" + # Alib entries always include a publication year in the bibliographic text. + assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}" + assert all(r["isbn"] == "" for r in results) + assert all(r["publisher"] == "" for r in results) # ── НЛР (SRU) ──────────────────────────────────────────────────────────────── @@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None: assert results, "НЛР returned no results" assert all(r["source"] == "nlr" for r in results) assert _has_title(results, "война"), f"titles={_titles(results)}" + assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}" + assert all(r["isbn"] == "" for r in results) + assert all(r["publisher"] == "" for r in results) # ── ШПИЛ ───────────────────────────────────────────────────────────────────── @@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None: @pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False) def test_shpl_voina_i_mir() -> None: - plugin = HtmlScraperPlugin( + plugin = ShplPlugin( plugin_id="shpl", name="ШПИЛ", rate_limiter=_RL, rate_limit_seconds=0, auto_queue=False, timeout=_TIMEOUT, - config={ - "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe", - "search_param": "S21ALL", - "extra_params": { - "C21COM": "S", - "I21DBN": "BIBL", - "P21DBN": "BIBL", - "S21FMT": "briefWebRus", - "Z21ID": "", - }, - "brief_class": "brief", - }, + config={}, ) results = plugin.search("Война и мир") assert results, "ШПИЛ returned no results" assert all(r["source"] == "shpl" for r in results) assert _has_title(results, "война"), f"titles={_titles(results)}" + assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}" + assert all(r["isbn"] == "" for r in results) + assert all(r["publisher"] == "" for r in results)