diff --git a/config/functions.default.yaml b/config/functions.default.yaml index 8b2530a..3f35a3a 100644 --- a/config/functions.default.yaml +++ b/config/functions.default.yaml @@ -57,28 +57,17 @@ functions: rusneb: name: "НЭБ" - type: html_scraper + type: rusneb auto_queue: true rate_limit_seconds: 5 timeout: 8 - config: - url: "https://rusneb.ru/search/" - search_param: q - img_alt: true - author_class: "search-list__item_subtext" alib_web: name: "Alib (web)" - type: html_scraper + type: alib_web auto_queue: false rate_limit_seconds: 5 timeout: 8 - config: - url: "https://www.alib.ru/find3.php4" - search_param: tfind - extra_params: {f: "5", s: "0"} - encoding: "cp1251" - bold_text: true nlr: name: "НЛР" @@ -91,13 +80,9 @@ functions: query_prefix: "title=" shpl: + # Endpoint currently returns HTTP 404; retained for future re-enablement. name: "ШПИЛ" - type: html_scraper + type: shpl auto_queue: false rate_limit_seconds: 5 timeout: 8 - config: - url: "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe" - search_param: S21ALL - extra_params: {C21COM: S, I21DBN: BIBL, P21DBN: BIBL, S21FMT: briefWebRus, Z21ID: ""} - brief_class: "brief" diff --git a/src/plugins/__init__.py b/src/plugins/__init__.py index d56d72e..8c8da2a 100644 --- a/src/plugins/__init__.py +++ b/src/plugins/__init__.py @@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {} # populated lazily on first call def _archive_classes() -> dict[str, Any]: if not _type_to_class: - from .archives.html_scraper import HtmlScraperPlugin + from .archives.alib import AlibPlugin from .archives.openlibrary import OpenLibraryPlugin from .archives.rsl import RSLPlugin + from .archives.rusneb import RusnebPlugin + from .archives.shpl import ShplPlugin from .archives.sru_catalog import SRUCatalogPlugin _type_to_class.update( { "openlibrary": OpenLibraryPlugin, "rsl": RSLPlugin, - "html_scraper": HtmlScraperPlugin, + "rusneb": RusnebPlugin, + "alib_web": AlibPlugin, + "shpl": ShplPlugin, "sru_catalog": SRUCatalogPlugin, 
} ) diff --git a/src/plugins/archives/alib.py b/src/plugins/archives/alib.py new file mode 100644 index 0000000..5360a1f --- /dev/null +++ b/src/plugins/archives/alib.py @@ -0,0 +1,70 @@ +"""Alib (alib.ru) archive search plugin.""" + +import re +from urllib.parse import quote + +import httpx + +from models import CandidateRecord + +from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin + +_URL = "https://www.alib.ru/find3.php4" +_DOMAIN = "www.alib.ru" +_ENCODING = "cp1251" +_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"} + +# Book entries appear as
<b>Author Title Year Publisher… +_ENTRY_RE = re.compile(r"<b>
([^<]{5,200})") + + +class AlibPlugin(HtmlScraperPlugin): + """Archive searcher for alib.ru. + + Fetches search results with Windows-1251 encoding and extracts book records + from ``<b>
Author Title Year...`` entries. Author surname and initials
+ are split from the remaining text using a Cyrillic/Latin initial pattern.
+ Year is extracted from within each entry rather than from the page globally.
+ """
+
+ def search(self, query: str) -> list[CandidateRecord]:
+ """Search Alib for books matching query.
+
+ Args:
+ query: Free-text search string.
+
+ Returns:
+ Up to three CandidateRecord dicts with source, title, author, year,
+ isbn, and publisher fields.
+ """
+ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+ q_enc = quote(query.encode(_ENCODING, "replace"))
+ ep: dict[str, str] = dict(_EXTRA_PARAMS)
+ ep_parts = [f"{k}={quote(str(v).encode(_ENCODING, 'replace'))}" for k, v in ep.items()]
+ raw_qs = "&".join([f"tfind={q_enc}"] + ep_parts)
+ r = httpx.get(f"{_URL}?{raw_qs}", timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+ html = r.content.decode(_ENCODING, errors="replace")
+
+ out: list[CandidateRecord] = []
+ for entry in _ENTRY_RE.findall(html)[:3]:
+ text = entry.strip()
+ year_m = YEAR_RE.search(text)
+ year = year_m.group(0) if year_m else ""
+ m = AUTHOR_PREFIX_PAT.match(text)
+ if m:
+ author = m.group(1).strip()
+ title = m.group(2).strip()
+ else:
+ author = ""
+ title = text
+ out.append(
+ CandidateRecord(
+ source=self.plugin_id,
+ title=title,
+ author=author,
+ year=year,
+ isbn="",
+ publisher="",
+ )
+ )
+ return out
diff --git a/src/plugins/archives/html_scraper.py b/src/plugins/archives/html_scraper.py
index a3b17d0..b3e1aec 100644
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -1,27 +1,17 @@
-"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
+"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
-from urllib.parse import quote, urlparse
-
-import httpx
from models import CandidateRecord
from ..rate_limiter import RateLimiter
-_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
-
-# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
-_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
+YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
-def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
- # Support both single and double-quoted class attributes.
- return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
-
-
-def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
+def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
"""Extract text content from elements whose class contains cls_frag.
Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
return out
-def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
+def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
"""Extract non-empty alt attributes from … blocks (alib strategy)
+ Handles common initialisation; subclasses implement search() with
+ site-specific hardcoded logic. The config dict is accepted for
+ registry compatibility but is not used by the base class; all scraping
+ details are hardcoded in the subclass.
"""
category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
self.rate_limit_seconds = rate_limit_seconds
self.auto_queue = auto_queue
self.timeout = timeout
- self.config = config
- self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
"""Search for books matching query.
Args:
- query: Free-text search string (author, title, keywords).
+ query: Free-text search string.
Returns:
Up to three CandidateRecord dicts with source, title, author, year,
isbn, and publisher fields.
"""
- cfg = self.config
- self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-
- encoding = str(cfg.get("encoding") or "")
- if encoding:
- # Encode query and extra params in the site's native encoding.
- q_enc = quote(query.encode(encoding, "replace"))
- ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
- ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
- raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
- r = httpx.get(
- f'{cfg["url"]}?{raw_qs}',
- timeout=self.timeout,
- headers={"User-Agent": "Mozilla/5.0"},
- )
- html = r.content.decode(encoding, errors="replace")
- else:
- params: dict[str, Any] = dict(cfg.get("extra_params") or {})
- params[cfg["search_param"]] = query
- r = httpx.get(
- cfg["url"],
- params=params,
- timeout=self.timeout,
- headers={"User-Agent": "Mozilla/5.0"},
- )
- html = r.text
-
- years = _YEAR_RE.findall(html)
-
- if cfg.get("bold_text"):
- return self._parse_bold_text(html, years)
- if cfg.get("img_alt"):
- return self._parse_img_alt(html, years, cfg)
- if "link_href_pattern" in cfg:
- return self._parse_link(html, years, cfg)
- if "brief_class" in cfg:
- return self._parse_brief(html, years, cfg)
- return self._parse_class(html, years, cfg)
-
- def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
- """Extract records from `` text`` entries (Alib-style).
-
- The bold text is expected to begin with ``Surname I.N. Title…``; the
- author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
-
- Args:
- html: Decoded HTML response.
- years: Year strings found in the full HTML (used positionally).
-
- Returns:
- Up to three CandidateRecord dicts.
- """
- entries = re.findall(r" ([^<]{5,200})", html)[:3]
- out: list[CandidateRecord] = []
- for i, entry in enumerate(entries):
- text = entry.strip()
- m = _AUTHOR_PREFIX_PAT.match(text)
- if m:
- author = m.group(1).strip()
- title = m.group(2).strip()
- else:
- author = ""
- title = text
- out.append(
- CandidateRecord(
- source=self.plugin_id,
- title=title,
- author=author,
- year=years[i] if i < len(years) else "",
- isbn="",
- publisher="",
- )
- )
- return out
-
- def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
- """Extract records using `` tags, normalising whitespace.
Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
class HtmlScraperPlugin:
- """Config-driven HTML scraper.
+ """Base class for HTML-scraping archive plugins.
- Supported config keys:
- url — search URL
- search_param — query param name
- extra_params — dict of fixed extra query parameters
- encoding — character encoding for query and response (e.g. "cp1251")
- title_class — CSS class fragment for title elements (class-based strategy)
- author_class — CSS class fragment for author elements
- link_href_pattern — href regex to find title links (link strategy)
- brief_class — CSS class for brief record rows (brief strategy)
- img_alt — truthy: extract titles from <img alt>
attributes (rusneb strategy)
- bold_text — truthy: extract author/title from
`` for titles and a CSS class for authors.
-
- Used for sites like rusneb.ru where thumbnail alt attributes carry the
- book title and a separate span contains the author.
-
- Args:
- html: Decoded HTML response.
- years: Year strings found in the full HTML (used positionally).
- cfg: Plugin config dict (reads ``author_class``).
-
- Returns:
- Up to three CandidateRecord dicts.
- """
- titles = _img_alts(html)
- authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
- return [
- CandidateRecord(
- source=self.plugin_id,
- title=title,
- author=authors[i] if i < len(authors) else "",
- year=years[i] if i < len(years) else "",
- isbn="",
- publisher="",
- )
- for i, title in enumerate(titles)
- ]
-
- def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
- titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
- authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
- return [
- CandidateRecord(
- source=self.plugin_id,
- title=title.strip(),
- author=authors[i].strip() if i < len(authors) else "",
- year=years[i] if i < len(years) else "",
- isbn="",
- publisher="",
- )
- for i, title in enumerate(titles)
- ]
-
- def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
- href_pat = cfg.get("link_href_pattern", r"")
- titles = re.findall(rf']+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})', html)[:3]
- authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
- return [
- CandidateRecord(
- source=self.plugin_id,
- title=title.strip(),
- author=authors[i].strip() if i < len(authors) else "",
- year=years[i] if i < len(years) else "",
- isbn="",
- publisher="",
- )
- for i, title in enumerate(titles)
- ]
-
- def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
- titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
- return [
- CandidateRecord(
- source=self.plugin_id,
- title=t.strip(),
- author="",
- year=years[i] if i < len(years) else "",
- isbn="",
- publisher="",
- )
- for i, t in enumerate(titles)
- ]
+ raise NotImplementedError
diff --git a/src/plugins/archives/rusneb.py b/src/plugins/archives/rusneb.py
new file mode 100644
index 0000000..4829a36
--- /dev/null
+++ b/src/plugins/archives/rusneb.py
@@ -0,0 +1,64 @@
+"""НЭБ (rusneb.ru) archive search plugin."""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
+
+_URL = "https://rusneb.ru/search/"
+_DOMAIN = "rusneb.ru"
+_AUTHOR_CLASS = "search-list__item_subtext"
+
+# Each search result is a <li class="search-list__item"> block; titles live in <img alt="…"> tags. +_ITEM_RE = re.compile(r'<li[^>]*class="[^"]*search-list__item[^"]*"[^>]*>(.*?)</li>', re.DOTALL) + + +class RusnebPlugin(HtmlScraperPlugin): + """Archive searcher for rusneb.ru (НЭБ). + + Extracts titles from ``<img alt>
`` attributes within search result list
+ items and authors from ``.search-list__item_subtext`` spans. Years are
+ extracted per list item to avoid picking up unrelated page-level dates.
+ """
+
+ def search(self, query: str) -> list[CandidateRecord]:
+ """Search НЭБ for books matching query.
+
+ Args:
+ query: Free-text search string.
+
+ Returns:
+ Up to three CandidateRecord dicts with source, title, author, year,
+ isbn, and publisher fields.
+ """
+ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+ r = httpx.get(_URL, params={"q": query}, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+ html = r.text
+
+ out: list[CandidateRecord] = []
+ for item_html in _ITEM_RE.findall(html):
+ alts = img_alts(item_html)
+ if not alts:
+ continue
+ authors = cls_inner_texts(item_html, _AUTHOR_CLASS, 3, 80)
+ year_m = YEAR_RE.search(item_html)
+ out.append(
+ CandidateRecord(
+ source=self.plugin_id,
+ title=alts[0],
+ author=authors[0] if authors else "",
+ year=year_m.group(0) if year_m else "",
+ isbn="",
+ publisher="",
+ )
+ )
+ if len(out) == 3:
+ break
+ return out
diff --git a/src/plugins/archives/shpl.py b/src/plugins/archives/shpl.py
new file mode 100644
index 0000000..9ff2d58
--- /dev/null
+++ b/src/plugins/archives/shpl.py
@@ -0,0 +1,63 @@
+"""ШПИЛ archive search plugin.
+
+Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
+produces no results. The class is retained so the configuration entry can
+be re-enabled if the endpoint is restored.
+"""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import YEAR_RE, HtmlScraperPlugin
+
+_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
+_DOMAIN = "www.shpl.ru"
+_EXTRA_PARAMS: dict[str, str] = {
+ "C21COM": "S",
+ "I21DBN": "BIBL",
+ "P21DBN": "BIBL",
+ "S21FMT": "briefWebRus",
+ "Z21ID": "",
+}
+
+_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
+
+
+class ShplPlugin(HtmlScraperPlugin):
+ """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека).
+
+ Extracts brief record entries from elements with class ``brief``.
+ The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
+ """
+
+ def search(self, query: str) -> list[CandidateRecord]:
+ """Search ШПИЛ for books matching query.
+
+ Args:
+ query: Free-text search string.
+
+ Returns:
+ Up to three CandidateRecord dicts with source, title, author, year,
+ isbn, and publisher fields.
+ """
+ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+ params: dict[str, str] = dict(_EXTRA_PARAMS)
+ params["S21ALL"] = query
+ r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+ html = r.text
+ years = YEAR_RE.findall(html)
+ titles = _BRIEF_RE.findall(html)[:3]
+ return [
+ CandidateRecord(
+ source=self.plugin_id,
+ title=t.strip(),
+ author="",
+ year=years[i] if i < len(years) else "",
+ isbn="",
+ publisher="",
+ )
+ for i, t in enumerate(titles)
+ ]
diff --git a/tests/test_archives.py b/tests/test_archives.py
index c380ead..e932c6c 100644
--- a/tests/test_archives.py
+++ b/tests/test_archives.py
@@ -7,12 +7,16 @@ Run with: pytest tests/ -m network
Skip with: pytest tests/ -m "not network" (default in presubmit)
"""
+import re
+
import pytest
from models import CandidateRecord
-from plugins.archives.html_scraper import HtmlScraperPlugin
+from plugins.archives.alib import AlibPlugin
from plugins.archives.openlibrary import OpenLibraryPlugin
from plugins.archives.rsl import RSLPlugin
+from plugins.archives.rusneb import RusnebPlugin
+from plugins.archives.shpl import ShplPlugin
from plugins.archives.sru_catalog import SRUCatalogPlugin
from plugins.rate_limiter import RateLimiter
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
_RL = RateLimiter()
_TIMEOUT = 15
+_YEAR_PAT = re.compile(r"^\d{4}$")
+
def _titles(results: list[CandidateRecord]) -> list[str]:
return [r["title"] for r in results]
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
return [r["author"] for r in results]
+def _years(results: list[CandidateRecord]) -> list[str]:
+ return [r["year"] for r in results]
+
+
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
"""Return True if any result title contains fragment (case-insensitive)."""
low = fragment.lower()
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
return any(low in r["author"].lower() for r in results)
+def _valid_year(year: str) -> bool:
+ """Return True if year is a 4-digit string or empty."""
+ return year == "" or bool(_YEAR_PAT.match(year))
+
+
# ── OpenLibrary ───────────────────────────────────────────────────────────────
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
# OpenLibrary stores authors in their original language; accept both forms.
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
+ assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+ # OpenLibrary returns isbn and publisher from its JSON API.
+ assert all(isinstance(r["isbn"], str) for r in results)
+ assert all(isinstance(r["publisher"], str) for r in results)
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
assert results, "RSL returned no results"
assert all(r["source"] == "rsl" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
+ assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+ assert all(r["isbn"] == "" for r in results)
+ assert all(r["publisher"] == "" for r in results)
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
def test_rusneb_voina_i_mir() -> None:
- plugin = HtmlScraperPlugin(
+ plugin = RusnebPlugin(
plugin_id="rusneb",
name="НЭБ",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=True,
timeout=_TIMEOUT,
- config={
- "url": "https://rusneb.ru/search/",
- "search_param": "q",
- "img_alt": True,
- "author_class": "search-list__item_subtext",
- },
+ config={},
)
results = plugin.search("Война и мир Толстой")
assert results, "НЭБ returned no results"
assert all(r["source"] == "rusneb" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert _has_author(results, "толст"), f"authors={_authors(results)}"
+ assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+ assert all(r["isbn"] == "" for r in results)
+ assert all(r["publisher"] == "" for r in results)
# ── Alib ─────────────────────────────────────────────────────────────────────
def test_alib_voina_i_mir() -> None:
- plugin = HtmlScraperPlugin(
+ plugin = AlibPlugin(
plugin_id="alib_web",
name="Alib (web)",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=False,
timeout=_TIMEOUT,
- config={
- "url": "https://www.alib.ru/find3.php4",
- "search_param": "tfind",
- "extra_params": {"f": "5", "s": "0"},
- "encoding": "cp1251",
- "bold_text": True,
- },
+ config={},
)
results = plugin.search("Война и мир Толстой")
assert results, "Alib returned no results"
assert all(r["source"] == "alib_web" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
assert _has_author(results, "толст"), f"authors={_authors(results)}"
+ # Alib entries always include a publication year in the bibliographic text.
+ assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
+ assert all(r["isbn"] == "" for r in results)
+ assert all(r["publisher"] == "" for r in results)
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
assert results, "НЛР returned no results"
assert all(r["source"] == "nlr" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
+ assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+ assert all(r["isbn"] == "" for r in results)
+ assert all(r["publisher"] == "" for r in results)
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
- plugin = HtmlScraperPlugin(
+ plugin = ShplPlugin(
plugin_id="shpl",
name="ШПИЛ",
rate_limiter=_RL,
rate_limit_seconds=0,
auto_queue=False,
timeout=_TIMEOUT,
- config={
- "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
- "search_param": "S21ALL",
- "extra_params": {
- "C21COM": "S",
- "I21DBN": "BIBL",
- "P21DBN": "BIBL",
- "S21FMT": "briefWebRus",
- "Z21ID": "",
- },
- "brief_class": "brief",
- },
+ config={},
)
results = plugin.search("Война и мир")
assert results, "ШПИЛ returned no results"
assert all(r["source"] == "shpl" for r in results)
assert _has_title(results, "война"), f"titles={_titles(results)}"
+ assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
+ assert all(r["isbn"] == "" for r in results)
+ assert all(r["publisher"] == "" for r in results)