Files
bookshelf/src/plugins/archives/html_scraper.py
Petr Polezhaev fd32be729f Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00

102 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Four-digit year anywhere in text: 1000-1999, or 2000-2029.
# NOTE(review): the second alternative caps at 2029 — will need widening
# for publications dated 2030 onward; confirm that is acceptable.
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Splits "Surname I.N. Rest..." into (surname + 1-3 initials, remainder).
# Each initial is a single Cyrillic or Latin capital followed by a dot;
# DOTALL lets the remainder span line breaks.
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Strips inner HTML tags and normalises whitespace, so elements like
    ``<span class=''><b>Name</b> I.N.</span>`` work correctly.

    Args:
        html: Raw HTML string to search.
        cls_frag: Substring that must appear in the class attribute value.
        min_len: Minimum length of extracted text to keep.
        max_len: Maximum length of extracted text to keep.

    Returns:
        Up to three non-empty text strings in document order.
    """
    # Capture the tag name and require the matching closing tag (\1) so that
    # nested inline markup (e.g. <b>...</b>) inside the element does not
    # truncate the capture at the first "</", as the previous pattern
    # ("...>(.*?)</") did — that bug dropped everything after the first
    # nested child, e.g. "<b>Name</b> I.N." yielded only "Name".
    pattern = rf'<(\w+)[^>]*class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)</\1>'
    out: list[str] = []
    for _tag, inner in re.findall(pattern, html, re.DOTALL):
        text = re.sub(r"<[^>]+>", "", inner)  # drop nested tags, keep their text
        text = re.sub(r"\s+", " ", text).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
        if len(out) == 3:  # three candidates is the cap for scraper results
            break
    return out
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Collect alt-text strings from ``<img>`` tags in an HTML document.

    Args:
        html: Raw HTML string to search.
        min_len: Minimum character length to include.
        max_len: Maximum character length to include.

    Returns:
        Up to three non-empty, whitespace-normalised alt strings, in
        document order.
    """
    results: list[str] = []
    for raw_alt in re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html):
        if len(results) == 3:  # scraper results are capped at three
            break
        # Collapse internal runs of whitespace and trim the ends.
        cleaned = re.sub(r"\s+", " ", raw_alt).strip()
        if min_len <= len(cleaned) <= max_len:
            results.append(cleaned)
    return results
class HtmlScraperPlugin:
    """Common base for archive plugins that scrape HTML search pages.

    The base class only stores the shared constructor arguments; every
    site-specific detail (URL, request parameters, parsing) is hardcoded
    in a subclass's ``search`` implementation. ``config`` is accepted
    purely so the plugin registry can construct any plugin type with a
    uniform signature — the base class ignores it.
    """

    category = "archive_searchers"

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        # Identity / presentation.
        self.plugin_id = plugin_id
        self.name = name
        # Request pacing and HTTP behaviour shared by all scrapers.
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.timeout = timeout
        # Whether matched candidates are queued automatically.
        self.auto_queue = auto_queue
        # ``config`` is deliberately not stored; see class docstring.

    def search(self, query: str) -> list[CandidateRecord]:
        """Return candidate books for a free-text query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author,
            year, isbn, and publisher fields.

        Raises:
            NotImplementedError: Always, in this base class; subclasses
                must override with site-specific logic.
        """
        raise NotImplementedError