Files
bookshelf/src/plugins/archives/html_scraper.py
Petr Polezhaev fd32be729f Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00

102 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Four-digit year anywhere in text: 1000-1999, or 2000-2029.
# NOTE(review): the second alternative caps at 2029 — will need widening
# for publications dated 2030 onward; confirm that is acceptable.
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Splits "Surname I.N. Rest..." into (surname + 1-3 initials, remainder).
# Each initial is a single Cyrillic or Latin capital followed by a dot;
# DOTALL lets the remainder span line breaks.
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Strips inner HTML tags and normalises whitespace, so elements like
    ``<span class=''><b>Name</b> I.N.</span>`` work correctly.

    Args:
        html: Raw HTML string to search.
        cls_frag: Substring that must appear in the class attribute value.
        min_len: Minimum length of extracted text to keep.
        max_len: Maximum length of extracted text to keep.

    Returns:
        Up to three non-empty text strings in document order.
    """
    # Capture the tag name and require the matching closing tag (\1) so that
    # nested inline markup (e.g. <b>...</b>) inside the element does not
    # truncate the capture at the first "</", as the previous pattern
    # ("...>(.*?)</") did — that bug dropped everything after the first
    # nested child, e.g. "<b>Name</b> I.N." yielded only "Name".
    pattern = rf'<(\w+)[^>]*class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)</\1>'
    out: list[str] = []
    for _tag, inner in re.findall(pattern, html, re.DOTALL):
        text = re.sub(r"<[^>]+>", "", inner)  # drop nested tags, keep their text
        text = re.sub(r"\s+", " ", text).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
        if len(out) == 3:  # three candidates is the cap for scraper results
            break
    return out
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Collect alt-text strings from ``<img>`` tags in an HTML document.

    Args:
        html: Raw HTML string to search.
        min_len: Minimum character length to include.
        max_len: Maximum character length to include.

    Returns:
        Up to three non-empty, whitespace-normalised alt strings, in
        document order.
    """
    results: list[str] = []
    for raw_alt in re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html):
        if len(results) == 3:  # scraper results are capped at three
            break
        # Collapse internal runs of whitespace and trim the ends.
        cleaned = re.sub(r"\s+", " ", raw_alt).strip()
        if min_len <= len(cleaned) <= max_len:
            results.append(cleaned)
    return results
class HtmlScraperPlugin:
    """Common base for archive plugins that scrape HTML search pages.

    The base class only stores the shared constructor arguments; every
    site-specific detail (URL, request parameters, parsing) is hardcoded
    in a subclass's ``search`` implementation. ``config`` is accepted
    purely so the plugin registry can construct any plugin type with a
    uniform signature — the base class ignores it.
    """

    category = "archive_searchers"

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        # Identity / presentation.
        self.plugin_id = plugin_id
        self.name = name
        # Request pacing and HTTP behaviour shared by all scrapers.
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.timeout = timeout
        # Whether matched candidates are queued automatically.
        self.auto_queue = auto_queue
        # ``config`` is deliberately not stored; see class docstring.

    def search(self, query: str) -> list[CandidateRecord]:
        """Return candidate books for a free-text query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author,
            year, isbn, and publisher fields.

        Raises:
            NotImplementedError: Always, in this base class; subclasses
                must override with site-specific logic.
        """
        raise NotImplementedError