bookshelf/src/plugins/ai_compat/text_recognizer.py

"""Text recognizer plugin — spine image → raw text + structured fields.

Input: book spine image.
Output: {"raw_text": "...", "title": "...", "author": "...", "year": "...",
         "publisher": "...", "other": "..."}
  raw_text — all visible text verbatim, line-break separated.
  other fields — VLM interpretation of raw_text.
Result added to books.candidates and books.raw_text.
"""

from models import AIConfig, TextRecognizeResult

from ._client import AIClient


class TextRecognizerPlugin:
    """Reads text from a book spine image using a VLM."""

    category = "text_recognizers"
    OUTPUT_FORMAT = (
        '{"raw_text": "The Great Gatsby\\nF. Scott Fitzgerald\\nScribner", '
        '"title": "The Great Gatsby", "author": "F. Scott Fitzgerald", '
        '"year": "", "publisher": "Scribner", "other": ""}'
    )

    def __init__(
        self,
        plugin_id: str,
        name: str,
        ai_config: AIConfig,
        prompt_text: str,
        auto_queue: bool,
        rate_limit_seconds: float,
    ):
        self.plugin_id = plugin_id
        self.name = name
        self.auto_queue = auto_queue
        self.rate_limit_seconds = rate_limit_seconds
        self._client = AIClient(ai_config, self.OUTPUT_FORMAT)
        self._prompt_text = prompt_text

    def recognize(self, image_b64: str, image_mime: str) -> TextRecognizeResult:
        """Returns TextRecognizeResult with raw_text, title, author, year, publisher, other."""
        raw = self._client.call(self._prompt_text, [(image_b64, image_mime)])
        return TextRecognizeResult(
            raw_text=str(raw.get("raw_text") or ""),
            title=str(raw.get("title") or ""),
            author=str(raw.get("author") or ""),
            year=str(raw.get("year") or ""),
            publisher=str(raw.get("publisher") or ""),
            other=str(raw.get("other") or ""),
        )

    @property
    def model(self) -> str:
        return self._client.cfg["model"]

    @property
    def max_image_px(self) -> int:
        return self._client.cfg["max_image_px"]