media info - read pdf

2026-05-21 10:39:19 +02:00
parent c2948ea277
commit 9142590c79
4 changed files with 177 additions and 1 deletions
--- a/pytorrent/services/rtorrent/files.py
+++ b/pytorrent/services/rtorrent/files.py
@@ -136,10 +136,13 @@ _TEXT_PREVIEW_EXTENSIONS = {
    ".xml", ".yaml", ".yml",
 }
 # Extensions tested by _image_preview_supported() to decide whether a file gets an image preview.
 _IMAGE_PREVIEW_EXTENSIONS = {".avif", ".bmp", ".gif", ".jpeg", ".jpg", ".png", ".webp"}
 # Extensions tested by _pdf_preview_supported() to decide whether a file gets a PDF text preview.
 _PDF_PREVIEW_EXTENSIONS = {".pdf"}
 # 32 MiB. NOTE(review): presumably the cap on the media sample handed to the metadata parser - confirm at the usage site.
 _MEDIA_INFO_SAMPLE_BYTES = 32 * 1024 * 1024
 # 1 MiB. NOTE(review): presumably the chunk size used while copying that sample - confirm at the usage site.
 _MEDIA_INFO_CHUNK_BYTES = 1024 * 1024
 # 512 KiB. NOTE(review): presumably the read cap for text previews - confirm against _text_file_preview.
 _TEXT_PREVIEW_BYTES = 512 * 1024
 # 8 MiB. NOTE(review): presumably the read cap for image previews - confirm against _image_file_preview.
 _IMAGE_PREVIEW_BYTES = 8 * 1024 * 1024
 # 16 MiB. Default max_bytes of _pdf_file_preview(); PDFs reporting a larger size are not parsed.
 _PDF_TEXT_BYTES = 16 * 1024 * 1024
 # Default max_pages of _pdf_file_preview(): how many leading pages have their text extracted.
 _PDF_TEXT_PAGES = 10
 # Directory below BASE_DIR/data. NOTE(review): presumably where temporary media samples are written - confirm.
 _MEDIA_INFO_TMP_DIR = BASE_DIR / "data" / "media-info-samples"
@@ -162,6 +165,11 @@ def _image_preview_supported(path: str) -> bool:
    return _file_extension(path) in _IMAGE_PREVIEW_EXTENSIONS
 def _pdf_preview_supported(path: str) -> bool:
    # Note: pypdf handles the bounded text extraction for PDF previews, so system tools such as poppler are not required.
    # Returns True when the extension of `path` is one of the PDF preview extensions.
    extension = _file_extension(path)
    return extension in _PDF_PREVIEW_EXTENSIONS
 def _media_info_sample_suffix(source_path: str) -> str:
    suffix = LocalPath(str(source_path or "")).suffix.lower()
    if suffix and len(suffix) <= 16 and all(ch.isalnum() or ch in ".-_" for ch in suffix):
@@ -288,6 +296,117 @@ def _image_file_preview(profile: dict, selected: dict, remote_path: str, max_byt
    return result
 def _pdf_imports():
    # Note: pypdf is loaded on first use, which keeps non-PDF previews free of this dependency at request time.
    # Returns the PdfReader class. Every import failure is re-raised as a RuntimeError that names the
    # interpreter in use, with the original exception attached as its cause.
    import sys
    try:
        from pypdf import PdfReader
    except ModuleNotFoundError as exc:
        missing = str(getattr(exc, "name", "") or "pypdf")
        top_level_package = missing.partition(".")[0]
        if top_level_package != "pypdf":
            # pypdf itself was located; a module it imports was not.
            raise RuntimeError(
                f"pypdf is installed, but one of its Python dependencies is missing: {missing}. "
                f"Runtime: {sys.executable}."
            ) from exc
        # The missing module is pypdf (or one of its own submodules): point at the install command.
        raise RuntimeError(
            "Python package 'pypdf' is not importable in the application runtime. "
            "Install it inside the pyTorrent virtualenv and restart the service: "
            "/opt/pyTorrent/venv/bin/pip install -r /opt/pyTorrent/requirements.txt && systemctl restart pytorrent. "
            f"Runtime: {sys.executable}."
        ) from exc
    except Exception as exc:
        # Any other failure while importing (for example a broken installation) lands here.
        raise RuntimeError(
            "pypdf was found, but failed during import. "
            f"Runtime: {sys.executable}. Details: {exc}"
        ) from exc
    else:
        return PdfReader
 def _pdf_file_preview(
    profile: dict,
    selected: dict,
    remote_path: str,
    max_bytes: int = _PDF_TEXT_BYTES,
    max_pages: int = _PDF_TEXT_PAGES,
 ) -> dict:
    # Note: PDF text extraction reads only bounded, reasonably sized PDF files and extracts a limited number of pages for modal UX.
    # Args:
    #   profile, remote_path: passed through unchanged to _read_file_prefix().
    #   selected: file entry dict; its keys are copied into the result and its "size" drives the size gate.
    #   max_bytes: largest reported file size that is parsed, and the read limit for the file prefix.
    #   max_pages: number of leading pages whose text is extracted.
    # Returns: the preview dict. Parsing problems are reported through its "error" key instead of being raised.
    # Raises: RuntimeError from _pdf_imports() when pypdf cannot be imported.
    from io import BytesIO

    size = int(selected.get("size") or 0)
    result = {
        **selected,
        "kind": "pdf",
        "parser": "pypdf",
        "supported": True,
        "sample_bytes": 0,
        "sample_limit": int(max_bytes),
        "page_limit": int(max_pages),
        "partial": False,
        "summary": {},
        "fields": [
            {"key": "Type", "value": "PDF text preview"},
            {"key": "Read limit", "value": human_size(max_bytes)},
            {"key": "Page limit", "value": str(max_pages)},
        ],
        "raw": [],
        "text": "",
    }
    if size > max_bytes:
        # Oversized files are rejected before anything is read or parsed.
        result.update({
            "too_large": True,
            "error": f"PDF text extraction is limited to {human_size(max_bytes)}. Download the file to read the full PDF.",
        })
        return result
    PdfReader = _pdf_imports()
    data = _read_file_prefix(profile, remote_path, max_bytes)
    result["sample_bytes"] = len(data)
    try:
        reader = PdfReader(BytesIO(data))
        if getattr(reader, "is_encrypted", False):
            # pypdf reports a wrong password through a falsy return value
            # (PasswordType.NOT_DECRYPTED) rather than an exception, so the
            # return value has to be checked as well as the raising case.
            try:
                unlocked = bool(reader.decrypt(""))
            except Exception:
                unlocked = False
            if not unlocked:
                result.update({"error": "This PDF is encrypted and cannot be read without a password."})
                return result
        pages = reader.pages
        page_count = len(pages)
        # Clamp at zero so a nonsensical negative max_pages extracts nothing.
        shown_pages = max(0, min(page_count, max_pages))
        extracted = []
        # Index only the pages that are shown instead of copying the whole page list.
        for page_number in range(1, shown_pages + 1):
            page = pages[page_number - 1]
            try:
                page_text = page.extract_text() or ""
            except Exception as exc:
                # A single unreadable page is reported inline and does not abort the preview.
                page_text = f"[Page {page_number}: text extraction failed: {exc}]"
            if page_text.strip():
                extracted.append(f"--- Page {page_number} ---\n{page_text.strip()}")
        text = "\n\n".join(extracted).strip()
        result.update({
            "text": text,
            "page_count": page_count,
            "extracted_pages": shown_pages,
            "partial": page_count > max_pages,
            # The summary reuses the key names of the media-info summary.
            # NOTE(review): presumably so the existing modal can render it unchanged - confirm in torrentDetails.js.
            "summary": {
                "duration": None,
                "bit_rate": human_size(size) if size else None,
                "compression": "PDF",
                "producer": f"{shown_pages} / {page_count} page(s)",
                "creation_date": None,
            },
            "fields": result["fields"] + [
                {"key": "PDF size", "value": human_size(size)},
                {"key": "Pages", "value": str(page_count)},
                {"key": "Extracted pages", "value": str(shown_pages)},
            ],
        })
        if not text:
            result["error"] = "No readable text was found in the selected PDF pages. The file may be scanned or image-based."
        return result
    except Exception as exc:
        # Malformed or truncated PDFs (and any other parser failure) become an error entry.
        result.update({"error": f"Unable to read PDF text: {exc}"})
        return result
 def _media_info_temp_sample(profile: dict, source_path: str, max_bytes: int) -> tuple[str, int]:
    # Note: hachoir needs a seekable file, so this writes a bounded sample into the app data directory instead of loading whole media into RAM.
    import tempfile
@@ -422,6 +541,8 @@ def torrent_file_media_info(profile: dict, torrent_hash: str, index: int, max_by
        return _text_file_preview(profile, selected, remote_path)
    if _image_preview_supported(name):
        return _image_file_preview(profile, selected, remote_path)
    if _pdf_preview_supported(name):
        return _pdf_file_preview(profile, selected, remote_path)
    supported = _media_info_supported(name)
    result = {
--- a/pytorrent/static/js/torrentDetails.js
+++ b/pytorrent/static/js/torrentDetails.js
--- a/pytorrent/static/styles.css
+++ b/pytorrent/static/styles.css
@@ -4565,3 +4565,57 @@ body,
    width: 9rem;
  }
 }
 /* Panel for extracted PDF text: serif type, theme colours taken from the Bootstrap variables. */
 .media-info-pdf-preview {
  background: var(--bs-body-bg);
  border: 1px solid var(--bs-border-color);
  border-radius: 0.75rem;
  color: var(--bs-body-color);
  font-family: ui-serif, Georgia, Cambria, "Times New Roman", Times, serif;
  font-size: 0.95rem;
  line-height: 1.5;
  margin: 0 0 1rem;
  /* Cap the height at 62% of the viewport and scroll the overflow inside the panel. */
  max-height: 62vh;
  overflow: auto;
  padding: 1rem;
  /* Keep the line breaks of the extracted text while still wrapping long lines. */
  white-space: pre-wrap;
 }
 /* Notice box laid out as a three-column grid: auto-sized first column, flexible middle, auto-sized last. */
 /* NOTE(review): presumably shown when a PDF yields no text or an error - confirm in torrentDetails.js. */
 .media-info-pdf-empty {
  align-items: center;
  background: var(--bs-tertiary-bg);
  border: 1px solid var(--bs-border-color);
  border-radius: 0.9rem;
  display: grid;
  gap: 1rem;
  grid-template-columns: auto minmax(0, 1fr) auto;
  margin-bottom: 1rem;
  padding: 1rem;
 }
 /* Direct-child icon, drawn in the theme's danger colour. */
 .media-info-pdf-empty > i {
  color: var(--bs-danger);
  font-size: 1.8rem;
 }
 /* Put the bold title and the description span on separate lines. */
 .media-info-pdf-empty b,
 .media-info-pdf-empty span {
  display: block;
 }
 /* The description span uses the secondary text colour with a small gap above it. */
 .media-info-pdf-empty span {
  color: var(--bs-secondary-color);
  margin-top: 0.15rem;
 }
 /* Flex row that pushes its content to the right edge. */
 /* NOTE(review): presumably holds the download button - confirm in torrentDetails.js. */
 .media-info-download-row {
  display: flex;
  justify-content: flex-end;
  margin-bottom: 1rem;
 }
/* On viewports up to 560px wide, stack the .media-info-pdf-empty grid into a single column. */
@media (max-width: 560px) {
  .media-info-pdf-empty {
    grid-template-columns: 1fr;
  }
 }
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ psutil>=5.9
 simple-websocket>=1.0
 gunicorn>=22.0
 hachoir>=3.3
 pypdf>=4.3