media info - read pdf
This commit is contained in:
@@ -136,10 +136,13 @@ _TEXT_PREVIEW_EXTENSIONS = {
|
|||||||
".xml", ".yaml", ".yml",
|
".xml", ".yaml", ".yml",
|
||||||
}
|
}
|
||||||
_IMAGE_PREVIEW_EXTENSIONS = {".avif", ".bmp", ".gif", ".jpeg", ".jpg", ".png", ".webp"}
|
_IMAGE_PREVIEW_EXTENSIONS = {".avif", ".bmp", ".gif", ".jpeg", ".jpg", ".png", ".webp"}
|
||||||
|
# File extensions that _pdf_preview_supported() treats as PDF documents.
_PDF_PREVIEW_EXTENSIONS = {".pdf"}
|
||||||
_MEDIA_INFO_SAMPLE_BYTES = 32 * 1024 * 1024
|
_MEDIA_INFO_SAMPLE_BYTES = 32 * 1024 * 1024
|
||||||
_MEDIA_INFO_CHUNK_BYTES = 1024 * 1024
|
_MEDIA_INFO_CHUNK_BYTES = 1024 * 1024
|
||||||
_TEXT_PREVIEW_BYTES = 512 * 1024
|
_TEXT_PREVIEW_BYTES = 512 * 1024
|
||||||
_IMAGE_PREVIEW_BYTES = 8 * 1024 * 1024
|
_IMAGE_PREVIEW_BYTES = 8 * 1024 * 1024
|
||||||
|
# Default byte cap for PDF previews (16 MiB): _pdf_file_preview() rejects files reported as larger and reads at most this many bytes.
_PDF_TEXT_BYTES = 16 * 1024 * 1024
|
||||||
|
# Default number of leading pages whose text _pdf_file_preview() extracts.
_PDF_TEXT_PAGES = 10
|
||||||
_MEDIA_INFO_TMP_DIR = BASE_DIR / "data" / "media-info-samples"
|
_MEDIA_INFO_TMP_DIR = BASE_DIR / "data" / "media-info-samples"
|
||||||
|
|
||||||
|
|
||||||
@@ -162,6 +165,11 @@ def _image_preview_supported(path: str) -> bool:
|
|||||||
return _file_extension(path) in _IMAGE_PREVIEW_EXTENSIONS
|
return _file_extension(path) in _IMAGE_PREVIEW_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_preview_supported(path: str) -> bool:
    """Return True when *path* has an extension listed in ``_PDF_PREVIEW_EXTENSIONS``.

    PDF previews rely on pypdf for bounded text extraction, so no system
    tools such as poppler are required.
    """
    extension = _file_extension(path)
    return extension in _PDF_PREVIEW_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
def _media_info_sample_suffix(source_path: str) -> str:
|
def _media_info_sample_suffix(source_path: str) -> str:
|
||||||
suffix = LocalPath(str(source_path or "")).suffix.lower()
|
suffix = LocalPath(str(source_path or "")).suffix.lower()
|
||||||
if suffix and len(suffix) <= 16 and all(ch.isalnum() or ch in ".-_" for ch in suffix):
|
if suffix and len(suffix) <= 16 and all(ch.isalnum() or ch in ".-_" for ch in suffix):
|
||||||
@@ -288,6 +296,117 @@ def _image_file_preview(profile: dict, selected: dict, remote_path: str, max_byt
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_imports():
    """Import pypdf on demand and return its ``PdfReader`` class.

    The import is deferred to call time so that non-PDF previews never depend
    on pypdf being installed.

    Raises:
        RuntimeError: if pypdf itself is missing, if one of its dependencies
            is missing, or if importing it fails for any other reason. The
            message always names the running interpreter, and the original
            exception is chained as the cause.
    """
    import sys

    try:
        from pypdf import PdfReader
    except ModuleNotFoundError as exc:
        missing = str(getattr(exc, "name", "") or "pypdf")
        top_level_package = missing.partition(".")[0]
        if top_level_package != "pypdf":
            # pypdf was located, but something it imports is absent.
            raise RuntimeError(
                f"pypdf is installed, but one of its Python dependencies is missing: {missing}. "
                f"Runtime: {sys.executable}."
            ) from exc
        raise RuntimeError(
            "Python package 'pypdf' is not importable in the application runtime. "
            "Install it inside the pyTorrent virtualenv and restart the service: "
            "/opt/pyTorrent/venv/bin/pip install -r /opt/pyTorrent/requirements.txt && systemctl restart pytorrent. "
            f"Runtime: {sys.executable}."
        ) from exc
    except Exception as exc:
        # Any other failure while importing is reported with the interpreter path for diagnosis.
        raise RuntimeError(
            "pypdf was found, but failed during import. "
            f"Runtime: {sys.executable}. Details: {exc}"
        ) from exc
    else:
        return PdfReader
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_file_preview(
    profile: dict,
    selected: dict,
    remote_path: str,
    max_bytes: int = _PDF_TEXT_BYTES,
    max_pages: int = _PDF_TEXT_PAGES,
) -> dict:
    """Build a bounded text preview of a PDF for the media-info modal.

    Args:
        profile: Connection profile, passed through to ``_read_file_prefix``.
        selected: Metadata of the chosen file; its entries are copied into the
            result and its ``"size"`` entry (if any) is used for the size gate.
        remote_path: Path of the PDF, passed through to ``_read_file_prefix``.
        max_bytes: Files reported as larger than this are not read at all;
            otherwise at most this many bytes are requested.
        max_pages: Maximum number of leading pages to extract text from.
            Negative values are treated as zero.

    Returns:
        A dict containing the entries of ``selected`` plus preview fields
        (``kind``, ``parser``, ``text``, ``fields``, ``summary`` and so on).
        Problems with the PDF itself are reported through an ``"error"`` entry
        rather than an exception.

    Raises:
        RuntimeError: if pypdf cannot be imported (raised by ``_pdf_imports``).
        Exceptions raised by ``_read_file_prefix`` are not caught here.
    """
    size = int(selected.get("size") or 0)
    result = {
        **selected,
        "kind": "pdf",
        "parser": "pypdf",
        "supported": True,
        "sample_bytes": 0,
        "sample_limit": int(max_bytes),
        "page_limit": int(max_pages),
        "partial": False,
        "summary": {},
        "fields": [
            {"key": "Type", "value": "PDF text preview"},
            {"key": "Read limit", "value": human_size(max_bytes)},
            {"key": "Page limit", "value": str(max_pages)},
        ],
        "raw": [],
        "text": "",
    }
    if size > max_bytes:
        # A truncated PDF cannot be parsed, so oversized files are refused up front.
        result.update({
            "too_large": True,
            "error": f"PDF text extraction is limited to {human_size(max_bytes)}. Download the file to read the full PDF.",
        })
        return result

    PdfReader = _pdf_imports()
    data = _read_file_prefix(profile, remote_path, max_bytes)
    result["sample_bytes"] = len(data)
    try:
        from io import BytesIO

        reader = PdfReader(BytesIO(data))
        if getattr(reader, "is_encrypted", False):
            # decrypt() signals a rejected password with a falsy return value
            # (PasswordType.NOT_DECRYPTED), not with an exception, so the
            # return value must be checked as well.
            try:
                unlocked = bool(reader.decrypt(""))
            except Exception:
                unlocked = False
            if not unlocked:
                result.update({"error": "This PDF is encrypted and cannot be read without a password."})
                return result

        # Index into the lazy page sequence instead of materialising every
        # page: only the first few pages are ever used.
        pages = reader.pages
        page_count = len(pages)
        extracted_pages = min(page_count, max(int(max_pages), 0))
        extracted = []
        for index in range(extracted_pages):
            page_number = index + 1
            try:
                page_text = pages[index].extract_text() or ""
            except Exception as exc:
                # One unreadable page should not discard the pages that did extract.
                page_text = f"[Page {page_number}: text extraction failed: {exc}]"
            if page_text.strip():
                extracted.append(f"--- Page {page_number} ---\n{page_text.strip()}")
        text = "\n\n".join(extracted).strip()
        result.update({
            "text": text,
            "page_count": page_count,
            "extracted_pages": extracted_pages,
            "partial": page_count > extracted_pages,
            # The summary reuses the key names of the media summary: here
            # "bit_rate" carries the file size and "producer" the page counts.
            "summary": {
                "duration": None,
                "bit_rate": human_size(size) if size else None,
                "compression": "PDF",
                "producer": f"{extracted_pages} / {page_count} page(s)",
                "creation_date": None,
            },
            "fields": result["fields"] + [
                {"key": "PDF size", "value": human_size(size)},
                {"key": "Pages", "value": str(page_count)},
                {"key": "Extracted pages", "value": str(extracted_pages)},
            ],
        })
        if not text:
            result["error"] = "No readable text was found in the selected PDF pages. The file may be scanned or image-based."
        return result
    except Exception as exc:
        # Malformed or unsupported PDFs are reported to the UI instead of failing the request.
        result.update({"error": f"Unable to read PDF text: {exc}"})
        return result
|
||||||
|
|
||||||
|
|
||||||
def _media_info_temp_sample(profile: dict, source_path: str, max_bytes: int) -> tuple[str, int]:
|
def _media_info_temp_sample(profile: dict, source_path: str, max_bytes: int) -> tuple[str, int]:
|
||||||
# Note: hachoir needs a seekable file, so this writes a bounded sample into the app data directory instead of loading whole media into RAM.
|
# Note: hachoir needs a seekable file, so this writes a bounded sample into the app data directory instead of loading whole media into RAM.
|
||||||
import tempfile
|
import tempfile
|
||||||
@@ -422,6 +541,8 @@ def torrent_file_media_info(profile: dict, torrent_hash: str, index: int, max_by
|
|||||||
return _text_file_preview(profile, selected, remote_path)
|
return _text_file_preview(profile, selected, remote_path)
|
||||||
if _image_preview_supported(name):
|
if _image_preview_supported(name):
|
||||||
return _image_file_preview(profile, selected, remote_path)
|
return _image_file_preview(profile, selected, remote_path)
|
||||||
|
if _pdf_preview_supported(name):
|
||||||
|
return _pdf_file_preview(profile, selected, remote_path)
|
||||||
|
|
||||||
supported = _media_info_supported(name)
|
supported = _media_info_supported(name)
|
||||||
result = {
|
result = {
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -4565,3 +4565,57 @@ body,
|
|||||||
width: 9rem;
|
width: 9rem;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.media-info-pdf-preview {
|
||||||
|
background: var(--bs-body-bg);
|
||||||
|
border: 1px solid var(--bs-border-color);
|
||||||
|
border-radius: 0.75rem;
|
||||||
|
color: var(--bs-body-color);
|
||||||
|
font-family: ui-serif, Georgia, Cambria, "Times New Roman", Times, serif;
|
||||||
|
font-size: 0.95rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
margin: 0 0 1rem;
|
||||||
|
max-height: 62vh;
|
||||||
|
overflow: auto;
|
||||||
|
padding: 1rem;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.media-info-pdf-empty {
|
||||||
|
align-items: center;
|
||||||
|
background: var(--bs-tertiary-bg);
|
||||||
|
border: 1px solid var(--bs-border-color);
|
||||||
|
border-radius: 0.9rem;
|
||||||
|
display: grid;
|
||||||
|
gap: 1rem;
|
||||||
|
grid-template-columns: auto minmax(0, 1fr) auto;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
padding: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.media-info-pdf-empty > i {
|
||||||
|
color: var(--bs-danger);
|
||||||
|
font-size: 1.8rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.media-info-pdf-empty b,
|
||||||
|
.media-info-pdf-empty span {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
.media-info-pdf-empty span {
|
||||||
|
color: var(--bs-secondary-color);
|
||||||
|
margin-top: 0.15rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.media-info-download-row {
|
||||||
|
display: flex;
|
||||||
|
justify-content: flex-end;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 560px) {
|
||||||
|
.media-info-pdf-empty {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ psutil>=5.9
|
|||||||
simple-websocket>=1.0
|
simple-websocket>=1.0
|
||||||
gunicorn>=22.0
|
gunicorn>=22.0
|
||||||
hachoir>=3.3
|
hachoir>=3.3
|
||||||
|
pypdf>=4.3
|
||||||
|
|||||||
Reference in New Issue
Block a user