Better PDF UX: render PDF previews inline in the browser instead of extracting text with pypdf
This commit is contained in:
@@ -166,7 +166,7 @@ def _image_preview_supported(path: str) -> bool:
|
||||
|
||||
|
||||
def _pdf_preview_supported(path: str) -> bool:
|
||||
# Note: PDF previews use pypdf for bounded text extraction and do not require system tools such as poppler.
|
||||
# Note: PDF previews are rendered inline by the browser so image-heavy books keep their page layout.
|
||||
return _file_extension(path) in _PDF_PREVIEW_EXTENSIONS
|
||||
|
||||
|
||||
@@ -330,81 +330,32 @@ def _pdf_file_preview(
|
||||
max_bytes: int = _PDF_TEXT_BYTES,
|
||||
max_pages: int = _PDF_TEXT_PAGES,
|
||||
) -> dict:
|
||||
# Note: PDF text extraction reads only bounded, reasonably sized PDF files and extracts a limited number of pages for modal UX.
|
||||
# Note: The modal keeps a metadata payload here, while the frontend streams the real PDF through the existing file download route in inline mode.
|
||||
size = int(selected.get("size") or 0)
|
||||
result = {
|
||||
return {
|
||||
**selected,
|
||||
"kind": "pdf",
|
||||
"parser": "pypdf",
|
||||
"parser": "browser-pdf-viewer",
|
||||
"supported": True,
|
||||
"sample_bytes": 0,
|
||||
"sample_limit": int(max_bytes),
|
||||
"page_limit": int(max_pages),
|
||||
"sample_limit": 0,
|
||||
"page_limit": 0,
|
||||
"partial": False,
|
||||
"summary": {},
|
||||
"summary": {
|
||||
"duration": None,
|
||||
"bit_rate": human_size(size) if size else None,
|
||||
"compression": "PDF",
|
||||
"producer": "Browser inline preview",
|
||||
"creation_date": None,
|
||||
},
|
||||
"fields": [
|
||||
{"key": "Type", "value": "PDF text preview"},
|
||||
{"key": "Read limit", "value": human_size(max_bytes)},
|
||||
{"key": "Page limit", "value": str(max_pages)},
|
||||
{"key": "Type", "value": "PDF inline preview"},
|
||||
{"key": "PDF size", "value": human_size(size)},
|
||||
{"key": "Preview mode", "value": "Browser PDF renderer"},
|
||||
],
|
||||
"raw": [],
|
||||
"text": "",
|
||||
}
|
||||
if size > max_bytes:
|
||||
result.update({
|
||||
"too_large": True,
|
||||
"error": f"PDF text extraction is limited to {human_size(max_bytes)}. Download the file to read the full PDF.",
|
||||
})
|
||||
return result
|
||||
|
||||
PdfReader = _pdf_imports()
|
||||
data = _read_file_prefix(profile, remote_path, max_bytes)
|
||||
result["sample_bytes"] = len(data)
|
||||
try:
|
||||
from io import BytesIO
|
||||
|
||||
reader = PdfReader(BytesIO(data))
|
||||
if getattr(reader, "is_encrypted", False):
|
||||
try:
|
||||
reader.decrypt("")
|
||||
except Exception:
|
||||
result.update({"error": "This PDF is encrypted and cannot be read without a password."})
|
||||
return result
|
||||
pages = list(reader.pages)
|
||||
page_count = len(pages)
|
||||
extracted = []
|
||||
for page_number, page in enumerate(pages[:max_pages], start=1):
|
||||
try:
|
||||
page_text = page.extract_text() or ""
|
||||
except Exception as exc:
|
||||
page_text = f"[Page {page_number}: text extraction failed: {exc}]"
|
||||
if page_text.strip():
|
||||
extracted.append(f"--- Page {page_number} ---\n{page_text.strip()}")
|
||||
text = "\n\n".join(extracted).strip()
|
||||
result.update({
|
||||
"text": text,
|
||||
"page_count": page_count,
|
||||
"extracted_pages": min(page_count, max_pages),
|
||||
"partial": page_count > max_pages,
|
||||
"summary": {
|
||||
"duration": None,
|
||||
"bit_rate": human_size(size) if size else None,
|
||||
"compression": "PDF",
|
||||
"producer": f"{min(page_count, max_pages)} / {page_count} page(s)",
|
||||
"creation_date": None,
|
||||
},
|
||||
"fields": result["fields"] + [
|
||||
{"key": "PDF size", "value": human_size(size)},
|
||||
{"key": "Pages", "value": str(page_count)},
|
||||
{"key": "Extracted pages", "value": str(min(page_count, max_pages))},
|
||||
],
|
||||
})
|
||||
if not text:
|
||||
result["error"] = "No readable text was found in the selected PDF pages. The file may be scanned or image-based."
|
||||
return result
|
||||
except Exception as exc:
|
||||
result.update({"error": f"Unable to read PDF text: {exc}"})
|
||||
return result
|
||||
|
||||
|
||||
def _media_info_temp_sample(profile: dict, source_path: str, max_bytes: int) -> tuple[str, int]:
|
||||
|
||||
Reference in New Issue
Block a user