Files
pyTorrent/pytorrent/services/tracker_cache.py
Mateusz Gruszczyński 96e17d4b63 favicons
2026-05-08 19:59:38 +02:00

374 lines
15 KiB
Python

from __future__ import annotations
import json
import mimetypes
import re
import time
import threading
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from pathlib import Path
from ..config import BASE_DIR
from ..db import connect, utcnow
# Tracker summary rows are considered fresh for one week before a rescan.
TRACKER_CACHE_TTL_SECONDS = 7 * 24 * 60 * 60
# Downloaded favicons are likewise kept for one week before a re-fetch.
FAVICON_CACHE_TTL_SECONDS = 7 * 24 * 60 * 60
# Maximum number of stale torrents refreshed per summary() call.
TRACKER_SCAN_LIMIT = 80
# On-disk favicon cache location and the URL prefix the browser loads it through.
FAVICON_DIR = BASE_DIR / "data" / "tracker_favicons"
PUBLIC_FAVICON_BASE = "/static/tracker_favicons"
# One warmup lock per profile id; the guard serializes creation of those locks.
_TRACKER_SCAN_LOCKS: dict[int, threading.Lock] = {}
_TRACKER_SCAN_LOCKS_GUARD = threading.Lock()
class _IconParser(HTMLParser):
def __init__(self):
super().__init__()
self.icons: list[str] = []
def handle_starttag(self, tag: str, attrs):
if tag.lower() != "link":
return
data = {str(k).lower(): str(v or "") for k, v in attrs}
rel = data.get("rel", "").lower()
href = data.get("href", "").strip()
if href and any(part in rel.split() for part in ("icon", "shortcut", "apple-touch-icon")):
self.icons.append(href)
def _now_epoch() -> float:
return time.time()
def tracker_domain(url: str) -> str:
    """Normalize a tracker URL (or bare host) to a lowercase domain without a leading "www."."""
    text = str(url or "").strip()
    if not text:
        return ""
    # Bare hosts get a dummy scheme so urlparse fills in .hostname.
    if "://" not in text:
        text = f"http://{text}"
    host = (urllib.parse.urlparse(text).hostname or "").lower().strip(".")
    return host[4:] if host.startswith("www.") else host
def _root_domain(domain: str) -> str:
parts = [p for p in str(domain or "").lower().strip(".").split(".") if p]
if len(parts) <= 2:
return ".".join(parts)
if len(parts[-1]) == 2 and len(parts[-2]) <= 3 and len(parts) >= 3:
return ".".join(parts[-3:])
return ".".join(parts[-2:])
def _safe_filename(domain: str) -> str:
return re.sub(r"[^a-z0-9_.-]+", "_", domain.lower()).strip("._") or "tracker"
def _read_cached(profile_id: int, hashes: list[str], ttl: int) -> tuple[dict[str, list[dict]], set[str]]:
    """Load cached tracker lists for the given torrent hashes.

    Returns ``(cached, fresh)``: ``cached`` maps torrent hash -> tracker list for every
    row found, and ``fresh`` is the subset whose row is younger than ``ttl`` seconds.
    """
    if not hashes:
        return {}, set()
    now = _now_epoch()
    cached: dict[str, list[dict]] = {}
    fresh: set[str] = set()
    with connect() as conn:
        # SQLite caps bound parameters, so query in chunks of 900 hashes.
        for offset in range(0, len(hashes), 900):
            batch = hashes[offset:offset + 900]
            marks = ",".join("?" for _ in batch)
            query = (
                "SELECT torrent_hash, trackers_json, updated_epoch FROM tracker_summary_cache "
                f"WHERE profile_id=? AND torrent_hash IN ({marks})"
            )
            for row in conn.execute(query, (profile_id, *batch)).fetchall():
                torrent_hash = str(row.get("torrent_hash") or "")
                try:
                    parsed = json.loads(row.get("trackers_json") or "[]")
                except Exception:
                    parsed = []
                # Tolerate malformed rows: anything that is not a list becomes empty.
                cached[torrent_hash] = parsed if isinstance(parsed, list) else []
                if now - float(row.get("updated_epoch") or 0) < ttl:
                    fresh.add(torrent_hash)
    return cached, fresh
def _store(profile_id: int, torrent_hash: str, trackers: list[dict]) -> None:
    """Persist a deduplicated, compact tracker list for one torrent (upsert by profile+hash)."""
    now = utcnow()
    epoch = _now_epoch()
    compact: list[dict] = []
    seen_domains: set[str] = set()
    for tracker in trackers:
        url = str(tracker.get("url") or "")
        # Prefer the domain derived from the URL; fall back to a raw "domain" field.
        domain = tracker_domain(url or str(tracker.get("domain") or "")) or str(tracker.get("domain") or "")
        if not domain or domain in seen_domains:
            continue
        seen_domains.add(domain)
        compact.append({"domain": domain, "url": url})
    with connect() as conn:
        conn.execute(
            """
            INSERT INTO tracker_summary_cache(profile_id, torrent_hash, trackers_json, updated_at, updated_epoch)
            VALUES(?, ?, ?, ?, ?)
            ON CONFLICT(profile_id, torrent_hash) DO UPDATE SET
                trackers_json=excluded.trackers_json,
                updated_at=excluded.updated_at,
                updated_epoch=excluded.updated_epoch
            """,
            (profile_id, torrent_hash, json.dumps(compact), now, epoch),
        )
def summary(profile: dict, hashes: list[str], loader, scan_limit: int = TRACKER_SCAN_LIMIT, include_favicons: bool = False) -> dict:
    """Build tracker sidebar data from disk cache and refresh a small batch per request.

    ``loader`` is called as ``loader(torrent_hash)`` and must return a list of tracker
    dicts carrying at least a "url" or "domain" key.  At most ``scan_limit`` stale or
    missing hashes are refreshed per call; the remainder is reported via the returned
    "pending" count so callers can warm the cache asynchronously.
    """
    # Note: Tracker data is cached per torrent hash, so huge rTorrent libraries are never scanned in one UI request.
    profile_id = int(profile.get("id") or 0)
    clean_hashes = [str(h or "").strip() for h in hashes if str(h or "").strip()]
    cached, fresh = _read_cached(profile_id, clean_hashes, TRACKER_CACHE_TTL_SECONDS)
    missing = [h for h in clean_hashes if h not in fresh]
    errors: list[dict] = []
    scanned_now = 0
    # Refresh only a bounded batch of stale entries; loader failures are collected, not raised.
    for h in missing[:max(0, int(scan_limit or 0))]:
        try:
            trackers = loader(h)
            _store(profile_id, h, trackers)
            cached[h] = [{"domain": tracker_domain(t.get("url") or t.get("domain") or ""), "url": str(t.get("url") or "")} for t in trackers]
            fresh.add(h)
            scanned_now += 1
        except Exception as exc:
            errors.append({"hash": h, "error": str(exc)})
    by_hash: dict[str, list[dict]] = {}
    counts: dict[str, dict] = {}
    for h in clean_hashes:
        items = []
        seen = set()
        for item in cached.get(h, []):
            domain = tracker_domain(str(item.get("url") or item.get("domain") or "")) or str(item.get("domain") or "")
            # Deduplicate per torrent so a domain counts at most once per hash.
            if not domain or domain in seen:
                continue
            seen.add(domain)
            row = {"domain": domain, "url": str(item.get("url") or "")}
            items.append(row)
            # Aggregate per-domain counts; keep the first non-empty URL seen for the domain.
            bucket = counts.setdefault(domain, {"domain": domain, "url": row["url"], "count": 0})
            bucket["count"] += 1
            if not bucket.get("url") and row["url"]:
                bucket["url"] = row["url"]
        by_hash[h] = items
    # Sort by descending torrent count, then alphabetically for a stable sidebar order.
    trackers = sorted(counts.values(), key=lambda x: (-int(x.get("count") or 0), str(x.get("domain") or "")))
    if include_favicons:
        # Note: Summary returns only already cached static favicon URLs; network favicon discovery stays outside the hot tracker count path.
        for item in trackers:
            item["favicon_url"] = favicon_public_url(str(item.get("domain") or ""), enabled=True, create=False)
    pending = max(0, len([h for h in clean_hashes if h not in fresh]))
    return {"hashes": by_hash, "trackers": trackers, "errors": errors[:25], "scanned": len(clean_hashes), "scanned_now": scanned_now, "pending": pending, "cached": len(clean_hashes) - pending}
def _scan_lock(profile_id: int) -> threading.Lock:
    """Return the per-profile warmup lock, creating it on first use under the registry guard."""
    with _TRACKER_SCAN_LOCKS_GUARD:
        return _TRACKER_SCAN_LOCKS.setdefault(profile_id, threading.Lock())
def warm_summary_cache(profile: dict, hashes: list[str], loader, batch_size: int = TRACKER_SCAN_LIMIT) -> bool:
    """Start a non-blocking tracker cache warmup for large libraries.

    Returns True when a background warmup thread was started, False when the input
    is empty or a warmup for this profile is already running.
    """
    # Note: Tracker cache warming runs in one background thread per profile, so F5 returns cached data immediately instead of waiting for rTorrent scans.
    profile_id = int(profile.get("id") or 0)
    clean_hashes = [str(h or "").strip() for h in hashes if str(h or "").strip()]
    if not profile_id or not clean_hashes:
        return False
    lock = _scan_lock(profile_id)
    if lock.locked():
        # A warmup for this profile is already in flight; do not start another.
        return False
    step = max(1, int(batch_size or TRACKER_SCAN_LIMIT))

    def _worker():
        # Re-check under the lock: another thread may have won the race since lock.locked().
        if not lock.acquire(blocking=False):
            return
        try:
            while True:
                result = summary(profile, clean_hashes, loader, scan_limit=step, include_favicons=False)
                nothing_pending = int(result.get("pending") or 0) <= 0
                made_no_progress = int(result.get("scanned_now") or 0) <= 0
                if nothing_pending or made_no_progress:
                    break
                time.sleep(0.05)  # brief pause between batches
        finally:
            lock.release()

    threading.Thread(target=_worker, name=f"tracker-cache-warm-{profile_id}", daemon=True).start()
    return True
def favicon_public_url(domain: str, enabled: bool = True, create: bool = False, force: bool = False) -> str:
    """Return the static URL for a cached tracker favicon, optionally creating or refreshing it first.

    Returns "" when disabled, the domain is empty, the cache entry is missing/stale,
    or the cached file no longer exists on disk.
    """
    # Note: Favicon files stay in data/tracker_favicons, but the browser loads them via the static/tracker_favicons symlink.
    clean = tracker_domain(domain)
    if not enabled or not clean:
        return ""
    if create:
        favicon_path(clean, enabled=True, force=force)
    record = _cached_favicon(clean)
    if not record:
        return ""
    age = _now_epoch() - float(record.get("updated_epoch") or 0)
    if age >= FAVICON_CACHE_TTL_SECONDS:
        return ""
    icon_file = Path(str(record.get("file_path") or ""))
    if not (icon_file.exists() and icon_file.is_file()):
        return ""
    try:
        rel = icon_file.resolve().relative_to(FAVICON_DIR.resolve())
    except Exception:
        # File lives outside the favicon dir (or resolve failed): fall back to its bare name.
        rel = Path(icon_file.name)
    # chr(92) is a backslash: always serve forward slashes regardless of OS separator.
    return f"{PUBLIC_FAVICON_BASE}/{urllib.parse.quote(str(rel).replace(chr(92), '/'))}"
def _fetch(url: str, limit: int = 262144) -> tuple[bytes, str, str]:
    """Download up to ``limit`` bytes from ``url``.

    Returns ``(data, content_type, final_url)`` where ``content_type`` is the
    lowercased media type without parameters and ``final_url`` is the URL after
    any redirects.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "pyTorrent/1.0 favicon-cache"})
    with urllib.request.urlopen(request, timeout=5) as response:
        # Read one byte past the cap, then clamp, so oversized bodies are truncated.
        body = response.read(limit + 1)[:limit]
        content_type = str(response.headers.get("Content-Type") or "").split(";", 1)[0].strip().lower()
        final_url = str(response.geturl() or url)
    return body, content_type, final_url
def _is_icon(data: bytes, content_type: str, url: str) -> bool:
"""Validate that downloaded bytes are a browser-readable image, not only an image-like HTTP header."""
# Note: Some trackers serve a broken /favicon.ico with image/vnd.microsoft.icon; pyTorrent now validates bytes before caching it.
if not data or len(data) < 16:
return False
head = data[:32]
lower = data[:512].lstrip().lower()
if head.startswith(b"\x00\x00\x01\x00") or head.startswith(b"\x00\x00\x02\x00"):
try:
count = int.from_bytes(data[4:6], "little")
except Exception:
count = 0
return 0 < count <= 256 and len(data) >= 6 + (16 * count)
if head.startswith(b"\x89PNG\r\n\x1a\n"):
return True
if head.startswith(b"\xff\xd8\xff"):
return True
if head.startswith((b"GIF87a", b"GIF89a")):
return True
if head.startswith(b"RIFF") and data[8:12] == b"WEBP":
return True
if lower.startswith(b"<svg") or b"<svg" in lower[:256]:
return True
ctype = content_type.lower()
if ctype in {"image/svg+xml"}:
return b"<svg" in lower[:512]
return False
def _favicon_candidates(domain: str) -> list[str]:
    """Default /favicon.ico URLs for the host and its registrable root, https before http."""
    host = tracker_domain(domain)
    root = _root_domain(host)
    # dict keys preserve insertion order while deduplicating host == root.
    ordered: dict[str, None] = {}
    for name in (host, root):
        if not name:
            continue
        ordered[f"https://{name}/favicon.ico"] = None
        ordered[f"http://{name}/favicon.ico"] = None
    return list(ordered)
def _html_icon_candidates(domain: str) -> list[str]:
    """Fetch the tracker landing page(s) and extract <link rel="icon"> hrefs as absolute URLs."""
    host = tracker_domain(domain)
    root = _root_domain(host)
    found: list[str] = []
    for name in (host, root):
        if not name:
            continue
        for scheme in ("https", "http"):
            try:
                body, ctype, final_url = _fetch(f"{scheme}://{name}/", limit=524288)
            except Exception:
                continue
            # Skip responses that do not look like HTML at all.
            looks_like_html = (
                "html" in ctype
                or b"<html" in body[:2048].lower()
                or b"<link" in body.lower()
            )
            if not looks_like_html:
                continue
            parser = _IconParser()
            try:
                parser.feed(body.decode("utf-8", errors="ignore"))
            except Exception:
                continue
            # Resolve relative hrefs against the post-redirect page URL.
            for href in parser.icons:
                found.append(urllib.parse.urljoin(final_url, href))
    return list(dict.fromkeys(found))
def _cached_favicon(domain: str):
    """Fetch the favicon cache row for a domain, or None when the domain normalizes to empty."""
    clean = tracker_domain(domain)
    if not clean:
        return None
    query = "SELECT * FROM tracker_favicon_cache WHERE domain=?"
    with connect() as conn:
        return conn.execute(query, (clean,)).fetchone()
def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tuple[Path | None, str | None]:
    """Return (local file path, mime type) for a tracker favicon, downloading and caching it when needed.

    Returns (None, None) when disabled, the domain is empty, a fresh negative-cache
    entry exists, or no candidate URL yields bytes that pass _is_icon validation.
    ``force=True`` skips the fresh-cache fast path and re-downloads.
    """
    clean = tracker_domain(domain)
    if not enabled or not clean:
        return None, None
    cached = _cached_favicon(clean)
    now = _now_epoch()
    if cached and not force and now - float(cached.get("updated_epoch") or 0) < FAVICON_CACHE_TTL_SECONDS:
        path = Path(str(cached.get("file_path") or ""))
        mime = str(cached.get("mime_type") or mimetypes.guess_type(path.name)[0] or "image/x-icon")
        if path.exists() and path.is_file():
            try:
                # Re-validate the cached bytes on every hit so a corrupted file triggers a re-download.
                if _is_icon(path.read_bytes()[:524288], mime, str(cached.get("source_url") or path.name)):
                    return path, mime
            except Exception:
                pass
        if cached.get("error"):
            # Fresh negative-cache entry: avoid re-hitting trackers with no usable favicon.
            return None, None
    # Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there.
    FAVICON_DIR.mkdir(parents=True, exist_ok=True)
    errors = []
    candidates = _html_icon_candidates(clean) + _favicon_candidates(clean)
    candidates = list(dict.fromkeys(candidates))
    checked_html = False
    idx = 0
    # Manual index loop because the candidate list may grow while iterating (see checked_html below).
    while idx < len(candidates):
        url = candidates[idx]
        idx += 1
        try:
            data, ctype, final_url = _fetch(url, limit=524288)
            if not _is_icon(data, ctype, final_url):
                continue
            # Derive the extension from the final URL, then the content type, then default to .ico.
            ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico"
            if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}:
                ext = ".ico"
            path = FAVICON_DIR / f"{_safe_filename(clean)}{ext}"
            path.write_bytes(data)
            mime = ctype if ctype.startswith("image/") else (mimetypes.guess_type(path.name)[0] or "image/x-icon")
            with connect() as conn:
                conn.execute(
                    """
                    INSERT INTO tracker_favicon_cache(domain, source_url, file_path, mime_type, updated_at, updated_epoch, error)
                    VALUES(?, ?, ?, ?, ?, ?, NULL)
                    ON CONFLICT(domain) DO UPDATE SET
                        source_url=excluded.source_url,
                        file_path=excluded.file_path,
                        mime_type=excluded.mime_type,
                        updated_at=excluded.updated_at,
                        updated_epoch=excluded.updated_epoch,
                        error=NULL
                    """,
                    (clean, final_url, str(path), mime, utcnow(), now),
                )
            return path, mime
        except Exception as exc:
            errors.append(f"{url}: {exc}")
        if idx >= len(candidates) and not checked_html:
            # Candidates exhausted: try HTML <link> discovery once more in case earlier fetches failed.
            checked_html = True
            candidates.extend([u for u in _html_icon_candidates(clean) if u not in candidates])
    # Every candidate failed: record a negative-cache entry carrying the last few errors.
    with connect() as conn:
        conn.execute(
            """
            INSERT INTO tracker_favicon_cache(domain, source_url, file_path, mime_type, updated_at, updated_epoch, error)
            VALUES(?, '', '', '', ?, ?, ?)
            ON CONFLICT(domain) DO UPDATE SET
                updated_at=excluded.updated_at,
                updated_epoch=excluded.updated_epoch,
                error=excluded.error
            """,
            (clean, utcnow(), now, "; ".join(errors[-3:]) or "favicon not found"),
        )
    return None, None