# pyTorrent/pytorrent/services/tracker_cache.py
from __future__ import annotations

import json
import mimetypes
import re
import time
import threading
import ssl
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from pathlib import Path

from ..config import BASE_DIR
from ..db import connect, utcnow

TRACKER_CACHE_TTL_SECONDS = 7 * 24 * 60 * 60
FAVICON_CACHE_TTL_SECONDS = 7 * 24 * 60 * 60
TRACKER_SCAN_LIMIT = 80
FAVICON_DIR = BASE_DIR / "data" / "tracker_favicons"
PUBLIC_FAVICON_BASE = "/static/tracker_favicons"

_TRACKER_SCAN_LOCKS: dict[int, threading.Lock] = {}
_TRACKER_SCAN_LOCKS_GUARD = threading.Lock()


class _IconParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.icons: list[str] = []

    def handle_starttag(self, tag: str, attrs):
        if tag.lower() != "link":
            return
        data = {str(k).lower(): str(v or "") for k, v in attrs}
        rel = re.sub(r"\s+", " ", data.get("rel", "").lower()).strip()
        href = data.get("href", "").strip()
        if href and "icon" in rel:
            self.icons.append(href)


def _now_epoch() -> float:
    return time.time()


def tracker_domain(url: str) -> str:
    raw = str(url or "").strip()
    if not raw:
        return ""
    parsed = urllib.parse.urlparse(raw if "://" in raw else f"http://{raw}")
    host = (parsed.hostname or "").lower().strip(".")
    if host.startswith("www."):
        host = host[4:]
    return host
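
# Illustrative behaviour (hypothetical inputs, derived from the code above):
#   tracker_domain("udp://tracker.example.org:6969/announce")  -> "tracker.example.org"
#   tracker_domain("www.Example.COM")                          -> "example.com"
#   tracker_domain("")                                         -> ""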


def _root_domain(domain: str) -> str:
    parts = [p for p in str(domain or "").lower().strip(".").split(".") if p]
    if len(parts) <= 2:
        return ".".join(parts)
    if len(parts[-1]) == 2 and len(parts[-2]) <= 3 and len(parts) >= 3:
        return ".".join(parts[-3:])
    return ".".join(parts[-2:])


def _safe_filename(domain: str) -> str:
    return re.sub(r"[^a-z0-9_.-]+", "_", domain.lower()).strip("._") or "tracker"


def _read_cached(profile_id: int, hashes: list[str], ttl: int) -> tuple[dict[str, list[dict]], set[str]]:
    if not hashes:
        return {}, set()
    now = _now_epoch()
    cached: dict[str, list[dict]] = {}
    fresh: set[str] = set()
    with connect() as conn:
        for start in range(0, len(hashes), 900):
            chunk = hashes[start:start + 900]
            placeholders = ",".join("?" for _ in chunk)
            rows = conn.execute(
                f"SELECT torrent_hash, trackers_json, updated_epoch FROM tracker_summary_cache WHERE profile_id=? AND torrent_hash IN ({placeholders})",
                (profile_id, *chunk),
            ).fetchall()
            for row in rows:
                h = str(row.get("torrent_hash") or "")
                try:
                    items = json.loads(row.get("trackers_json") or "[]")
                except Exception:
                    items = []
                cached[h] = items if isinstance(items, list) else []
                if now - float(row.get("updated_epoch") or 0) < ttl:
                    fresh.add(h)
    return cached, fresh
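
# Why the 900-hash chunks: SQLite historically caps bound parameters at 999
# per statement (SQLITE_MAX_VARIABLE_NUMBER), so each IN (...) list is kept
# safely below that limit, with one extra slot used by profile_id.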


def _store(profile_id: int, torrent_hash: str, trackers: list[dict]) -> None:
    now = utcnow()
    epoch = _now_epoch()
    compact = []
    seen = set()
    for item in trackers:
        domain = tracker_domain(str(item.get("url") or item.get("domain") or "")) or str(item.get("domain") or "")
        if not domain or domain in seen:
            continue
        seen.add(domain)
        compact.append({"domain": domain, "url": str(item.get("url") or "")})
    with connect() as conn:
        conn.execute(
            """
            INSERT INTO tracker_summary_cache(profile_id, torrent_hash, trackers_json, updated_at, updated_epoch)
            VALUES(?, ?, ?, ?, ?)
            ON CONFLICT(profile_id, torrent_hash) DO UPDATE SET
                trackers_json=excluded.trackers_json,
                updated_at=excluded.updated_at,
                updated_epoch=excluded.updated_epoch
            """,
            (profile_id, torrent_hash, json.dumps(compact), now, epoch),
        )
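
# Shape of the stored trackers_json payload (illustrative example):
#   [{"domain": "tracker.example.org", "url": "https://tracker.example.org/announce"}]
# One entry per unique domain per torrent, so multiple announce URLs on the
# same host collapse into a single cached entry.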


def summary(profile: dict, hashes: list[str], loader, scan_limit: int = TRACKER_SCAN_LIMIT, include_favicons: bool = False) -> dict:
    """Build tracker sidebar data from disk cache and refresh a small batch per request."""
    # Note: Tracker data is cached per torrent hash, so huge rTorrent libraries are never scanned in one UI request.
    profile_id = int(profile.get("id") or 0)
    clean_hashes = [str(h or "").strip() for h in hashes if str(h or "").strip()]
    cached, fresh = _read_cached(profile_id, clean_hashes, TRACKER_CACHE_TTL_SECONDS)
    missing = [h for h in clean_hashes if h not in fresh]
    errors: list[dict] = []
    scanned_now = 0
    for h in missing[:max(0, int(scan_limit or 0))]:
        try:
            trackers = loader(h)
            _store(profile_id, h, trackers)
            cached[h] = [{"domain": tracker_domain(t.get("url") or t.get("domain") or ""), "url": str(t.get("url") or "")} for t in trackers]
            fresh.add(h)
            scanned_now += 1
        except Exception as exc:
            errors.append({"hash": h, "error": str(exc)})
    by_hash: dict[str, list[dict]] = {}
    counts: dict[str, dict] = {}
    for h in clean_hashes:
        items = []
        seen = set()
        for item in cached.get(h, []):
            domain = tracker_domain(str(item.get("url") or item.get("domain") or "")) or str(item.get("domain") or "")
            if not domain or domain in seen:
                continue
            seen.add(domain)
            row = {"domain": domain, "url": str(item.get("url") or "")}
            items.append(row)
            bucket = counts.setdefault(domain, {"domain": domain, "url": row["url"], "count": 0})
            bucket["count"] += 1
            if not bucket.get("url") and row["url"]:
                bucket["url"] = row["url"]
        by_hash[h] = items
    trackers = sorted(counts.values(), key=lambda x: (-int(x.get("count") or 0), str(x.get("domain") or "")))
    if include_favicons:
        # Note: Summary returns only already cached static favicon URLs; network favicon discovery stays outside the hot tracker count path.
        for item in trackers:
            item["favicon_url"] = favicon_public_url(str(item.get("domain") or ""), enabled=True, create=False)
    pending = max(0, len([h for h in clean_hashes if h not in fresh]))
    return {
        "hashes": by_hash,
        "trackers": trackers,
        "errors": errors[:25],
        "scanned": len(clean_hashes),
        "scanned_now": scanned_now,
        "pending": pending,
        "cached": len(clean_hashes) - pending,
    }


def _scan_lock(profile_id: int) -> threading.Lock:
    with _TRACKER_SCAN_LOCKS_GUARD:
        if profile_id not in _TRACKER_SCAN_LOCKS:
            _TRACKER_SCAN_LOCKS[profile_id] = threading.Lock()
        return _TRACKER_SCAN_LOCKS[profile_id]


def warm_summary_cache(profile: dict, hashes: list[str], loader, batch_size: int = TRACKER_SCAN_LIMIT) -> bool:
    """Start a non-blocking tracker cache warmup for large libraries."""
    # Note: Tracker cache warming runs in one background thread per profile, so F5 returns cached data immediately instead of waiting for rTorrent scans.
    profile_id = int(profile.get("id") or 0)
    clean_hashes = [str(h or "").strip() for h in hashes if str(h or "").strip()]
    if not profile_id or not clean_hashes:
        return False
    lock = _scan_lock(profile_id)
    if lock.locked():
        return False

    def _worker():
        if not lock.acquire(blocking=False):
            return
        try:
            while True:
                result = summary(profile, clean_hashes, loader, scan_limit=max(1, int(batch_size or TRACKER_SCAN_LIMIT)), include_favicons=False)
                if int(result.get("pending") or 0) <= 0 or int(result.get("scanned_now") or 0) <= 0:
                    break
                time.sleep(0.05)
        finally:
            lock.release()

    threading.Thread(target=_worker, name=f"tracker-cache-warm-{profile_id}", daemon=True).start()
    return True
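
# The worker's double exit condition matters: "pending <= 0" means every hash
# is fresh, while "scanned_now <= 0" means the last batch made no progress
# (all remaining hashes keep erroring), so the thread cannot spin forever.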


def favicon_public_url(domain: str, enabled: bool = True, create: bool = False, force: bool = False) -> str:
    """Return the static URL for a cached tracker favicon, optionally creating or refreshing it first."""
    # Note: Favicon files stay in data/tracker_favicons, but the browser loads them via the static/tracker_favicons symlink.
    clean = tracker_domain(domain)
    if not enabled or not clean:
        return ""
    if create:
        favicon_path(clean, enabled=True, force=force)
    cached = _cached_favicon(clean)
    now = _now_epoch()
    if not cached or now - float(cached.get("updated_epoch") or 0) >= FAVICON_CACHE_TTL_SECONDS:
        return ""
    path = Path(str(cached.get("file_path") or ""))
    if not path.exists() or not path.is_file():
        return ""
    try:
        rel = path.resolve().relative_to(FAVICON_DIR.resolve())
    except Exception:
        rel = Path(path.name)
    return f"{PUBLIC_FAVICON_BASE}/{urllib.parse.quote(str(rel).replace(chr(92), '/'))}"


def _fetch(url: str, limit: int = 262144) -> tuple[bytes, str, str]:
    # Note: Favicon discovery uses browser-like headers and a certificate fallback, because tracker login pages/CDNs often reject minimal Python requests.
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; pyTorrent favicon fetcher)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,image/*,*/*;q=0.8",
            "Connection": "close",
        },
    )

    def _read(context=None):
        with urllib.request.urlopen(req, timeout=8, context=context) as resp:
            data = resp.read(limit + 1)
            if len(data) > limit:
                data = data[:limit]
            content_type = str(resp.headers.get("Content-Type") or "").split(";", 1)[0].strip().lower()
            final_url = str(resp.geturl() or url)
            return data, content_type, final_url

    try:
        return _read()
    except urllib.error.URLError as exc:
        reason = getattr(exc, "reason", None)
        if isinstance(reason, ssl.SSLError) or "CERTIFICATE_VERIFY_FAILED" in str(exc):
            return _read(ssl._create_unverified_context())
        raise
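
# Reading limit + 1 bytes and then truncating caps oversized bodies without
# buffering the whole response; responses larger than the limit are silently
# clipped rather than rejected.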


def _is_icon(data: bytes, content_type: str, url: str) -> bool:
    """Validate that downloaded bytes are a browser-readable image, not only an image-like HTTP header."""
    # Note: Some trackers serve a broken /favicon.ico with image/vnd.microsoft.icon; pyTorrent now validates bytes before caching it.
    if not data or len(data) < 16:
        return False
    head = data[:32]
    lower = data[:512].lstrip().lower()
    if head.startswith(b"\x00\x00\x01\x00") or head.startswith(b"\x00\x00\x02\x00"):
        try:
            count = int.from_bytes(data[4:6], "little")
        except Exception:
            count = 0
        return 0 < count <= 256 and len(data) >= 6 + (16 * count)
    if head.startswith(b"\x89PNG\r\n\x1a\n"):
        return True
    if head.startswith(b"\xff\xd8\xff"):
        return True
    if head.startswith((b"GIF87a", b"GIF89a")):
        return True
    if head.startswith(b"RIFF") and data[8:12] == b"WEBP":
        return True
    if lower.startswith(b"<svg") or b"<svg" in lower[:256]:
        return True
    ctype = content_type.lower()
    if ctype in {"image/svg+xml"}:
        return b"<svg" in lower[:512]
    return False
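
# Illustrative checks (hypothetical byte strings):
#   _is_icon(b"\x89PNG\r\n\x1a\n" + b"\x00" * 24, "image/png", "x")        -> True
#   _is_icon(b"<html><body>Not Found</body></html>", "image/x-icon", "x")  -> False
# The second case is the broken-tracker scenario from the note above: an
# HTML error page served with an image content type is rejected by its bytes.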


def _extract_icon_hrefs(html: str) -> list[str]:
    # Note: Regex fallback catches real-world tags like <link rel='shortcut icon' href='...'> even when HTMLParser skips malformed markup.
    hrefs: list[str] = []
    parser = _IconParser()
    try:
        parser.feed(html)
        hrefs.extend(parser.icons)
    except Exception:
        pass
    for match in re.finditer(r"<link\b[^>]*>", html, re.I):
        tag = match.group(0)
        rel = re.search(r"\brel\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
        href = re.search(r"\bhref\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
        if rel and href and "icon" in rel.group(2).lower():
            hrefs.append(href.group(2).strip())
    clean = []
    seen = set()
    for href in hrefs:
        if href and href not in seen:
            seen.add(href)
            clean.append(href)
    return clean
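
# Illustrative call (hypothetical markup): both the parser and the regex
# match this tag, and the dedup pass keeps a single entry:
#   _extract_icon_hrefs('<link rel="shortcut icon" href="/images/fav.png">')
#   -> ["/images/fav.png"]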


def _favicon_candidates(domain: str) -> list[str]:
    host = tracker_domain(domain)
    root = _root_domain(host)
    candidates = []
    # Note: Besides host/root favicon.ico, try common CDN/static hostnames after HTML lookup for trackers that publish icons there.
    for h in [host, root, f"cdn.{root}" if root else "", f"static.{root}" if root else "", f"www.{root}" if root else ""]:
        if h:
            candidates.extend([f"https://{h}/favicon.ico", f"http://{h}/favicon.ico"])
    return list(dict.fromkeys(candidates))
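
# Illustrative output for _favicon_candidates("tracker.example.org"):
# https/http favicon.ico URLs for tracker.example.org, example.org,
# cdn.example.org, static.example.org and www.example.org, deduplicated
# while preserving order (dict.fromkeys keeps the first occurrence).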


def _html_icon_candidates(domain: str) -> list[str]:
    host = tracker_domain(domain)
    root = _root_domain(host)
    urls = []
    for h in [host, root]:
        if not h:
            continue
        for scheme in ("https", "http"):
            base = f"{scheme}://{h}/"
            try:
                data, ctype, final_url = _fetch(base, limit=524288)
            except Exception:
                continue
            if "html" not in ctype and b"<html" not in data[:2048].lower() and b"<link" not in data.lower():
                continue
            html = data.decode("utf-8", errors="ignore")
            for href in _extract_icon_hrefs(html):
                urls.append(urllib.parse.urljoin(final_url, href))
    return list(dict.fromkeys(urls))
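
# Pages are fetched with the larger 512 KiB limit and accepted when either
# the content type or the body itself looks like HTML, since some trackers
# serve their landing/login page with an unhelpful content type. Relative
# hrefs are resolved against the final (post-redirect) URL via urljoin.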


def _cached_favicon(domain: str):
    clean = tracker_domain(domain)
    if not clean:
        return None
    with connect() as conn:
        return conn.execute("SELECT * FROM tracker_favicon_cache WHERE domain=?", (clean,)).fetchone()


def favicon_cache_row(domain: str):
    """Expose the favicon cache row for diagnostics without duplicating SQL in routes or the CLI."""
    return _cached_favicon(domain)


def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tuple[Path | None, str | None]:
    clean = tracker_domain(domain)
    if not enabled or not clean:
        return None, None
    cached = _cached_favicon(clean)
    now = _now_epoch()
    if cached and not force and now - float(cached.get("updated_epoch") or 0) < FAVICON_CACHE_TTL_SECONDS:
        path = Path(str(cached.get("file_path") or ""))
        mime = str(cached.get("mime_type") or mimetypes.guess_type(path.name)[0] or "image/x-icon")
        if path.exists() and path.is_file():
            try:
                if _is_icon(path.read_bytes()[:524288], mime, str(cached.get("source_url") or path.name)):
                    return path, mime
            except Exception:
                pass
        if cached.get("error"):
            return None, None
    # Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there.
    FAVICON_DIR.mkdir(parents=True, exist_ok=True)
    errors = []
    candidates = _html_icon_candidates(clean) + _favicon_candidates(clean)
    candidates = list(dict.fromkeys(candidates))
    checked_html = False
    idx = 0
    while idx < len(candidates):
        url = candidates[idx]
        idx += 1
        try:
            data, ctype, final_url = _fetch(url, limit=524288)
            if not _is_icon(data, ctype, final_url):
                continue
            ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico"
            if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}:
                ext = ".ico"
            path = FAVICON_DIR / f"{_safe_filename(clean)}{ext}"
            path.write_bytes(data)
            mime = ctype if ctype.startswith("image/") else (mimetypes.guess_type(path.name)[0] or "image/x-icon")
            with connect() as conn:
                conn.execute(
                    """
                    INSERT INTO tracker_favicon_cache(domain, source_url, file_path, mime_type, updated_at, updated_epoch, error)
                    VALUES(?, ?, ?, ?, ?, ?, NULL)
                    ON CONFLICT(domain) DO UPDATE SET
                        source_url=excluded.source_url,
                        file_path=excluded.file_path,
                        mime_type=excluded.mime_type,
                        updated_at=excluded.updated_at,
                        updated_epoch=excluded.updated_epoch,
                        error=NULL
                    """,
                    (clean, final_url, str(path), mime, utcnow(), now),
                )
            return path, mime
        except Exception as exc:
            errors.append(f"{url}: {exc}")
        if idx >= len(candidates) and not checked_html:
            checked_html = True
            candidates.extend([u for u in _html_icon_candidates(clean) if u not in candidates])
    with connect() as conn:
        conn.execute(
            """
            INSERT INTO tracker_favicon_cache(domain, source_url, file_path, mime_type, updated_at, updated_epoch, error)
            VALUES(?, '', '', '', ?, ?, ?)
            ON CONFLICT(domain) DO UPDATE SET
                updated_at=excluded.updated_at,
                updated_epoch=excluded.updated_epoch,
                error=excluded.error
            """,
            (clean, utcnow(), now, "; ".join(errors[-3:]) or "favicon not found"),
        )
    return None, None
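
# Hypothetical diagnostic sketch (the function names are real module members;
# the domain is illustrative, and dict-style cache rows are assumed, matching
# the .get() access used throughout this module): force a refresh, then
# inspect the cached row.
#
#   path, mime = favicon_path("tracker.example.org", force=True)
#   row = favicon_cache_row("tracker.example.org")
#   if row and row.get("error"):
#       print("favicon lookup failed:", row.get("error"))
#   elif path:
#       print("cached at", path, "as", mime)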