favicons

2026-05-08 20:23:38 +02:00
parent 2bc24610d4
commit 4eda325a64
1 changed file with 38 additions and 25 deletions
--- a/pytorrent/services/tracker_cache.py
+++ b/pytorrent/services/tracker_cache.py
@@ -277,8 +277,17 @@ def _is_icon(data: bytes, content_type: str, url: str) -> bool:



+def _attr_value(tag: str, name: str) -> str:
+    # Note: Return the stripped value of attribute `name` in `tag` (quoted or unquoted, "" when absent); the lookbehind rejects look-alikes such as data-href or xlink:href that a bare \b would match, so minified tracker pages still yield the real attribute.
+    match = re.search(rf"(?<![\w:.-]){re.escape(name)}\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
+    if match:
+        return match.group(2).strip()
+    match = re.search(rf"(?<![\w:.-]){re.escape(name)}\s*=\s*([^\s>]+)", tag, re.I | re.S)
+    return match.group(1).strip().strip("'\"") if match else ""
+
+
 def _extract_icon_hrefs(html: str) -> list[str]:
-    # Note: Regex fallback catches real-world tags like <link rel='shortcut icon' href='...'> even when HTMLParser skips malformed markup.
+    # Note: Accept <link> tags whose rel contains "icon" with rel/href in either order, including rel="shortcut icon" and relative or CDN-hosted hrefs.
    hrefs: list[str] = []
    parser = _IconParser()
    try:
@@ -286,45 +295,51 @@ def _extract_icon_hrefs(html: str) -> list[str]:
        hrefs.extend(parser.icons)
    except Exception:
        pass
-    for match in re.finditer(r"<link\b[^>]*>", html, re.I):
+    for match in re.finditer(r"<link\b[^>]*>", html, re.I | re.S):
        tag = match.group(0)
-        rel = re.search(r"\brel\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
-        href = re.search(r"\bhref\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
-        if rel and href and "icon" in rel.group(2).lower():
-            hrefs.append(href.group(2).strip())
+        rel = _attr_value(tag, "rel").lower()
+        href = _attr_value(tag, "href")
+        if href and "icon" in rel:
+            hrefs.append(href)
    clean = []
    seen = set()
    for href in hrefs:
+        href = str(href or "").strip()
        if href and href not in seen:
            seen.add(href)
            clean.append(href)
    return clean

-def _favicon_candidates(domain: str) -> list[str]:
+
+def _tracker_icon_hosts(domain: str) -> list[str]:
    host = tracker_domain(domain)
    root = _root_domain(host)
+    # Note: Only probe the exact tracker host and the registrable root domain, in that order, de-duplicated and with empty values dropped; CDN/static hosts are used only when HTML explicitly points to them.
+    return [h for h in dict.fromkeys([host, root]) if h]
+
+
+def _favicon_candidates(domain: str) -> list[str]:
    candidates = []
-    # Note: Besides host/root favicon.ico, try common CDN/static hostnames after HTML lookup for trackers that publish icons there.
-    for h in [host, root, f"cdn.{root}" if root else "", f"static.{root}" if root else "", f"www.{root}" if root else ""]:
-        if h:
+    for h in _tracker_icon_hosts(domain):
        candidates.extend([f"https://{h}/favicon.ico", f"http://{h}/favicon.ico"])
    return list(dict.fromkeys(candidates))


-def _html_icon_candidates(domain: str) -> list[str]:
-    host = tracker_domain(domain)
-    root = _root_domain(host)
+def _html_icon_candidates(domain: str, errors: list[str] | None = None) -> list[str]:
    urls = []
-    for h in [host, root]:
-        if not h:
-            continue
+    for h in _tracker_icon_hosts(domain):
        for scheme in ("https", "http"):
            base = f"{scheme}://{h}/"
            try:
                data, ctype, final_url = _fetch(base, limit=524288)
-            except Exception:
+            except Exception as exc:
+                if errors is not None:
+                    errors.append(f"{base}: {exc}")
                continue
-            if "html" not in ctype and b"<html" not in data[:2048].lower() and b"<link" not in data.lower():
+            lower = data[:4096].lower()
+            if "html" not in ctype and b"<html" not in lower and b"<link" not in data.lower():
+                if errors is not None:
+                    errors.append(f"{base}: response is not html ({ctype or 'unknown content-type'})")
                continue
            html = data.decode("utf-8", errors="ignore")
            for href in _extract_icon_hrefs(html):
@@ -365,9 +380,8 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
    # Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there.
    FAVICON_DIR.mkdir(parents=True, exist_ok=True)
    errors = []
-    candidates = _html_icon_candidates(clean) + _favicon_candidates(clean)
+    candidates = _html_icon_candidates(clean, errors) + _favicon_candidates(clean)
    candidates = list(dict.fromkeys(candidates))
-    checked_html = False
    idx = 0
    while idx < len(candidates):
        url = candidates[idx]
@@ -375,6 +389,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
        try:
            data, ctype, final_url = _fetch(url, limit=524288)
            if not _is_icon(data, ctype, final_url):
+                errors.append(f"{url}: invalid icon ({ctype or 'unknown content-type'}, {len(data)} bytes)")
                continue
            ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico"
            if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}:
@@ -400,9 +415,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
            return path, mime
        except Exception as exc:
            errors.append(f"{url}: {exc}")
-        if idx >= len(candidates) and not checked_html:
-            checked_html = True
-            candidates.extend([u for u in _html_icon_candidates(clean) if u not in candidates])
+        # HTML is checked once before direct /favicon.ico probes; do not guess cdn/static/www hosts unless HTML points there.
    with connect() as conn:
        conn.execute(
            """
@@ -413,6 +426,6 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
              updated_epoch=excluded.updated_epoch,
              error=excluded.error
            """,
-            (clean, utcnow(), now, "; ".join(errors[-3:]) or "favicon not found"),
+            (clean, utcnow(), now, "; ".join(errors[-8:]) or "favicon not found"),
        )
    return None, None