diff --git a/pytorrent/services/tracker_cache.py b/pytorrent/services/tracker_cache.py index 1ee9328..2c61380 100644 --- a/pytorrent/services/tracker_cache.py +++ b/pytorrent/services/tracker_cache.py @@ -277,8 +277,17 @@ def _is_icon(data: bytes, content_type: str, url: str) -> bool: +def _attr_value(tag: str, name: str) -> str: + # Note: Accept quoted and unquoted HTML attributes so favicon discovery works with compact/minified tracker pages. + match = re.search(rf"\b{name}\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S) + if match: + return match.group(2).strip() + match = re.search(rf"\b{name}\s*=\s*([^\s>]+)", tag, re.I | re.S) + return match.group(1).strip().strip("'\"") if match else "" + + def _extract_icon_hrefs(html: str) -> list[str]: - # Note: Regex fallback catches real-world tags like even when HTMLParser skips malformed markup. + # Note: Read any order, including shortcut icon and relative CDN paths. hrefs: list[str] = [] parser = _IconParser() try: @@ -286,45 +295,51 @@ def _extract_icon_hrefs(html: str) -> list[str]: hrefs.extend(parser.icons) except Exception: pass - for match in re.finditer(r"]*>", html, re.I): + for match in re.finditer(r"]*>", html, re.I | re.S): tag = match.group(0) - rel = re.search(r"\brel\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S) - href = re.search(r"\bhref\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S) - if rel and href and "icon" in rel.group(2).lower(): - hrefs.append(href.group(2).strip()) + rel = _attr_value(tag, "rel").lower() + href = _attr_value(tag, "href") + if href and "icon" in rel: + hrefs.append(href) clean = [] seen = set() for href in hrefs: + href = str(href or "").strip() if href and href not in seen: seen.add(href) clean.append(href) return clean -def _favicon_candidates(domain: str) -> list[str]: + +def _tracker_icon_hosts(domain: str) -> list[str]: host = tracker_domain(domain) root = _root_domain(host) + # Note: Only probe the exact tracker host and the registrable root domain; CDN/static hosts are used only when 
def _tracker_icon_hosts(domain: str) -> list[str]:
    """Return the hosts probed for a tracker icon, de-duplicated, in order.

    The list holds the value of ``tracker_domain(domain)`` followed by the
    value of ``_root_domain(...)`` for that host; empty values are dropped.
    """
    exact_host = tracker_domain(domain)
    # Note: Only probe the exact tracker host and the registrable root domain; CDN/static hosts are used only when HTML explicitly points to them.
    ordered: list[str] = []
    for name in (exact_host, _root_domain(exact_host)):
        if name and name not in ordered:
            ordered.append(name)
    return ordered


def _favicon_candidates(domain: str) -> list[str]:
    """Return direct ``/favicon.ico`` URLs for each icon host.

    For every host the https URL comes before the http URL; duplicates are
    removed while keeping first-seen order.
    """
    # A dict preserves insertion order, so it doubles as an ordered set.
    ordered_urls: dict[str, None] = {}
    for h in _tracker_icon_hosts(domain):
        ordered_urls[f"https://{h}/favicon.ico"] = None
        ordered_urls[f"http://{h}/favicon.ico"] = None
    return list(ordered_urls)
FAVICON_DIR.mkdir(parents=True, exist_ok=True) errors = [] - candidates = _html_icon_candidates(clean) + _favicon_candidates(clean) + candidates = _html_icon_candidates(clean, errors) + _favicon_candidates(clean) candidates = list(dict.fromkeys(candidates)) - checked_html = False idx = 0 while idx < len(candidates): url = candidates[idx] @@ -375,6 +389,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl try: data, ctype, final_url = _fetch(url, limit=524288) if not _is_icon(data, ctype, final_url): + errors.append(f"{url}: invalid icon ({ctype or 'unknown content-type'}, {len(data)} bytes)") continue ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico" if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}: @@ -400,9 +415,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl return path, mime except Exception as exc: errors.append(f"{url}: {exc}") - if idx >= len(candidates) and not checked_html: - checked_html = True - candidates.extend([u for u in _html_icon_candidates(clean) if u not in candidates]) + # HTML is checked once before direct /favicon.ico probes; do not guess cdn/static/www hosts unless HTML points there. with connect() as conn: conn.execute( """ @@ -413,6 +426,6 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl updated_epoch=excluded.updated_epoch, error=excluded.error """, - (clean, utcnow(), now, "; ".join(errors[-3:]) or "favicon not found"), + (clean, utcnow(), now, "; ".join(errors[-8:]) or "favicon not found"), ) return None, None