This commit is contained in:
Mateusz Gruszczyński
2026-05-08 20:27:05 +02:00
parent 4eda325a64
commit 51090a0eec

View File

@@ -58,7 +58,9 @@ def _root_domain(domain: str) -> str:
parts = [p for p in str(domain or "").lower().strip(".").split(".") if p] parts = [p for p in str(domain or "").lower().strip(".").split(".") if p]
if len(parts) <= 2: if len(parts) <= 2:
return ".".join(parts) return ".".join(parts)
if len(parts[-1]) == 2 and len(parts[-2]) <= 3 and len(parts) >= 3: # Note: Tracker favicon discovery needs the real main site first; for t.pte.nu that is pte.nu, not t.pte.nu.
known_second_level_suffixes = {"co", "com", "net", "org", "gov", "edu", "ac"}
if len(parts[-1]) == 2 and parts[-2] in known_second_level_suffixes and len(parts) >= 3:
return ".".join(parts[-3:]) return ".".join(parts[-3:])
return ".".join(parts[-2:]) return ".".join(parts[-2:])
@@ -314,10 +316,17 @@ def _extract_icon_hrefs(html: str) -> list[str]:
def _tracker_icon_hosts(domain: str) -> list[str]:
    """Return the hosts to probe directly for a favicon, tracker host first.

    Args:
        domain: Tracker domain; passed through ``tracker_domain`` before use.

    Returns:
        The value returned by ``tracker_domain(domain)`` followed by its
        ``_root_domain``, with duplicates and empty values removed and the
        order preserved.
    """
    host = tracker_domain(domain)
    root = _root_domain(host)
    # Note: Direct favicon fallback checks the tracker host first, then the main domain.
    # dict.fromkeys() de-duplicates while keeping insertion order, so when the
    # tracker host already is the root domain it is listed only once.
    return [h for h in dict.fromkeys([host, root]) if h]
def _tracker_html_hosts(domain: str) -> list[str]:
    """Return the hosts whose HTML should be scanned for icon links, root first.

    Args:
        domain: Tracker domain; passed through ``tracker_domain`` before use.

    Returns:
        The ``_root_domain`` of the tracker host followed by the tracker host
        itself, with duplicates and empty values removed and the order
        preserved.
    """
    host = tracker_domain(domain)
    root = _root_domain(host)
    # Note: HTML discovery checks the main site first, because tracker announce hosts often return text/plain.
    # dict.fromkeys() de-duplicates while keeping insertion order, so when the
    # root domain equals the tracker host it is listed only once.
    return [h for h in dict.fromkeys([root, host]) if h]
def _favicon_candidates(domain: str) -> list[str]: def _favicon_candidates(domain: str) -> list[str]:
candidates = [] candidates = []
for h in _tracker_icon_hosts(domain): for h in _tracker_icon_hosts(domain):
@@ -327,7 +336,7 @@ def _favicon_candidates(domain: str) -> list[str]:
def _html_icon_candidates(domain: str, errors: list[str] | None = None) -> list[str]: def _html_icon_candidates(domain: str, errors: list[str] | None = None) -> list[str]:
urls = [] urls = []
for h in _tracker_icon_hosts(domain): for h in _tracker_html_hosts(domain):
for scheme in ("https", "http"): for scheme in ("https", "http"):
base = f"{scheme}://{h}/" base = f"{scheme}://{h}/"
try: try:
@@ -377,7 +386,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
pass pass
if cached.get("error"): if cached.get("error"):
return None, None return None, None
# Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there. # Note: Favicon lookup checks the main-domain HTML first, then tracker HTML, then direct /favicon.ico fallbacks.
FAVICON_DIR.mkdir(parents=True, exist_ok=True) FAVICON_DIR.mkdir(parents=True, exist_ok=True)
errors = [] errors = []
candidates = _html_icon_candidates(clean, errors) + _favicon_candidates(clean) candidates = _html_icon_candidates(clean, errors) + _favicon_candidates(clean)