favicons
This commit is contained in:
@@ -277,8 +277,17 @@ def _is_icon(data: bytes, content_type: str, url: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _attr_value(tag: str, name: str) -> str:
|
||||||
|
# Note: Accept quoted and unquoted HTML attributes so favicon discovery works with compact/minified tracker pages.
|
||||||
|
match = re.search(rf"\b{name}\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
|
||||||
|
if match:
|
||||||
|
return match.group(2).strip()
|
||||||
|
match = re.search(rf"\b{name}\s*=\s*([^\s>]+)", tag, re.I | re.S)
|
||||||
|
return match.group(1).strip().strip("'\"") if match else ""
|
||||||
|
|
||||||
|
|
||||||
def _extract_icon_hrefs(html: str) -> list[str]:
|
def _extract_icon_hrefs(html: str) -> list[str]:
|
||||||
# Note: Regex fallback catches real-world tags like <link rel='shortcut icon' href='...'> even when HTMLParser skips malformed markup.
|
# Note: Read any <link rel=...icon... href=...> order, including shortcut icon and relative CDN paths.
|
||||||
hrefs: list[str] = []
|
hrefs: list[str] = []
|
||||||
parser = _IconParser()
|
parser = _IconParser()
|
||||||
try:
|
try:
|
||||||
@@ -286,45 +295,51 @@ def _extract_icon_hrefs(html: str) -> list[str]:
|
|||||||
hrefs.extend(parser.icons)
|
hrefs.extend(parser.icons)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
for match in re.finditer(r"<link\b[^>]*>", html, re.I):
|
for match in re.finditer(r"<link\b[^>]*>", html, re.I | re.S):
|
||||||
tag = match.group(0)
|
tag = match.group(0)
|
||||||
rel = re.search(r"\brel\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
|
rel = _attr_value(tag, "rel").lower()
|
||||||
href = re.search(r"\bhref\s*=\s*(['\"])(.*?)\1", tag, re.I | re.S)
|
href = _attr_value(tag, "href")
|
||||||
if rel and href and "icon" in rel.group(2).lower():
|
if href and "icon" in rel:
|
||||||
hrefs.append(href.group(2).strip())
|
hrefs.append(href)
|
||||||
clean = []
|
clean = []
|
||||||
seen = set()
|
seen = set()
|
||||||
for href in hrefs:
|
for href in hrefs:
|
||||||
|
href = str(href or "").strip()
|
||||||
if href and href not in seen:
|
if href and href not in seen:
|
||||||
seen.add(href)
|
seen.add(href)
|
||||||
clean.append(href)
|
clean.append(href)
|
||||||
return clean
|
return clean
|
||||||
|
|
||||||
def _favicon_candidates(domain: str) -> list[str]:
|
|
||||||
|
def _tracker_icon_hosts(domain: str) -> list[str]:
|
||||||
host = tracker_domain(domain)
|
host = tracker_domain(domain)
|
||||||
root = _root_domain(host)
|
root = _root_domain(host)
|
||||||
|
# Note: Only probe the exact tracker host and the registrable root domain; CDN/static hosts are used only when HTML explicitly points to them.
|
||||||
|
return [h for h in dict.fromkeys([host, root]) if h]
|
||||||
|
|
||||||
|
|
||||||
|
def _favicon_candidates(domain: str) -> list[str]:
|
||||||
candidates = []
|
candidates = []
|
||||||
# Note: Besides host/root favicon.ico, try common CDN/static hostnames after HTML lookup for trackers that publish icons there.
|
for h in _tracker_icon_hosts(domain):
|
||||||
for h in [host, root, f"cdn.{root}" if root else "", f"static.{root}" if root else "", f"www.{root}" if root else ""]:
|
candidates.extend([f"https://{h}/favicon.ico", f"http://{h}/favicon.ico"])
|
||||||
if h:
|
|
||||||
candidates.extend([f"https://{h}/favicon.ico", f"http://{h}/favicon.ico"])
|
|
||||||
return list(dict.fromkeys(candidates))
|
return list(dict.fromkeys(candidates))
|
||||||
|
|
||||||
|
|
||||||
def _html_icon_candidates(domain: str) -> list[str]:
|
def _html_icon_candidates(domain: str, errors: list[str] | None = None) -> list[str]:
|
||||||
host = tracker_domain(domain)
|
|
||||||
root = _root_domain(host)
|
|
||||||
urls = []
|
urls = []
|
||||||
for h in [host, root]:
|
for h in _tracker_icon_hosts(domain):
|
||||||
if not h:
|
|
||||||
continue
|
|
||||||
for scheme in ("https", "http"):
|
for scheme in ("https", "http"):
|
||||||
base = f"{scheme}://{h}/"
|
base = f"{scheme}://{h}/"
|
||||||
try:
|
try:
|
||||||
data, ctype, final_url = _fetch(base, limit=524288)
|
data, ctype, final_url = _fetch(base, limit=524288)
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
|
if errors is not None:
|
||||||
|
errors.append(f"{base}: {exc}")
|
||||||
continue
|
continue
|
||||||
if "html" not in ctype and b"<html" not in data[:2048].lower() and b"<link" not in data.lower():
|
lower = data[:4096].lower()
|
||||||
|
if "html" not in ctype and b"<html" not in lower and b"<link" not in data.lower():
|
||||||
|
if errors is not None:
|
||||||
|
errors.append(f"{base}: response is not html ({ctype or 'unknown content-type'})")
|
||||||
continue
|
continue
|
||||||
html = data.decode("utf-8", errors="ignore")
|
html = data.decode("utf-8", errors="ignore")
|
||||||
for href in _extract_icon_hrefs(html):
|
for href in _extract_icon_hrefs(html):
|
||||||
@@ -365,9 +380,8 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
|
|||||||
# Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there.
|
# Note: Favicon lookup prefers HTML <link rel="icon"> over generic /favicon.ico, because some trackers serve a broken default icon there.
|
||||||
FAVICON_DIR.mkdir(parents=True, exist_ok=True)
|
FAVICON_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
errors = []
|
errors = []
|
||||||
candidates = _html_icon_candidates(clean) + _favicon_candidates(clean)
|
candidates = _html_icon_candidates(clean, errors) + _favicon_candidates(clean)
|
||||||
candidates = list(dict.fromkeys(candidates))
|
candidates = list(dict.fromkeys(candidates))
|
||||||
checked_html = False
|
|
||||||
idx = 0
|
idx = 0
|
||||||
while idx < len(candidates):
|
while idx < len(candidates):
|
||||||
url = candidates[idx]
|
url = candidates[idx]
|
||||||
@@ -375,6 +389,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
|
|||||||
try:
|
try:
|
||||||
data, ctype, final_url = _fetch(url, limit=524288)
|
data, ctype, final_url = _fetch(url, limit=524288)
|
||||||
if not _is_icon(data, ctype, final_url):
|
if not _is_icon(data, ctype, final_url):
|
||||||
|
errors.append(f"{url}: invalid icon ({ctype or 'unknown content-type'}, {len(data)} bytes)")
|
||||||
continue
|
continue
|
||||||
ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico"
|
ext = Path(urllib.parse.urlparse(final_url).path).suffix.lower() or mimetypes.guess_extension(ctype) or ".ico"
|
||||||
if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}:
|
if ext not in {".ico", ".png", ".jpg", ".jpeg", ".svg", ".webp"}:
|
||||||
@@ -400,9 +415,7 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
|
|||||||
return path, mime
|
return path, mime
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"{url}: {exc}")
|
errors.append(f"{url}: {exc}")
|
||||||
if idx >= len(candidates) and not checked_html:
|
# HTML is checked once before direct /favicon.ico probes; do not guess cdn/static/www hosts unless HTML points there.
|
||||||
checked_html = True
|
|
||||||
candidates.extend([u for u in _html_icon_candidates(clean) if u not in candidates])
|
|
||||||
with connect() as conn:
|
with connect() as conn:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""
|
"""
|
||||||
@@ -413,6 +426,6 @@ def favicon_path(domain: str, enabled: bool = True, force: bool = False) -> tupl
|
|||||||
updated_epoch=excluded.updated_epoch,
|
updated_epoch=excluded.updated_epoch,
|
||||||
error=excluded.error
|
error=excluded.error
|
||||||
""",
|
""",
|
||||||
(clean, utcnow(), now, "; ".join(errors[-3:]) or "favicon not found"),
|
(clean, utcnow(), now, "; ".join(errors[-8:]) or "favicon not found"),
|
||||||
)
|
)
|
||||||
return None, None
|
return None, None
|
||||||
|
|||||||
Reference in New Issue
Block a user