# pyTorrent/pytorrent/services/rss.py

from __future__ import annotations

import re
import time
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timezone, timedelta
from email.utils import parsedate_to_datetime
from typing import Iterable

from ..db import connect, utcnow, default_user_id
from . import rtorrent
from .workers import enqueue

# Upper bound, in bytes, on how much of a feed response fetch_feed() reads.
RSS_FETCH_LIMIT = 2_000_000


def _parse_dt(value: str | None) -> datetime | None:
    if not value:
        return None
    try:
        return parsedate_to_datetime(value).astimezone(timezone.utc)
    except Exception:
        return None


def _item_size(item: ET.Element) -> int:
    enc = item.find("enclosure")
    if enc is not None:
        try:
            return int(enc.get("length") or 0)
        except Exception:
            return 0
    for tag in ("size", "length"):
        try:
            return int(item.findtext(tag) or 0)
        except Exception:
            pass
    return 0


def _item_category(item: ET.Element) -> str:
    values = [x.text or "" for x in item.findall("category")]
    return " ".join(values).strip()


def parse_feed(raw: bytes) -> list[dict]:
    """Parse an RSS or Atom document into a list of normalized item dicts.

    RSS ``<item>`` elements are looked up anywhere in the tree. When none are
    found and the root tag ends in ``feed``, Atom ``<entry>`` children of the
    root are used instead. At most the first 200 items are returned.

    Each dict has the keys ``title``, ``link``, ``size``, ``category`` and
    ``published_at`` (ISO 8601 string with second precision, or ``None``).
    The link is taken from, in increasing priority: the ``<link>`` text, the
    ``href`` of an Atom ``<link>``, and the ``url`` of an ``<enclosure>``.

    Raises:
        xml.etree.ElementTree.ParseError: if ``raw`` is not well-formed XML.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    # NOTE(review): feeds are remote, untrusted input and the standard
    # library documents ElementTree as not hardened against maliciously
    # constructed XML. The caller caps the payload size; consider a hardened
    # parser if that is not sufficient.
    root = ET.fromstring(raw)
    items = root.findall(".//item")
    if not items and root.tag.lower().endswith("feed"):
        items = root.findall(f"{atom}entry")
    parsed: list[dict] = []
    for item in items[:200]:
        title = item.findtext("title") or item.findtext(f"{atom}title") or ""
        link = item.findtext("link") or ""
        atom_link = item.find(f"{atom}link")
        if atom_link is not None and atom_link.get("href"):
            link = atom_link.get("href") or link
        enc = item.find("enclosure")
        if enc is not None and enc.get("url"):
            # The enclosure is the actual payload, so it overrides page links.
            link = enc.get("url") or link
        pub_date = (
            item.findtext("pubDate")
            or item.findtext("updated")
            or item.findtext(f"{atom}updated")
        )
        # Parse once and reuse the result instead of parsing the date twice.
        published = _parse_dt(pub_date)
        parsed.append({
            "title": title.strip(),
            "link": str(link or "").strip(),
            "size": _item_size(item),
            "category": _item_category(item),
            "published_at": published.isoformat(timespec="seconds") if published else None,
        })
    return parsed


def fetch_feed(url: str) -> list[dict]:
    """Download a feed over HTTP(S) and return its parsed items.

    Only ``http`` and ``https`` URLs are fetched. ``urlopen`` would otherwise
    also accept schemes such as ``file:``, which would let a feed URL read
    local files. At most ``RSS_FETCH_LIMIT`` bytes of the response are read,
    with a 12 second timeout.

    Raises:
        ValueError: if the URL scheme is not ``http`` or ``https``.
        OSError: (including ``urllib.error.URLError``) on network failures.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    from urllib.parse import urlsplit

    scheme = urlsplit(str(url or "").strip()).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"unsupported feed URL scheme: {scheme or 'none'}")
    req = urllib.request.Request(url, headers={"User-Agent": "pyTorrent RSS"})
    with urllib.request.urlopen(req, timeout=12) as res:
        raw = res.read(RSS_FETCH_LIMIT)
    return parse_feed(raw)


def _season_episode(title: str) -> tuple[int | None, int | None]:
    match = re.search(r"S(\d{1,2})E(\d{1,3})", title or "", re.I)
    if match:
        return int(match.group(1)), int(match.group(2))
    match = re.search(r"\b(\d{1,2})x(\d{1,3})\b", title or "", re.I)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None


def matches_rule(rule: dict, item: dict) -> tuple[bool, str]:
    """Decide whether a parsed feed item satisfies a download rule.

    Checks run in a fixed order and the first failure wins: include regex,
    exclude regex, minimum size, maximum size, category substring, quality
    substring, season, episode. Regexes are applied case-insensitively to
    the title and category joined by a space. Size limits are skipped when
    the item reports no size.

    Returns:
        ``(True, "matched")`` on success, otherwise ``(False, reason)``.
    """
    title = str(item.get("title") or "")
    item_category = str(item.get("category") or "")
    haystack = " ".join([title, item_category])

    include_re = str(rule.get("pattern") or ".*")
    exclude_re = str(rule.get("exclude_pattern") or "").strip()
    try:
        if include_re and re.search(include_re, haystack, re.I) is None:
            return False, "include pattern did not match"
        if exclude_re and re.search(exclude_re, haystack, re.I) is not None:
            return False, "exclude pattern matched"
    except re.error as exc:
        return False, f"invalid regex: {exc}"

    # Sizes are compared in mebibytes; an unknown size yields 0.
    raw_size = item.get("size")
    size_mb = int(raw_size) / 1024 / 1024 if raw_size else 0
    min_size = int(rule.get("min_size_mb") or 0)
    max_size = int(rule.get("max_size_mb") or 0)
    if size_mb:
        if min_size and size_mb < min_size:
            return False, "item is below minimum size"
        if max_size and size_mb > max_size:
            return False, "item is above maximum size"

    lowered_title = title.lower()
    wanted_category = str(rule.get("category") or "").strip().lower()
    if wanted_category:
        in_category = wanted_category in item_category.lower()
        in_title = wanted_category in lowered_title
        if not (in_category or in_title):
            return False, "category did not match"

    wanted_quality = str(rule.get("quality") or "").strip().lower()
    if wanted_quality and wanted_quality not in lowered_title:
        return False, "quality did not match"

    wanted_season = rule.get("season")
    wanted_episode = rule.get("episode")
    found_season, found_episode = _season_episode(title)
    numbered_checks = (
        (wanted_season, found_season, "season did not match"),
        (wanted_episode, found_episode, "episode did not match"),
    )
    for wanted, found, failure in numbered_checks:
        # None, "" and 0 all mean "no constraint configured".
        if wanted in (None, "", 0):
            continue
        if int(wanted) != int(found or -1):
            return False, failure
    return True, "matched"


def _log(user_id: int, profile_id: int, feed_id: int | None, rule_id: int | None, item: dict, status: str, message: str) -> None:
    """Record one RSS processing outcome in ``rss_history`` (best effort).

    The item's ``title`` and ``link`` are stored together with the given
    ``status`` and ``message`` and the current timestamp. A failed insert
    never propagates to the caller; it is reported at debug level so that
    real database problems are not completely invisible.
    """
    import logging

    with connect() as conn:
        try:
            conn.execute(
                "INSERT INTO rss_history(user_id,profile_id,feed_id,rule_id,title,link,status,message,created_at) VALUES(?,?,?,?,?,?,?,?,?)",
                (user_id, profile_id, feed_id, rule_id, item.get("title"), item.get("link"), status, message, utcnow()),
            )
        except Exception:
            # Note: Duplicate successful RSS matches are ignored to prevent recurring duplicate downloads.
            logging.getLogger(__name__).debug(
                "rss_history insert skipped (status=%s, title=%r)",
                status,
                item.get("title"),
                exc_info=True,
            )


def check(profile: dict, user_id: int | None = None, only_due: bool = False) -> dict:
    """Poll a profile's enabled RSS feeds and queue downloads for rule matches.

    Every item of every selected feed is tested against every enabled rule
    of the profile. A matching item with a link is handed to the worker
    queue as an ``add_magnet`` job and recorded in ``rss_history``. A link
    that already has a ``queued`` history row for this user and profile (or
    that was queued earlier in the same run) is not queued again, so an item
    that stays in the feed is not re-downloaded on every poll.

    After each feed its ``last_checked_at`` and ``next_check_at`` columns are
    updated. The interval comes from ``interval_minutes`` (default 30,
    minimum 5). A failure while processing a feed is stored in
    ``last_error`` and reported in the result instead of being raised.

    Args:
        profile: Profile mapping; ``profile["id"]`` selects feeds and rules.
        user_id: Owner of the feeds; falls back to ``default_user_id()``.
        only_due: When true, only feeds whose ``next_check_at`` is unset or
            in the past are polled.

    Returns:
        A dict with ``queued`` (jobs enqueued), ``tested`` (item/rule pairs
        evaluated), ``feeds_checked`` and ``errors`` (list of
        ``{"feed_id", "error"}`` dicts).
    """
    user_id = user_id or default_user_id()
    profile_id = int(profile["id"])
    now = utcnow()
    with connect() as conn:
        if only_due:
            feeds = conn.execute("SELECT * FROM rss_feeds WHERE user_id=? AND profile_id=? AND enabled=1 AND (next_check_at IS NULL OR next_check_at<=?)", (user_id, profile_id, now)).fetchall()
        else:
            feeds = conn.execute("SELECT * FROM rss_feeds WHERE user_id=? AND profile_id=? AND enabled=1", (user_id, profile_id)).fetchall()
        rules = conn.execute("SELECT * FROM rss_rules WHERE user_id=? AND profile_id=? AND enabled=1", (user_id, profile_id)).fetchall()
        history_rows = conn.execute("SELECT link FROM rss_history WHERE user_id=? AND profile_id=? AND status='queued'", (user_id, profile_id)).fetchall()
    # Links that must not be enqueued again. The history insert happens only
    # after enqueue(), so it cannot prevent a repeat download by itself.
    already_queued = {row["link"] for row in history_rows if row["link"]}
    queued = 0
    tested = 0
    errors: list[dict] = []
    for feed in feeds:
        interval = max(5, int(feed.get("interval_minutes") or 30))
        next_check = (datetime.now(timezone.utc) + timedelta(minutes=interval)).isoformat(timespec="seconds")
        try:
            items = fetch_feed(feed["url"])
            for item in items:
                link = item.get("link") or ""
                for rule in rules:
                    matched, reason = matches_rule(rule, item)
                    tested += 1
                    if not matched:
                        continue
                    if not link:
                        _log(user_id, profile_id, feed["id"], rule["id"], item, "skipped", "missing link")
                        continue
                    if link in already_queued:
                        # Already handed to the worker queue (now or earlier).
                        continue
                    payload = {
                        "uri": link,
                        "start": bool(rule["start"]),
                        "directory": rule.get("save_path") or rtorrent.default_download_path(profile),
                        "label": rule.get("label") or "",
                        "source": "rss",
                    }
                    enqueue("add_magnet", profile_id, payload, user_id=user_id)
                    already_queued.add(link)
                    queued += 1
                    _log(user_id, profile_id, feed["id"], rule["id"], item, "queued", reason)
            with connect() as conn:
                conn.execute("UPDATE rss_feeds SET last_error=NULL,last_checked_at=?,next_check_at=?,updated_at=? WHERE id=?", (now, next_check, now, feed["id"]))
        except Exception as exc:
            # Keep polling the remaining feeds; persist the failure on this one.
            errors.append({"feed_id": feed.get("id"), "error": str(exc)})
            with connect() as conn:
                conn.execute("UPDATE rss_feeds SET last_error=?,last_checked_at=?,next_check_at=?,updated_at=? WHERE id=?", (str(exc), now, next_check, now, feed["id"]))
    return {"queued": queued, "tested": tested, "feeds_checked": len(feeds), "errors": errors}


def test_rule(feed_url: str, rule: dict) -> dict:
    """Dry-run a rule against a live feed without queuing anything.

    The feed is fetched and its first 100 items are evaluated with
    ``matches_rule``. Each evaluated item is returned with an added
    ``reason`` key, sorted into ``matches`` or ``rejected`` (each capped at
    50 entries). ``total`` is the number of items the feed yielded.
    """
    items = fetch_feed(feed_url)
    accepted: list[dict] = []
    declined: list[dict] = []
    for entry in items[:100]:
        ok, why = matches_rule(rule, entry)
        annotated = {**entry, "reason": why}
        if ok:
            accepted.append(annotated)
        else:
            declined.append(annotated)
    return {
        "matches": accepted[:50],
        "rejected": declined[:50],
        "total": len(items),
    }


# Process-wide guard so the polling loop is started at most once.
_scheduler_started = False


def start_scheduler(socketio=None) -> None:
    """Start the background RSS polling loop (at most once per process).

    Every 60 seconds the loop looks up all user/profile pairs that own an
    enabled feed and runs ``check(..., only_due=True)`` for each of them.
    When a Socket.IO server is supplied and a run queued something, an
    ``rss_checked`` event is emitted to the ``profile:<id>`` room.

    Errors are logged and never terminate the loop. Each profile is handled
    independently, so one failing profile does not stop the others from
    being polled in the same tick.

    Args:
        socketio: Optional Socket.IO server. When given, the loop runs as one
            of its background tasks; otherwise a daemon thread is used.
    """
    global _scheduler_started
    if _scheduler_started:
        return
    _scheduler_started = True

    import logging

    log = logging.getLogger(__name__)
    # Inside a Socket.IO background task the server's own sleep() cooperates
    # with whichever async mode is active; fall back to time.sleep otherwise.
    sleep = getattr(socketio, "sleep", None) or time.sleep

    def loop() -> None:
        # Note: The lightweight RSS scheduler uses persisted next_check_at values, so restarts do not reset cadence.
        while True:
            profiles = []
            try:
                from .preferences import get_profile
                with connect() as conn:
                    profiles = conn.execute("SELECT DISTINCT user_id, profile_id FROM rss_feeds WHERE enabled=1 AND profile_id IS NOT NULL").fetchall()
            except Exception:
                log.exception("RSS scheduler could not load the profiles to poll")
            for row in profiles:
                try:
                    profile = get_profile(int(row["profile_id"]), int(row["user_id"]))
                    if not profile:
                        continue
                    result = check(profile, int(row["user_id"]), only_due=True)
                    if socketio and result.get("queued"):
                        socketio.emit("rss_checked", {"profile_id": profile["id"], **result}, to=f"profile:{profile['id']}")
                except Exception:
                    log.exception("RSS scheduler failed for profile %s", row["profile_id"])
            sleep(60)

    if socketio:
        socketio.start_background_task(loop)
    else:
        import threading
        threading.Thread(target=loop, daemon=True, name="pytorrent-rss-scheduler").start()