changes in logs

2026-06-03 09:07:58 +02:00
parent f29710b24f
commit 4628ea653d
5 changed files with 210 additions and 24 deletions
@@ -200,6 +200,8 @@ def enqueue(action_name: str, profile_id: int, payload: dict, user_id: int | Non
            "INSERT INTO jobs(id,user_id,profile_id,action,payload_json,status,attempts,max_attempts,progress_total,created_at,updated_at) VALUES(?,?,?,?,?,?,?,?,?,?,?)",
            (job_id, user_id, profile_id, action_name, json.dumps(payload), "pending", 0, max_attempts, progress_total, now, now),
        )
+    # Note: Queued jobs are now written to operation logs so work is visible before a worker starts it.
+    operation_logs.record_job_event(profile_id, action_name, "queued", payload, job_id=job_id, user_id=user_id)
    _emit("job_update", {"id": job_id, "action": action_name, "profile_id": profile_id, "status": "pending"})
    _submit_job(job_id, action_name)
    return job_id
@@ -315,6 +317,8 @@ def _run(job_id: str):
        profile = get_profile(int(job["profile_id"]), int(job["user_id"]))
        if not profile:
            _set_job(job_id, "failed", "rTorrent profile does not exist", finished=True)
+            # Note: Profile lookup failures used to appear only in the job queue; they are now persisted in operation logs too.
+            operation_logs.record_worker_event(int(job.get("profile_id") or 0), str(job.get("action") or ""), "failed", "Job failed: rTorrent profile does not exist", job_id=job_id, user_id=int(job.get("user_id") or 0), error="profile not found")
            _emit("job_update", {"id": job_id, "profile_id": job.get("profile_id"), "status": "failed", "error": "profile not found"})
            return
        profile_id = int(profile["id"])
@@ -362,6 +366,9 @@ def _run(job_id: str):
        _set_job(job_id, status, str(exc), finished=(status == "failed"))
        if status == "failed":
            operation_logs.record_job_event(int(job.get("profile_id") or 0), job.get("action"), "failed", payload, error=str(exc), job_id=job_id, user_id=int(job.get("user_id") or 0))
+        else:
+            # Note: Retried attempts are logged explicitly so transient failures are not lost between final states.
+            operation_logs.record_job_event(int(job.get("profile_id") or 0), job.get("action"), "retry", payload, error=str(exc), job_id=job_id, user_id=int(job.get("user_id") or 0))
        _emit("operation_failed", {"job_id": job_id, "action": job.get("action"), "profile_id": job.get("profile_id"), "hashes": payload.get("hashes") or [], "error": str(exc), **_job_event_meta(payload)})
        _emit("job_update", {"id": job_id, "profile_id": job.get("profile_id"), "status": status, "error": str(exc), "attempts": attempts})
        if status == "pending":
@@ -408,6 +415,8 @@ def _timeout_running_jobs() -> None:
            continue
        message = f"Watchdog timeout after {_job_timeout_seconds(profile, row)} seconds"
        _set_job(row["id"], "failed", message, finished=True)
+        # Note: Watchdog timeouts are stored in operation logs because no normal worker exception may be raised.
+        operation_logs.record_worker_event(int(row.get("profile_id") or 0), str(row.get("action") or ""), "timeout", message, job_id=row["id"], user_id=int(row.get("user_id") or 0), error=message)
        _emit("operation_failed", {"job_id": row["id"], "action": row.get("action"), "profile_id": row.get("profile_id"), "hashes": [], "error": message, "source": "watchdog"})
        _emit("job_update", {"id": row["id"], "profile_id": row.get("profile_id"), "status": "failed", "error": message})

@@ -435,6 +444,8 @@ def _resubmit_interrupted_running_jobs() -> None:
                ("Resuming interrupted job from last checkpoint", utcnow(), row["id"]),
            )
        if int(cur.rowcount or 0):
+            # Note: Interrupted jobs returned to the queue are logged so restart recovery is auditable.
+            operation_logs.record_worker_event(int(row.get("profile_id") or 0), str(row.get("action") or ""), "resubmitted", "Interrupted job resubmitted from checkpoint", job_id=row["id"], user_id=int(row.get("user_id") or 0))
            _emit("job_update", {"id": row["id"], "profile_id": row.get("profile_id"), "status": "pending", "resumed": True})
            _submit_job(row["id"], row.get("action"))

@@ -456,6 +467,8 @@ def _resubmit_stale_pending_jobs() -> None:
            continue
        with connect() as conn:
            conn.execute("UPDATE jobs SET error=?, updated_at=? WHERE id=? AND status='pending'", ("Watchdog resubmitted stale pending job", utcnow(), row["id"]))
+        # Note: Stale pending resubmits are logged to explain duplicated queue attempts after watchdog recovery.
+        operation_logs.record_worker_event(int(row.get("profile_id") or 0), str(row.get("action") or ""), "resubmitted", "Stale pending job resubmitted by watchdog", job_id=row["id"], user_id=int(row.get("user_id") or 0))
        _emit("job_update", {"id": row["id"], "profile_id": row.get("profile_id"), "status": "pending", "watchdog": True})
        _submit_job(row["id"], row.get("action"))

@@ -561,6 +574,8 @@ def cancel_job(job_id: str) -> bool:
        return False
    # Note: Emergency cancel is useful only for unfinished jobs; failed/done entries stay available for retry or log cleanup.
    _set_job(job_id, "cancelled", finished=True)
+    payload = _job_payload(row)
+    operation_logs.record_job_event(int(row.get("profile_id") or 0), row.get("action"), "cancelled", payload, error="Cancelled by user", job_id=job_id, user_id=int(row.get("user_id") or 0))
    _emit("job_update", {"id": job_id, "profile_id": row.get("profile_id"), "status": "cancelled"})
    return True

@@ -597,6 +612,7 @@ def force_job(job_id: str) -> bool:
    payload['priority_job'] = True
    with connect() as conn:
        conn.execute("UPDATE jobs SET payload_json=?, updated_at=? WHERE id=?", (json.dumps(payload), utcnow(), job_id))
+    operation_logs.record_job_event(int(row.get('profile_id') or 0), row.get('action'), 'forced', payload, job_id=job_id, user_id=int(row.get('user_id') or 0))
    _emit('job_update', {'id': job_id, 'profile_id': row.get('profile_id'), 'status': 'pending', 'forced': True})
    _submit_job(job_id, row.get('action'))
    return True
@@ -607,6 +623,8 @@ def retry_job(job_id: str) -> bool:
        return False
    with connect() as conn:
        conn.execute("UPDATE jobs SET status='pending', error='', finished_at=NULL, state_json=NULL, progress_current=0, heartbeat_at=NULL, updated_at=? WHERE id=?", (utcnow(), job_id))
+    payload = _job_payload(row)
+    operation_logs.record_job_event(int(row.get("profile_id") or 0), row.get("action"), "retry", payload, job_id=job_id, user_id=int(row.get("user_id") or 0))
    _emit("job_update", {"id": job_id, "profile_id": row.get("profile_id"), "status": "pending"})
    _submit_job(job_id, row.get("action"))
    return True