sync errors

This commit is contained in:
2026-05-10 18:02:06 +02:00
parent e754b14085
commit 96e555de0a
5 changed files with 154 additions and 23 deletions
+42 -1
View File
@@ -68,6 +68,12 @@ def init_db() -> None:
status TEXT DEFAULT 'running', status TEXT DEFAULT 'running',
error_msg TEXT error_msg TEXT
); );
CREATE TABLE IF NOT EXISTS remote_zip_cache (
remote_path TEXT PRIMARY KEY,
file_size INTEGER NOT NULL,
cached_at TEXT NOT NULL
);
""") """)
@@ -110,6 +116,39 @@ def get_all_processed_paths() -> set[str]:
return {row["remote_path"] for row in rows} return {row["remote_path"] for row in rows}
# --- Remote zip cache ---
def get_remote_zip_cache() -> list[tuple[str, int]]:
    """Load every row of remote_zip_cache as (remote_path, file_size) pairs."""
    query = "SELECT remote_path, file_size FROM remote_zip_cache"
    with get_db() as conn:
        cached_rows = conn.execute(query).fetchall()
    entries: list[tuple[str, int]] = []
    for record in cached_rows:
        entries.append((record["remote_path"], record["file_size"]))
    return entries
def upsert_remote_zip_cache(zips: list[tuple[str, int]]) -> None:
    """Insert or update cache rows in one batch.

    Args:
        zips: (remote_path, file_size) pairs. A path that already exists
            has its file_size and cached_at overwritten.
    """
    # One timestamp for the whole batch so all rows share the same cached_at.
    stamp = _now()
    statement = (
        "INSERT INTO remote_zip_cache (remote_path, file_size, cached_at) VALUES (?,?,?)"
        " ON CONFLICT(remote_path) DO UPDATE SET file_size=excluded.file_size, cached_at=excluded.cached_at"
    )
    with get_db() as conn:
        params = [(remote_path, file_size, stamp) for remote_path, file_size in zips]
        conn.executemany(statement, params)
def get_remote_cache_info() -> dict:
    """Summarize the remote zip cache.

    Returns:
        A dict with "count" (number of cached rows) and "last_scan"
        (the largest cached_at value; None when the table is empty).
    """
    query = "SELECT COUNT(*) as count, MAX(cached_at) as last_scan FROM remote_zip_cache"
    with get_db() as conn:
        summary = conn.execute(query).fetchone()
    return {field: summary[field] for field in ("count", "last_scan")}
def clear_remote_zip_cache() -> int:
    """Delete every row from remote_zip_cache and return the deleted row count."""
    with get_db() as conn:
        cursor = conn.execute("DELETE FROM remote_zip_cache")
        return cursor.rowcount
def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None: def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None:
with get_db() as conn: with get_db() as conn:
conn.execute( conn.execute(
@@ -212,11 +251,13 @@ def get_stats() -> dict:
def clear_sync_data() -> dict:
    """Wipe sync history: all processed_zips, uploaded_books and sync_runs rows.

    Settings are kept, except the 'remote_cache_last_scan' entry, which is
    removed so the next sync performs a full remote rescan.

    Returns:
        Deleted row counts keyed by "zips", "books" and "runs".
    """
    with get_db() as conn:
        removed_zips = conn.execute("DELETE FROM processed_zips").rowcount
        removed_books = conn.execute("DELETE FROM uploaded_books").rowcount
        removed_runs = conn.execute("DELETE FROM sync_runs").rowcount
        # Forget the last scan time; the cached zip list itself is left alone.
        conn.execute("DELETE FROM settings WHERE key = 'remote_cache_last_scan'")
    return dict(zips=removed_zips, books=removed_books, runs=removed_runs)
+11
View File
@@ -63,6 +63,7 @@ async def dashboard(request: Request):
zips = [dict(z) for z in db.get_recent_zips(20)] zips = [dict(z) for z in db.get_recent_zips(20)]
interval = int(db.get_setting("scheduler_interval_minutes", "0") or "0") interval = int(db.get_setting("scheduler_interval_minutes", "0") or "0")
batch_size = int(db.get_setting("sync_batch_size", "0") or "0") batch_size = int(db.get_setting("sync_batch_size", "0") or "0")
cache_info = db.get_remote_cache_info()
return templates.TemplateResponse(request, "index.html", { return templates.TemplateResponse(request, "index.html", {
"stats": stats, "stats": stats,
"runs": runs, "runs": runs,
@@ -71,6 +72,7 @@ async def dashboard(request: Request):
"next_run": next_run_time(), "next_run": next_run_time(),
"interval": interval, "interval": interval,
"batch_size": batch_size, "batch_size": batch_size,
"cache_info": cache_info,
}) })
@@ -158,6 +160,15 @@ async def trigger_test_sync(background_tasks: BackgroundTasks):
return RedirectResponse("/?test_started=1", status_code=303) return RedirectResponse("/?test_started=1", status_code=303)
@app.post("/sync/rescan")
async def trigger_rescan(background_tasks: BackgroundTasks):
    """Queue a full remote cache refresh unless a sync is in progress.

    Redirects to the dashboard with `rescan_started=1` when the refresh was
    queued, or `already_running=1` when a sync is currently running.
    """
    if not sync.is_running():
        loaded = config.load()
        background_tasks.add_task(sftp_module.refresh_remote_zip_cache, loaded.sftp)
        return RedirectResponse("/?rescan_started=1", status_code=303)
    return RedirectResponse("/?already_running=1", status_code=303)
# --- Connection tests --- # --- Connection tests ---
@app.get("/api/test/ssh") @app.get("/api/test/ssh")
+85 -21
View File
@@ -4,6 +4,7 @@ import shlex
import socket import socket
import time import time
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path from pathlib import Path
import paramiko import paramiko
@@ -80,29 +81,67 @@ def test_connection(cfg: SFTPConfig) -> tuple[bool, str]:
def list_new_zips(cfg: SFTPConfig, max_results: int | None = None) -> list[RemoteZip]:
    """Return remote zips that are cached but not yet processed.

    Scans the remote host (incrementally when a previous scan time is
    stored, fully otherwise), merges the findings into the remote zip
    cache, then filters the whole cache against the processed paths.

    Args:
        cfg: SFTP connection settings; `cfg.remote_path` is the scan root.
        max_results: Stop after this many unprocessed zips. None or 0
            means no limit.

    Returns:
        Unprocessed zips as RemoteZip objects, at most `max_results` long.
    """
    last_scan = db.get_setting("remote_cache_last_scan")
    # Stamp the scan with its START time. A scan can run for minutes; using
    # the end time would let entries created while the scan was running
    # fall before the next incremental cutoff and never be picked up.
    scan_started = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    transport = _make_transport(cfg)
    try:
        t0 = time.monotonic()
        if last_scan:
            # Incremental: only look under directories modified since the
            # (buffered) previous scan time.
            # NOTE(review): new files in a nested subdirectory do not update
            # ancestor mtimes — confirm the pruning cannot hide them.
            cutoff = _scan_cutoff(last_scan)
            log.info("Incremental scan — looking for directories modified since %s ...", cutoff)
            new_remote = _find_remote_zips(transport, cfg.remote_path, newer_than=cutoff)
            log.info("Incremental scan done in %.1fs — %d new zip(s) on remote", time.monotonic() - t0, len(new_remote))
        else:
            log.info("First run — full remote scan (may take several minutes for large trees) ...")
            new_remote = _find_remote_zips(transport, cfg.remote_path)
            log.info("Full scan done in %.1fs — %d zip(s) found", time.monotonic() - t0, len(new_remote))
    finally:
        transport.close()

    # Record scan time, then update cache with any new entries found
    db.set_setting("remote_cache_last_scan", scan_started)
    if new_remote:
        db.upsert_remote_zip_cache([(z.remote_path, z.file_size) for z in new_remote])
        log.info("Cache updated with %d new entry(ies)", len(new_remote))

    # Filter full cache against already-processed paths
    t1 = time.monotonic()
    all_cached = db.get_remote_zip_cache()
    processed = db.get_all_processed_paths()
    log.info("DB lookup done in %.1fs — cache: %d, processed: %d", time.monotonic() - t1, len(all_cached), len(processed))

    new_zips: list[RemoteZip] = []
    for path, size in all_cached:
        if path not in processed:
            new_zips.append(RemoteZip(remote_path=path, file_size=size))
            if max_results and len(new_zips) >= max_results:
                break

    log.info("%d zip(s) to process", len(new_zips))
    return new_zips
def refresh_remote_zip_cache(cfg: SFTPConfig) -> int:
    """Force a full remote scan, replacing the entire cache.

    Used by the manual rescan button.

    Args:
        cfg: SFTP connection settings; `cfg.remote_path` is the scan root.

    Returns:
        Number of zip files found and stored in the cache.
    """
    log.info("Forced full remote cache refresh ...")
    # Stamp the scan with its START time. A full scan can run for minutes;
    # using the end time would let entries created while the scan was running
    # fall before the next incremental cutoff and never be picked up.
    scan_started = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    t0 = time.monotonic()
    transport = _make_transport(cfg)
    try:
        all_zips = _find_remote_zips(transport, cfg.remote_path)
    finally:
        transport.close()
    log.info("Full scan done in %.1fs — %d zip(s)", time.monotonic() - t0, len(all_zips))

    # Replace the cache only after the scan has completed.
    db.clear_remote_zip_cache()
    db.upsert_remote_zip_cache([(z.remote_path, z.file_size) for z in all_zips])
    db.set_setting("remote_cache_last_scan", scan_started)
    log.info("Cache refreshed: %d zip(s) stored", len(all_zips))
    return len(all_zips)
def download(cfg: SFTPConfig, remote_zip: RemoteZip, dest_dir: str) -> Path: def download(cfg: SFTPConfig, remote_zip: RemoteZip, dest_dir: str) -> Path:
dest = Path(dest_dir) dest = Path(dest_dir)
@@ -119,14 +158,27 @@ def download(cfg: SFTPConfig, remote_zip: RemoteZip, dest_dir: str) -> Path:
return local_path return local_path
def _find_remote_zips(transport: paramiko.Transport, remote_path: str) -> list[RemoteZip]: def _find_remote_zips(transport: paramiko.Transport, remote_path: str, newer_than: str | None = None) -> list[RemoteZip]:
"""Single SSH exec: find all .zip files server-side. Vastly faster than per-directory SFTP calls.""" """Run find on the remote host, streaming results with progress logging every 30 s."""
channel = transport.open_session() channel = transport.open_session()
cmd = f"find {shlex.quote(remote_path)} -type f -iname '*.zip' -printf '%s\\t%p\\n'"
log.info("Running remote find under %s ...", remote_path) if newer_than:
# Prune entire directory subtrees whose mtime predates the cutoff.
# A directory's mtime is updated only when its DIRECT entries change.
# NOTE: a file added in a deeper subdirectory does not touch ancestor mtimes,
# so pruning an old-mtime ancestor can skip it; a full rescan recovers these.
cmd = (
f"find {shlex.quote(remote_path)}"
f" \\( -type d ! -newermt {shlex.quote(newer_than)} -prune \\)"
f" -o \\( -type f -iname '*.zip' -printf '%s\\t%p\\n' \\)"
)
else:
cmd = f"find {shlex.quote(remote_path)} -type f -iname '*.zip' -printf '%s\\t%p\\n'"
channel.exec_command(cmd) channel.exec_command(cmd)
zips: list[RemoteZip] = [] zips: list[RemoteZip] = []
last_log = time.monotonic()
for line in channel.makefile("r", -1): for line in channel.makefile("r", -1):
line = line.rstrip("\n") line = line.rstrip("\n")
if "\t" not in line: if "\t" not in line:
@@ -137,9 +189,21 @@ def _find_remote_zips(transport: paramiko.Transport, remote_path: str) -> list[R
except ValueError: except ValueError:
continue continue
now = time.monotonic()
if now - last_log >= 30:
log.info("Find in progress: %d zip(s) found so far ...", len(zips))
last_log = now
stderr_out = channel.makefile_stderr("r", -1).read().strip() stderr_out = channel.makefile_stderr("r", -1).read().strip()
if stderr_out: if stderr_out:
log.warning("find stderr: %s", stderr_out[:500]) log.warning("find stderr: %s", stderr_out[:500])
channel.recv_exit_status() channel.recv_exit_status()
channel.close() channel.close()
return zips return zips
def _scan_cutoff(last_scan: str) -> str:
"""Subtract 5-minute safety buffer from last-scan timestamp to handle clock skew."""
dt = datetime.strptime(last_scan, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
dt -= timedelta(minutes=5)
return dt.strftime("%Y-%m-%d %H:%M:%S")
+1
View File
@@ -156,6 +156,7 @@ tr:hover td { background: rgba(255,255,255,0.02); }
.btn-danger { background: #dc2626; color: #fff; border: 1px solid #dc2626; } .btn-danger { background: #dc2626; color: #fff; border: 1px solid #dc2626; }
.btn-danger:hover { background: #b91c1c; border-color: #b91c1c; } .btn-danger:hover { background: #b91c1c; border-color: #b91c1c; }
.danger-zone { border-color: rgba(220,38,38,0.4); } .danger-zone { border-color: rgba(220,38,38,0.4); }
.cache-status { margin-bottom: 1rem; }
/* Forms */ /* Forms */
.form-section { .form-section {
+15 -1
View File
@@ -11,11 +11,14 @@
{% if next_run %} — next: {{ next_run }}{% endif %} {% if next_run %} — next: {{ next_run }}{% endif %}
</span> </span>
{% endif %} {% endif %}
<form method="post" action="/sync/rescan" style="display:inline">
<button class="btn btn-secondary" {% if sync_running %}disabled{% endif %}>Rescan remote</button>
</form>
<form method="post" action="/sync" style="display:inline"> <form method="post" action="/sync" style="display:inline">
{% if sync_running %} {% if sync_running %}
<button class="btn btn-disabled" disabled>Sync running…</button> <button class="btn btn-disabled" disabled>Sync running…</button>
{% else %} {% else %}
<button class="btn btn-primary">Run Sync Now</button> <button class="btn btn-primary" {% if cache_info.count == 0 %}title="Run 'Rescan remote' first to populate the zip list"{% endif %}>Run Sync Now</button>
{% endif %} {% endif %}
</form> </form>
{% if not sync_running %} {% if not sync_running %}
@@ -35,10 +38,21 @@
{% if request.query_params.get("test_started") %} {% if request.query_params.get("test_started") %}
<div class="alert alert-success">Test sync started — processing 1 archive.</div> <div class="alert alert-success">Test sync started — processing 1 archive.</div>
{% endif %} {% endif %}
{% if request.query_params.get("rescan_started") %}
<div class="alert alert-success">Remote rescan started — this will take a few minutes. Check logs for progress.</div>
{% endif %}
{% if request.query_params.get("already_running") %} {% if request.query_params.get("already_running") %}
<div class="alert alert-warning">A sync is already running.</div> <div class="alert alert-warning">A sync is already running.</div>
{% endif %} {% endif %}
<div class="cache-status">
{% if cache_info.count > 0 %}
<span class="muted small">Remote cache: <strong>{{ cache_info.count }}</strong> zip(s) &mdash; last scanned {{ cache_info.last_scan[:19] if cache_info.last_scan else "never" }} UTC</span>
{% else %}
<span class="muted small" style="color:var(--warning)">Remote cache empty &mdash; first sync will run a full scan (may take several minutes).</span>
{% endif %}
</div>
<div class="stats-grid"> <div class="stats-grid">
<div class="stat-card"> <div class="stat-card">
<div class="stat-value">{{ stats.total_zips }}</div> <div class="stat-value">{{ stats.total_zips }}</div>