sync errors

This commit is contained in:
2026-05-10 18:02:06 +02:00
parent e754b14085
commit 96e555de0a
5 changed files with 154 additions and 23 deletions
+42 -1
View File
@@ -68,6 +68,12 @@ def init_db() -> None:
status TEXT DEFAULT 'running',
error_msg TEXT
);
CREATE TABLE IF NOT EXISTS remote_zip_cache (
remote_path TEXT PRIMARY KEY,
file_size INTEGER NOT NULL,
cached_at TEXT NOT NULL
);
""")
@@ -110,6 +116,39 @@ def get_all_processed_paths() -> set[str]:
return {row["remote_path"] for row in rows}
# --- Remote zip cache ---
def get_remote_zip_cache() -> list[tuple[str, int]]:
"""Return cached (remote_path, file_size) tuples."""
with get_db() as conn:
rows = conn.execute("SELECT remote_path, file_size FROM remote_zip_cache").fetchall()
return [(row["remote_path"], row["file_size"]) for row in rows]
def upsert_remote_zip_cache(zips: list[tuple[str, int]]) -> None:
"""Bulk-insert or replace cache entries. zips is a list of (remote_path, file_size)."""
now = _now()
with get_db() as conn:
conn.executemany(
"INSERT INTO remote_zip_cache (remote_path, file_size, cached_at) VALUES (?,?,?)"
" ON CONFLICT(remote_path) DO UPDATE SET file_size=excluded.file_size, cached_at=excluded.cached_at",
[(path, size, now) for path, size in zips],
)
def get_remote_cache_info() -> dict:
with get_db() as conn:
row = conn.execute(
"SELECT COUNT(*) as count, MAX(cached_at) as last_scan FROM remote_zip_cache"
).fetchone()
return {"count": row["count"], "last_scan": row["last_scan"]}
def clear_remote_zip_cache() -> int:
with get_db() as conn:
return conn.execute("DELETE FROM remote_zip_cache").rowcount
def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None:
with get_db() as conn:
conn.execute(
@@ -212,11 +251,13 @@ def get_stats() -> dict:
def clear_sync_data() -> dict:
"""Delete all processed_zips, uploaded_books, and sync_runs rows. Settings are kept."""
"""Delete all processed_zips, uploaded_books, and sync_runs rows. Settings are kept.
Also resets the remote scan timestamp so the next sync does a full rescan."""
with get_db() as conn:
zips = conn.execute("DELETE FROM processed_zips").rowcount
books = conn.execute("DELETE FROM uploaded_books").rowcount
runs = conn.execute("DELETE FROM sync_runs").rowcount
conn.execute("DELETE FROM settings WHERE key = 'remote_cache_last_scan'")
return {"zips": zips, "books": books, "runs": runs}
+11
View File
@@ -63,6 +63,7 @@ async def dashboard(request: Request):
zips = [dict(z) for z in db.get_recent_zips(20)]
interval = int(db.get_setting("scheduler_interval_minutes", "0") or "0")
batch_size = int(db.get_setting("sync_batch_size", "0") or "0")
cache_info = db.get_remote_cache_info()
return templates.TemplateResponse(request, "index.html", {
"stats": stats,
"runs": runs,
@@ -71,6 +72,7 @@ async def dashboard(request: Request):
"next_run": next_run_time(),
"interval": interval,
"batch_size": batch_size,
"cache_info": cache_info,
})
@@ -158,6 +160,15 @@ async def trigger_test_sync(background_tasks: BackgroundTasks):
return RedirectResponse("/?test_started=1", status_code=303)
@app.post("/sync/rescan")
async def trigger_rescan(background_tasks: BackgroundTasks):
if sync.is_running():
return RedirectResponse("/?already_running=1", status_code=303)
cfg = config.load()
background_tasks.add_task(sftp_module.refresh_remote_zip_cache, cfg.sftp)
return RedirectResponse("/?rescan_started=1", status_code=303)
# --- Connection tests ---
@app.get("/api/test/ssh")
+84 -20
View File
@@ -4,6 +4,7 @@ import shlex
import socket
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
import paramiko
@@ -80,29 +81,67 @@ def test_connection(cfg: SFTPConfig) -> tuple[bool, str]:
def list_new_zips(cfg: SFTPConfig, max_results: int | None = None) -> list[RemoteZip]:
last_scan = db.get_setting("remote_cache_last_scan")
transport = _make_transport(cfg)
try:
t0 = time.monotonic()
all_zips = _find_remote_zips(transport, cfg.remote_path)
log.info("Remote find done in %.1fs — %d zip(s) found", time.monotonic() - t0, len(all_zips))
t1 = time.monotonic()
processed = db.get_all_processed_paths()
log.info("DB lookup done in %.1fs — %d path(s) already processed", time.monotonic() - t1, len(processed))
new_zips: list[RemoteZip] = []
for zip_info in all_zips:
if zip_info.remote_path not in processed:
new_zips.append(zip_info)
if max_results and len(new_zips) >= max_results:
log.info("Reached limit of %d", max_results)
break
log.info("%d new zip(s) to process", len(new_zips))
return new_zips
if last_scan:
# Fast incremental: prune directories not modified since last scan.
# Adding a file/dir to a directory updates that directory's mtime,
# so we safely skip entire subtrees that haven't changed.
cutoff = _scan_cutoff(last_scan)
log.info("Incremental scan — looking for directories modified since %s ...", cutoff)
new_remote = _find_remote_zips(transport, cfg.remote_path, newer_than=cutoff)
log.info("Incremental scan done in %.1fs — %d new zip(s) on remote", time.monotonic() - t0, len(new_remote))
else:
log.info("First run — full remote scan (may take several minutes for large trees) ...")
new_remote = _find_remote_zips(transport, cfg.remote_path)
log.info("Full scan done in %.1fs — %d zip(s) found", time.monotonic() - t0, len(new_remote))
finally:
transport.close()
# Record scan time, then update cache with any new entries found
now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
db.set_setting("remote_cache_last_scan", now_str)
if new_remote:
db.upsert_remote_zip_cache([(z.remote_path, z.file_size) for z in new_remote])
log.info("Cache updated with %d new entry(ies)", len(new_remote))
# Filter full cache against already-processed paths
t1 = time.monotonic()
all_cached = db.get_remote_zip_cache()
processed = db.get_all_processed_paths()
log.info("DB lookup done in %.1fs — cache: %d, processed: %d", time.monotonic() - t1, len(all_cached), len(processed))
new_zips: list[RemoteZip] = []
for path, size in all_cached:
if path not in processed:
new_zips.append(RemoteZip(remote_path=path, file_size=size))
if max_results and len(new_zips) >= max_results:
break
log.info("%d zip(s) to process", len(new_zips))
return new_zips
def refresh_remote_zip_cache(cfg: SFTPConfig) -> int:
"""Force a full remote scan, replacing the entire cache. Used by the manual rescan button."""
log.info("Forced full remote cache refresh ...")
t0 = time.monotonic()
transport = _make_transport(cfg)
try:
all_zips = _find_remote_zips(transport, cfg.remote_path)
finally:
transport.close()
log.info("Full scan done in %.1fs — %d zip(s)", time.monotonic() - t0, len(all_zips))
db.clear_remote_zip_cache()
db.upsert_remote_zip_cache([(z.remote_path, z.file_size) for z in all_zips])
now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
db.set_setting("remote_cache_last_scan", now_str)
log.info("Cache refreshed: %d zip(s) stored", len(all_zips))
return len(all_zips)
def download(cfg: SFTPConfig, remote_zip: RemoteZip, dest_dir: str) -> Path:
dest = Path(dest_dir)
@@ -119,14 +158,27 @@ def download(cfg: SFTPConfig, remote_zip: RemoteZip, dest_dir: str) -> Path:
return local_path
def _find_remote_zips(transport: paramiko.Transport, remote_path: str) -> list[RemoteZip]:
"""Single SSH exec: find all .zip files server-side. Vastly faster than per-directory SFTP calls."""
def _find_remote_zips(transport: paramiko.Transport, remote_path: str, newer_than: str | None = None) -> list[RemoteZip]:
"""Run find on the remote host, streaming results with progress logging every 30 s."""
channel = transport.open_session()
if newer_than:
# Prune entire directory subtrees whose mtime predates the cutoff.
# A directory's mtime is updated when entries are added inside it,
# so old-mtime dirs are guaranteed to contain no new files.
cmd = (
f"find {shlex.quote(remote_path)}"
f" \\( -type d ! -newermt {shlex.quote(newer_than)} -prune \\)"
f" -o \\( -type f -iname '*.zip' -printf '%s\\t%p\\n' \\)"
)
else:
cmd = f"find {shlex.quote(remote_path)} -type f -iname '*.zip' -printf '%s\\t%p\\n'"
log.info("Running remote find under %s ...", remote_path)
channel.exec_command(cmd)
zips: list[RemoteZip] = []
last_log = time.monotonic()
for line in channel.makefile("r", -1):
line = line.rstrip("\n")
if "\t" not in line:
@@ -137,9 +189,21 @@ def _find_remote_zips(transport: paramiko.Transport, remote_path: str) -> list[R
except ValueError:
continue
now = time.monotonic()
if now - last_log >= 30:
log.info("Find in progress: %d zip(s) found so far ...", len(zips))
last_log = now
stderr_out = channel.makefile_stderr("r", -1).read().strip()
if stderr_out:
log.warning("find stderr: %s", stderr_out[:500])
channel.recv_exit_status()
channel.close()
return zips
def _scan_cutoff(last_scan: str) -> str:
"""Subtract 5-minute safety buffer from last-scan timestamp to handle clock skew."""
dt = datetime.strptime(last_scan, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
dt -= timedelta(minutes=5)
return dt.strftime("%Y-%m-%d %H:%M:%S")
+1
View File
@@ -156,6 +156,7 @@ tr:hover td { background: rgba(255,255,255,0.02); }
.btn-danger { background: #dc2626; color: #fff; border: 1px solid #dc2626; }
.btn-danger:hover { background: #b91c1c; border-color: #b91c1c; }
.danger-zone { border-color: rgba(220,38,38,0.4); }
.cache-status { margin-bottom: 1rem; }
/* Forms */
.form-section {
+15 -1
View File
@@ -11,11 +11,14 @@
{% if next_run %} — next: {{ next_run }}{% endif %}
</span>
{% endif %}
<form method="post" action="/sync/rescan" style="display:inline">
<button class="btn btn-secondary" {% if sync_running %}disabled{% endif %}>Rescan remote</button>
</form>
<form method="post" action="/sync" style="display:inline">
{% if sync_running %}
<button class="btn btn-disabled" disabled>Sync running…</button>
{% else %}
<button class="btn btn-primary">Run Sync Now</button>
<button class="btn btn-primary" {% if cache_info.count == 0 %}title="Run 'Rescan remote' first to populate the zip list"{% endif %}>Run Sync Now</button>
{% endif %}
</form>
{% if not sync_running %}
@@ -35,10 +38,21 @@
{% if request.query_params.get("test_started") %}
<div class="alert alert-success">Test sync started — processing 1 archive.</div>
{% endif %}
{% if request.query_params.get("rescan_started") %}
<div class="alert alert-success">Remote rescan started — this will take a few minutes. Check logs for progress.</div>
{% endif %}
{% if request.query_params.get("already_running") %}
<div class="alert alert-warning">A sync is already running.</div>
{% endif %}
<div class="cache-status">
{% if cache_info.count > 0 %}
<span class="muted small">Remote cache: <strong>{{ cache_info.count }}</strong> zip(s) &mdash; last scanned {{ cache_info.last_scan[:19] if cache_info.last_scan else "never" }} UTC</span>
{% else %}
<span class="muted small" style="color:var(--warning)">Remote cache empty &mdash; first sync will run a full scan (may take several minutes).</span>
{% endif %}
</div>
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value">{{ stats.total_zips }}</div>