sync errors

This commit is contained in:
2026-05-10 17:49:42 +02:00
parent 4fef7fbf00
commit e754b14085
3 changed files with 23 additions and 2 deletions
+7
View File
@@ -103,6 +103,13 @@ def is_zip_processed(remote_path: str) -> bool:
return row is not None return row is not None
def get_all_processed_paths() -> set[str]:
"""Return all processed remote paths as a set for fast bulk membership checks."""
with get_db() as conn:
rows = conn.execute("SELECT remote_path FROM processed_zips").fetchall()
return {row["remote_path"] for row in rows}
def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None: def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None:
with get_db() as conn: with get_db() as conn:
conn.execute( conn.execute(
+8 -2
View File
@@ -2,6 +2,7 @@ import io
import logging import logging
import shlex import shlex
import socket import socket
import time
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
@@ -81,12 +82,17 @@ def test_connection(cfg: SFTPConfig) -> tuple[bool, str]:
def list_new_zips(cfg: SFTPConfig, max_results: int | None = None) -> list[RemoteZip]: def list_new_zips(cfg: SFTPConfig, max_results: int | None = None) -> list[RemoteZip]:
transport = _make_transport(cfg) transport = _make_transport(cfg)
try: try:
t0 = time.monotonic()
all_zips = _find_remote_zips(transport, cfg.remote_path) all_zips = _find_remote_zips(transport, cfg.remote_path)
log.info("Remote scan done: %d zip(s) found", len(all_zips)) log.info("Remote find done in %.1fs — %d zip(s) found", time.monotonic() - t0, len(all_zips))
t1 = time.monotonic()
processed = db.get_all_processed_paths()
log.info("DB lookup done in %.1fs — %d path(s) already processed", time.monotonic() - t1, len(processed))
new_zips: list[RemoteZip] = [] new_zips: list[RemoteZip] = []
for zip_info in all_zips: for zip_info in all_zips:
if not db.is_zip_processed(zip_info.remote_path): if zip_info.remote_path not in processed:
new_zips.append(zip_info) new_zips.append(zip_info)
if max_results and len(new_zips) >= max_results: if max_results and len(new_zips) >= max_results:
log.info("Reached limit of %d", max_results) log.info("Reached limit of %d", max_results)
+8
View File
@@ -1,5 +1,6 @@
import logging import logging
import threading import threading
import time
from pathlib import Path from pathlib import Path
import config import config
@@ -72,11 +73,18 @@ def run_sync(limit: int | None = None) -> None:
zip_error = None zip_error = None
local_zip = None local_zip = None
try: try:
t0 = time.monotonic()
local_zip = sftp_module.download(cfg.sftp, remote_zip, str(work_dir / "downloads")) local_zip = sftp_module.download(cfg.sftp, remote_zip, str(work_dir / "downloads"))
log.info("Download done in %.1fs (%.1f MB)", time.monotonic() - t0, local_zip.stat().st_size / 1_048_576)
t1 = time.monotonic()
books = extractor.extract(local_zip, work_dir / "extracted") books = extractor.extract(local_zip, work_dir / "extracted")
log.info("Extract done in %.1fs — %d book(s)", time.monotonic() - t1, len(books))
for book in books: for book in books:
t2 = time.monotonic()
status = client.upload(book, zip_source=remote_zip.remote_path) status = client.upload(book, zip_source=remote_zip.remote_path)
log.info("Upload '%s'%s (%.1fs)", book.name, status, time.monotonic() - t2)
if status == "uploaded": if status == "uploaded":
counters["books_uploaded"] += 1 counters["books_uploaded"] += 1
elif status == "skipped_duplicate": elif status == "skipped_duplicate":