diff --git a/db.py b/db.py index 6ba7cd1..02bc1f9 100644 --- a/db.py +++ b/db.py @@ -103,6 +103,13 @@ def is_zip_processed(remote_path: str) -> bool: return row is not None +def get_all_processed_paths() -> set[str]: + """Return all processed remote paths as a set for fast bulk membership checks.""" + with get_db() as conn: + rows = conn.execute("SELECT remote_path FROM processed_zips").fetchall() + return {row["remote_path"] for row in rows} + + def mark_zip_processed(remote_path: str, file_size: int, status: str, error_msg: str | None = None) -> None: with get_db() as conn: conn.execute( diff --git a/sftp.py b/sftp.py index 0033307..7981df1 100644 --- a/sftp.py +++ b/sftp.py @@ -2,6 +2,7 @@ import io import logging import shlex import socket +import time from dataclasses import dataclass from pathlib import Path @@ -81,12 +82,17 @@ def test_connection(cfg: SFTPConfig) -> tuple[bool, str]: def list_new_zips(cfg: SFTPConfig, max_results: int | None = None) -> list[RemoteZip]: transport = _make_transport(cfg) try: + t0 = time.monotonic() all_zips = _find_remote_zips(transport, cfg.remote_path) - log.info("Remote scan done: %d zip(s) found", len(all_zips)) + log.info("Remote find done in %.1fs — %d zip(s) found", time.monotonic() - t0, len(all_zips)) + + t1 = time.monotonic() + processed = db.get_all_processed_paths() + log.info("DB lookup done in %.1fs — %d path(s) already processed", time.monotonic() - t1, len(processed)) new_zips: list[RemoteZip] = [] for zip_info in all_zips: - if not db.is_zip_processed(zip_info.remote_path): + if zip_info.remote_path not in processed: new_zips.append(zip_info) if max_results and len(new_zips) >= max_results: log.info("Reached limit of %d", max_results) diff --git a/sync.py b/sync.py index 0f0c137..e7b39d6 100644 --- a/sync.py +++ b/sync.py @@ -1,5 +1,6 @@ import logging import threading +import time from pathlib import Path import config @@ -72,11 +73,18 @@ def run_sync(limit: int | None = None) -> None: zip_error = None local_zip = None try: + t0 = time.monotonic() local_zip = sftp_module.download(cfg.sftp, remote_zip, str(work_dir / "downloads")) + log.info("Download done in %.1fs (%.1f MB)", time.monotonic() - t0, local_zip.stat().st_size / 1_048_576) + + t1 = time.monotonic() books = extractor.extract(local_zip, work_dir / "extracted") + log.info("Extract done in %.1fs — %d book(s)", time.monotonic() - t1, len(books)) for book in books: + t2 = time.monotonic() status = client.upload(book, zip_source=remote_zip.remote_path) + log.info("Upload '%s' → %s (%.1fs)", book.name, status, time.monotonic() - t2) if status == "uploaded": counters["books_uploaded"] += 1 elif status == "skipped_duplicate":