Pre-load existing Calibre-Web titles for duplicate detection

sync.py: before the batch loop, run_sync() fetches the existing books with
uploader.fetch_all_books() and passes them to
CalibreClient.preload_existing_titles(). Any exception raised while doing so
is logged as a warning; the index then stays unset and the per-book OPDS
search is used instead.

uploader.py: CalibreClient gains _existing_title_sets (None until preloaded)
and preload_existing_titles(), which stores one frozenset of normalised title
words for every book that has a title. _exists_in_calibre() consults that
index first and returns its verdict without issuing an OPDS request; the OPDS
search runs only when no index was loaded. Both paths apply the same rule:
3 or more shared words, or at least 60% of the filename's keywords.

NOTE(review): none of these hunks adds newly uploaded titles to the index;
confirm that duplicates within a single sync run are still caught elsewhere.
NOTE(review): line breaks and indentation were restored from a
whitespace-collapsed copy of this patch (hunk line counts were used to place
blank lines); if context lines fail to match, apply with
`git apply --ignore-whitespace`.

diff --git a/sync.py b/sync.py
index cf68200..84564f3 100644
--- a/sync.py
+++ b/sync.py
@@ -64,6 +64,14 @@ def run_sync(limit: int | None = None) -> None:
     total_batches = -(-len(new_zips) // batch_size)  # ceiling division
     client = CalibreClient(cfg.calibre)
 
+    # Pre-load existing book titles so duplicate detection doesn't need per-book OPDS searches
+    try:
+        from uploader import fetch_all_books
+        existing = fetch_all_books(cfg.calibre)
+        client.preload_existing_titles(existing)
+    except Exception as exc:
+        log.warning("Could not pre-load existing books (%s) — will fall back to per-book OPDS search", exc)
+
     for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1):
         chunk = new_zips[i : i + batch_size]
         log.info("Batch %d/%d — processing %d zip(s)", batch_num, total_batches, len(chunk))
diff --git a/uploader.py b/uploader.py
index 6efdaf5..d28854f 100644
--- a/uploader.py
+++ b/uploader.py
@@ -36,6 +36,17 @@ class CalibreClient:
         self._authenticated = False
         self._upload_csrf: str | None = None
         self._consecutive_failures = 0
+        # Pre-loaded title word-sets for fast duplicate detection (set by preload_existing_titles)
+        self._existing_title_sets: list[frozenset[str]] | None = None
+
+    def preload_existing_titles(self, books: list[dict]) -> None:
+        """Build an in-memory index of normalised title keywords from a pre-fetched book list."""
+        self._existing_title_sets = [
+            frozenset(_normalize_words(b.get("title", "")))
+            for b in books
+            if b.get("title")
+        ]
+        log.info("Pre-loaded %d existing book titles for duplicate detection", len(self._existing_title_sets))
 
     def _ensure_auth(self) -> None:
         if self._authenticated:
@@ -58,11 +69,22 @@ class CalibreClient:
         log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
 
     def _exists_in_calibre(self, filename: str) -> bool:
-        """Search Calibre-Web OPDS for a title matching this filename. Returns True if likely duplicate."""
+        """Check whether a book already exists in Calibre-Web. Returns True if likely duplicate."""
         keywords = _keywords_from_filename(filename)
         if len(keywords) < 2:
             return False
 
+        our_words = set(keywords)
+        # Fast path: check pre-loaded title index (available when sync pre-fetches all books)
+        if self._existing_title_sets is not None:
+            for their_words in self._existing_title_sets:
+                overlap = len(our_words & their_words)
+                if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
+                    log.info("Duplicate (preloaded index): '%s'", filename)
+                    return True
+            return False
+
+        # Slow path fallback: OPDS search (used when no index is available)
         query = " ".join(keywords[:6])
         try:
             resp = self._session.get(
@@ -76,13 +98,11 @@ class CalibreClient:
             if not calibre_titles:
                 return False
 
-            our_words = set(keywords)
             for title in calibre_titles:
                 their_words = set(_normalize_words(title))
                 overlap = len(our_words & their_words)
-                # Match if 3+ words in common, or 60%+ of our keywords match
                 if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
-                    log.info("Duplicate found in Calibre-Web: '%s' ~ '%s'", filename, title)
+                    log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title)
                     return True
         except Exception as e:
             log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)