Check for duplicate books
This commit is contained in:
@@ -64,6 +64,14 @@ def run_sync(limit: int | None = None) -> None:
|
|||||||
total_batches = -(-len(new_zips) // batch_size) # ceiling division
|
total_batches = -(-len(new_zips) // batch_size) # ceiling division
|
||||||
client = CalibreClient(cfg.calibre)
|
client = CalibreClient(cfg.calibre)
|
||||||
|
|
||||||
|
# Pre-load existing book titles so duplicate detection doesn't need per-book OPDS searches
|
||||||
|
try:
|
||||||
|
from uploader import fetch_all_books
|
||||||
|
existing = fetch_all_books(cfg.calibre)
|
||||||
|
client.preload_existing_titles(existing)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Could not pre-load existing books (%s) — will fall back to per-book OPDS search", exc)
|
||||||
|
|
||||||
for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1):
|
for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1):
|
||||||
chunk = new_zips[i : i + batch_size]
|
chunk = new_zips[i : i + batch_size]
|
||||||
log.info("Batch %d/%d — processing %d zip(s)", batch_num, total_batches, len(chunk))
|
log.info("Batch %d/%d — processing %d zip(s)", batch_num, total_batches, len(chunk))
|
||||||
|
|||||||
+24
-4
@@ -36,6 +36,17 @@ class CalibreClient:
|
|||||||
self._authenticated = False
|
self._authenticated = False
|
||||||
self._upload_csrf: str | None = None
|
self._upload_csrf: str | None = None
|
||||||
self._consecutive_failures = 0
|
self._consecutive_failures = 0
|
||||||
|
# Pre-loaded title word-sets for fast duplicate detection (set by preload_existing_titles)
|
||||||
|
self._existing_title_sets: list[frozenset[str]] | None = None
|
||||||
|
|
||||||
|
def preload_existing_titles(self, books: list[dict]) -> None:
    """Index the titles of already-known books for in-memory duplicate checks.

    Every entry of ``books`` with a truthy ``"title"`` value contributes one
    frozenset holding the words returned by ``_normalize_words`` for that
    title; entries without a title are skipped. The finished list is stored
    on ``self._existing_title_sets``, replacing whatever was there before,
    and the number of indexed titles is logged.
    """
    title_index: list[frozenset[str]] = []
    for book in books:
        title = book.get("title")
        if not title:
            # No usable title -> nothing to compare against later.
            continue
        title_index.append(frozenset(_normalize_words(title)))

    # Assign only after the whole list is built, so a failure while
    # normalising leaves the previous index untouched.
    self._existing_title_sets = title_index
    log.info("Pre-loaded %d existing book titles for duplicate detection", len(title_index))
|
||||||
|
|
||||||
def _ensure_auth(self) -> None:
|
def _ensure_auth(self) -> None:
|
||||||
if self._authenticated:
|
if self._authenticated:
|
||||||
@@ -58,11 +69,22 @@ class CalibreClient:
|
|||||||
log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
|
log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
|
||||||
|
|
||||||
def _exists_in_calibre(self, filename: str) -> bool:
|
def _exists_in_calibre(self, filename: str) -> bool:
|
||||||
"""Search Calibre-Web OPDS for a title matching this filename. Returns True if likely duplicate."""
|
"""Check whether a book already exists in Calibre-Web. Returns True if likely duplicate."""
|
||||||
keywords = _keywords_from_filename(filename)
|
keywords = _keywords_from_filename(filename)
|
||||||
if len(keywords) < 2:
|
if len(keywords) < 2:
|
||||||
return False
|
return False
|
||||||
|
our_words = set(keywords)
|
||||||
|
|
||||||
|
# Fast path: check pre-loaded title index (available when sync pre-fetches all books)
|
||||||
|
if self._existing_title_sets is not None:
|
||||||
|
for their_words in self._existing_title_sets:
|
||||||
|
overlap = len(our_words & their_words)
|
||||||
|
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
|
||||||
|
log.info("Duplicate (preloaded index): '%s'", filename)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Slow path fallback: OPDS search (used when no index is available)
|
||||||
query = " ".join(keywords[:6])
|
query = " ".join(keywords[:6])
|
||||||
try:
|
try:
|
||||||
resp = self._session.get(
|
resp = self._session.get(
|
||||||
@@ -76,13 +98,11 @@ class CalibreClient:
|
|||||||
if not calibre_titles:
|
if not calibre_titles:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
our_words = set(keywords)
|
|
||||||
for title in calibre_titles:
|
for title in calibre_titles:
|
||||||
their_words = set(_normalize_words(title))
|
their_words = set(_normalize_words(title))
|
||||||
overlap = len(our_words & their_words)
|
overlap = len(our_words & their_words)
|
||||||
# Match if 3+ words in common, or 60%+ of our keywords match
|
|
||||||
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
|
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
|
||||||
log.info("Duplicate found in Calibre-Web: '%s' ~ '%s'", filename, title)
|
log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title)
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
|
log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
|
||||||
|
|||||||
Reference in New Issue
Block a user