From bfa09976b7415082f84fb64c38cce191bd886c5f Mon Sep 17 00:00:00 2001 From: grymphen Date: Tue, 12 May 2026 16:23:13 +0200 Subject: [PATCH] check for doubles --- uploader.py | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/uploader.py b/uploader.py index 168355b..28c0661 100644 --- a/uploader.py +++ b/uploader.py @@ -2,6 +2,7 @@ import hashlib import logging import re import time +import unicodedata from pathlib import Path from urllib.parse import quote @@ -78,8 +79,15 @@ class CalibreClient: # Fast path: check pre-loaded title index (available when sync pre-fetches all books) if self._existing_title_sets is not None: for their_words in self._existing_title_sets: + if not their_words: + continue overlap = len(our_words & their_words) - if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6): + # Match if: 3+ words in common, OR 60%+ of filename keywords match the title, + # OR 60%+ of the stored title's words appear in the filename keywords. + # The third condition catches short titles drowned out by filename noise. + if (overlap >= 3 + or (overlap / len(our_words) >= 0.6) + or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)): log.info("Duplicate (preloaded index): '%s'", filename) return True return False @@ -100,9 +108,13 @@ class CalibreClient: for title in calibre_titles: their_words = set(_normalize_words(title)) + if not their_words: + continue overlap = len(our_words & their_words) - if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6): - log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title) + if (overlap >= 3 + or (overlap / len(our_words) >= 0.6) + or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)): + log.info("Duplicate (OPDS search): '%s'", filename) return True except Exception as e: log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e) @@ -143,6 +155,11 @@ class CalibreClient: log.info("Uploaded: %s", book_path.name) self._consecutive_failures = 0 db.record_book(book_path.name, file_hash, zip_source, "uploaded") + # Add to in-session index so a later zip with the same title is skipped + if self._existing_title_sets is not None: + kw = frozenset(_keywords_from_filename(book_path.name)) + if kw: + self._existing_title_sets.append(kw) return "uploaded" except requests.HTTPError: if resp.status_code in (502, 503, 504): @@ -347,24 +364,29 @@ def test_connection(cfg: CalibreConfig) -> tuple[bool, str]: # --- Helpers --- +def _ascii_fold(s: str) -> str: + """Strip accents: 'världens' → 'varldens', 'väg' → 'vag'.""" + return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn") + + def _keywords_from_filename(filename: str) -> list[str]: """Extract meaningful words from a release-style filename for OPDS search.""" - stem = Path(filename).stem.lower() + stem = _ascii_fold(Path(filename).stem.lower()) stem = re.sub(r"[._\-]", " ", stem) stem = re.sub(r"[^\w\s]", "", stem) words = stem.split() return [ w for w in words if w not in _JUNK_WORDS - and not re.match(r"^\d{4}$", w) # strip year - and not re.match(r"^\d+$", w) # strip pure numbers + and not re.match(r"^\d{4}$", w) + and not re.match(r"^\d+$", w) and len(w) > 1 ] def _normalize_words(title: str) -> list[str]: """Normalize a Calibre-Web title for comparison.""" - title = title.lower() + title = _ascii_fold(title.lower()) title = re.sub(r"[^\w\s]", "", title) return [w for w in title.split() if w not in _JUNK_WORDS and len(w) > 1]