From bfa09976b7415082f84fb64c38cce191bd886c5f Mon Sep 17 00:00:00 2001
From: grymphen <grymphen@gmail.com>
Date: Tue, 12 May 2026 16:23:13 +0200
Subject: [PATCH] Improve duplicate detection for accented and short titles

---
 uploader.py | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/uploader.py b/uploader.py
index 168355b..28c0661 100644
--- a/uploader.py
+++ b/uploader.py
@@ -2,6 +2,7 @@ import hashlib
 import logging
 import re
 import time
+import unicodedata
 from pathlib import Path
 from urllib.parse import quote
 
@@ -78,8 +79,15 @@ class CalibreClient:
         # Fast path: check pre-loaded title index (available when sync pre-fetches all books)
         if self._existing_title_sets is not None:
             for their_words in self._existing_title_sets:
+                if not their_words:
+                    continue
                 overlap = len(our_words & their_words)
-                if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
+                # Match if: 3+ words in common, OR 60%+ of filename keywords match the title
+                # (skipped when our_words is empty, avoiding a division by zero), OR 60%+ of a
+                # 2+ word stored title appears in the filename keywords (short titles vs. noise).
+                if (overlap >= 3
+                        or (our_words and overlap / len(our_words) >= 0.6)
+                        or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
                     log.info("Duplicate (preloaded index): '%s'", filename)
                     return True
             return False
@@ -100,9 +108,13 @@ class CalibreClient:
 
             for title in calibre_titles:
                 their_words = set(_normalize_words(title))
+                if not their_words:
+                    continue
                 overlap = len(our_words & their_words)
-                if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
-                    log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title)
+                if (overlap >= 3
+                        or (our_words and overlap / len(our_words) >= 0.6)
+                        or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
+                    log.info("Duplicate (OPDS search): '%s'", filename)
                     return True
         except Exception as e:
             log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
@@ -143,6 +155,11 @@ class CalibreClient:
                     log.info("Uploaded: %s", book_path.name)
                     self._consecutive_failures = 0
                     db.record_book(book_path.name, file_hash, zip_source, "uploaded")
+                    # Add to in-session index so a later zip with the same title is skipped
+                    if self._existing_title_sets is not None:
+                        kw = frozenset(_keywords_from_filename(book_path.name))
+                        if kw:
+                            self._existing_title_sets.append(kw)
                     return "uploaded"
                 except requests.HTTPError:
                     if resp.status_code in (502, 503, 504):
@@ -347,24 +364,29 @@ def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
 
 # --- Helpers ---
 
+def _ascii_fold(s: str) -> str:
+    """Strip accents: 'världens' → 'varldens', 'väg' → 'vag'."""
+    return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn")
+
+
 def _keywords_from_filename(filename: str) -> list[str]:
     """Extract meaningful words from a release-style filename for OPDS search."""
-    stem = Path(filename).stem.lower()
+    stem = _ascii_fold(Path(filename).stem.lower())
     stem = re.sub(r"[._\-]", " ", stem)
     stem = re.sub(r"[^\w\s]", "", stem)
     words = stem.split()
     return [
         w for w in words
         if w not in _JUNK_WORDS
-        and not re.match(r"^\d{4}$", w)   # strip year
-        and not re.match(r"^\d+$", w)      # strip pure numbers
+        and not re.match(r"^\d{4}$", w)
+        and not re.match(r"^\d+$", w)
         and len(w) > 1
     ]
 
 
 def _normalize_words(title: str) -> list[str]:
     """Normalize a Calibre-Web title for comparison."""
-    title = title.lower()
+    title = _ascii_fold(title.lower())
     title = re.sub(r"[^\w\s]", "", title)
     return [w for w in title.split() if w not in _JUNK_WORDS and len(w) > 1]