check for doubles

This commit is contained in:
2026-05-12 16:23:13 +02:00
parent 7609bd926c
commit bfa09976b7
+29 -7
View File
@@ -2,6 +2,7 @@ import hashlib
import logging import logging
import re import re
import time import time
import unicodedata
from pathlib import Path from pathlib import Path
from urllib.parse import quote from urllib.parse import quote
@@ -78,8 +79,15 @@ class CalibreClient:
# Fast path: check pre-loaded title index (available when sync pre-fetches all books) # Fast path: check pre-loaded title index (available when sync pre-fetches all books)
if self._existing_title_sets is not None: if self._existing_title_sets is not None:
for their_words in self._existing_title_sets: for their_words in self._existing_title_sets:
if not their_words:
continue
overlap = len(our_words & their_words) overlap = len(our_words & their_words)
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6): # Match if: 3+ words in common, OR 60%+ of filename keywords match the title,
# OR 60%+ of the stored title's words appear in the filename keywords.
# The third condition catches short titles drowned out by filename noise.
if (overlap >= 3
or (overlap / len(our_words) >= 0.6)
or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
log.info("Duplicate (preloaded index): '%s'", filename) log.info("Duplicate (preloaded index): '%s'", filename)
return True return True
return False return False
@@ -100,9 +108,13 @@ class CalibreClient:
for title in calibre_titles: for title in calibre_titles:
their_words = set(_normalize_words(title)) their_words = set(_normalize_words(title))
if not their_words:
continue
overlap = len(our_words & their_words) overlap = len(our_words & their_words)
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6): if (overlap >= 3
log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title) or (overlap / len(our_words) >= 0.6)
or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
log.info("Duplicate (OPDS search): '%s'", filename)
return True return True
except Exception as e: except Exception as e:
log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e) log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
@@ -143,6 +155,11 @@ class CalibreClient:
log.info("Uploaded: %s", book_path.name) log.info("Uploaded: %s", book_path.name)
self._consecutive_failures = 0 self._consecutive_failures = 0
db.record_book(book_path.name, file_hash, zip_source, "uploaded") db.record_book(book_path.name, file_hash, zip_source, "uploaded")
# Add to in-session index so a later zip with the same title is skipped
if self._existing_title_sets is not None:
kw = frozenset(_keywords_from_filename(book_path.name))
if kw:
self._existing_title_sets.append(kw)
return "uploaded" return "uploaded"
except requests.HTTPError: except requests.HTTPError:
if resp.status_code in (502, 503, 504): if resp.status_code in (502, 503, 504):
@@ -347,24 +364,29 @@ def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
# --- Helpers --- # --- Helpers ---
def _ascii_fold(s: str) -> str:
"""Strip accents: 'världens''varldens', 'väg''vag'."""
return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn")
def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The file stem (name without its final suffix) is lower-cased and
    accent-folded, the separators ``.``, ``_`` and ``-`` are turned into
    spaces, and any remaining character that is neither a word character
    nor whitespace is removed. The surviving words are returned in their
    original order, minus junk words, pure numbers and one-character words.
    """
    stem = _ascii_fold(Path(filename).stem.lower())
    stem = re.sub(r"[._\-]", " ", stem)
    stem = re.sub(r"[^\w\s]", "", stem)
    return [
        w for w in stem.split()
        if w not in _JUNK_WORDS
        # Pure numbers are dropped; this also covers four-digit years, so a
        # separate year check is unnecessary.
        and not w.isdecimal()
        and len(w) > 1
    ]
def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison.

    The title is lower-cased and accent-folded, every character that is
    neither a word character nor whitespace is removed, and the result is
    split on whitespace. Junk words and one-character words are dropped;
    the remaining words are returned in their original order.
    """
    folded = _ascii_fold(title.lower())
    cleaned = re.sub(r"[^\w\s]", "", folded)
    return [w for w in cleaned.split() if w not in _JUNK_WORDS and len(w) > 1]