check for doubles
This commit is contained in:
+29
-7
@@ -2,6 +2,7 @@ import hashlib
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
import unicodedata
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
@@ -78,8 +79,15 @@ class CalibreClient:
|
|||||||
# Fast path: check pre-loaded title index (available when sync pre-fetches all books)
|
# Fast path: check pre-loaded title index (available when sync pre-fetches all books)
|
||||||
if self._existing_title_sets is not None:
|
if self._existing_title_sets is not None:
|
||||||
for their_words in self._existing_title_sets:
|
for their_words in self._existing_title_sets:
|
||||||
|
if not their_words:
|
||||||
|
continue
|
||||||
overlap = len(our_words & their_words)
|
overlap = len(our_words & their_words)
|
||||||
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
|
# Match if: 3+ words in common, OR 60%+ of filename keywords match the title,
|
||||||
|
# OR 60%+ of the stored title's words appear in the filename keywords.
|
||||||
|
# The third condition catches short titles drowned out by filename noise.
|
||||||
|
if (overlap >= 3
|
||||||
|
or (overlap / len(our_words) >= 0.6)
|
||||||
|
or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
|
||||||
log.info("Duplicate (preloaded index): '%s'", filename)
|
log.info("Duplicate (preloaded index): '%s'", filename)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@@ -100,9 +108,13 @@ class CalibreClient:
|
|||||||
|
|
||||||
for title in calibre_titles:
|
for title in calibre_titles:
|
||||||
their_words = set(_normalize_words(title))
|
their_words = set(_normalize_words(title))
|
||||||
|
if not their_words:
|
||||||
|
continue
|
||||||
overlap = len(our_words & their_words)
|
overlap = len(our_words & their_words)
|
||||||
if overlap >= 3 or (our_words and overlap / len(our_words) >= 0.6):
|
if (overlap >= 3
|
||||||
log.info("Duplicate (OPDS search): '%s' ~ '%s'", filename, title)
|
or (overlap / len(our_words) >= 0.6)
|
||||||
|
or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
|
||||||
|
log.info("Duplicate (OPDS search): '%s'", filename)
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
|
log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
|
||||||
@@ -143,6 +155,11 @@ class CalibreClient:
|
|||||||
log.info("Uploaded: %s", book_path.name)
|
log.info("Uploaded: %s", book_path.name)
|
||||||
self._consecutive_failures = 0
|
self._consecutive_failures = 0
|
||||||
db.record_book(book_path.name, file_hash, zip_source, "uploaded")
|
db.record_book(book_path.name, file_hash, zip_source, "uploaded")
|
||||||
|
# Add to in-session index so a later zip with the same title is skipped
|
||||||
|
if self._existing_title_sets is not None:
|
||||||
|
kw = frozenset(_keywords_from_filename(book_path.name))
|
||||||
|
if kw:
|
||||||
|
self._existing_title_sets.append(kw)
|
||||||
return "uploaded"
|
return "uploaded"
|
||||||
except requests.HTTPError:
|
except requests.HTTPError:
|
||||||
if resp.status_code in (502, 503, 504):
|
if resp.status_code in (502, 503, 504):
|
||||||
@@ -347,24 +364,29 @@ def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
|
|||||||
|
|
||||||
# --- Helpers ---
|
# --- Helpers ---
|
||||||
|
|
||||||
|
def _ascii_fold(s: str) -> str:
|
||||||
|
"""Strip accents: 'världens' → 'varldens', 'väg' → 'vag'."""
|
||||||
|
return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn")
|
||||||
|
|
||||||
|
|
||||||
def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The filename's stem (the name without its final suffix) is lower-cased
    and accent-folded, the separators ``.``, ``_`` and ``-`` are turned into
    spaces, and any remaining punctuation is removed. The resulting words are
    then filtered.

    Args:
        filename: File name (or path) of the book; only the stem is used.

    Returns:
        The remaining words in their original order. Words are dropped when
        they are listed in ``_JUNK_WORDS``, consist only of decimal digits
        (years and other pure numbers), or are a single character long.
    """
    stem = _ascii_fold(Path(filename).stem.lower())
    # Release names use '.', '_' and '-' as word separators.
    stem = re.sub(r"[._\-]", " ", stem)
    # Anything else that is neither a word character nor whitespace is noise.
    stem = re.sub(r"[^\w\s]", "", stem)
    return [
        w for w in stem.split()
        if w not in _JUNK_WORDS
        # One check covers both 4-digit years and any other pure number.
        and not w.isdecimal()
        and len(w) > 1
    ]
|
||||||
|
|
||||||
|
|
||||||
def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison.

    The title is lower-cased and accent-folded, then tokenized with the same
    separator rule that ``_keywords_from_filename`` applies to filenames:
    ``.``, ``_`` and ``-`` become spaces before the remaining punctuation is
    removed. Using the same rule on both sides means a hyphenated title such
    as "Spider-Man" yields the words ``spider`` and ``man`` — the same words
    a filename like "Spider-Man.epub" produces — instead of the single fused
    word ``spiderman``, which could never overlap with them.

    Args:
        title: Book title as stored in Calibre-Web.

    Returns:
        The title's words in order, excluding entries of ``_JUNK_WORDS`` and
        single-character words. Unlike filename keywords, numeric words are
        kept.
    """
    title = _ascii_fold(title.lower())
    # Split on the same separators as filenames so both sides tokenize alike.
    title = re.sub(r"[._\-]", " ", title)
    title = re.sub(r"[^\w\s]", "", title)
    return [w for w in title.split() if w not in _JUNK_WORDS and len(w) > 1]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user