diff --git a/main.py b/main.py index e4701b3..a8677dc 100644 --- a/main.py +++ b/main.py @@ -14,8 +14,7 @@ import sftp as sftp_module import sync import uploader -logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s — %(message)s") -logging.getLogger("paramiko").setLevel(logging.INFO) # paramiko debug is too noisy +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s — %(message)s") log = logging.getLogger(__name__) _scheduler = BackgroundScheduler(timezone="UTC") diff --git a/uploader.py b/uploader.py index 9d4b25c..4bc7314 100644 --- a/uploader.py +++ b/uploader.py @@ -2,6 +2,7 @@ import hashlib import logging import re from pathlib import Path +from urllib.parse import quote import requests @@ -15,6 +16,13 @@ MIME_TYPES = { ".pdf": "application/pdf", } +# Words stripped before comparing titles — release-group tags, language codes, format names, etc. +_JUNK_WORDS = { + "retail", "epub", "ebook", "pdf", "mobi", "azw3", "decipher", + "swedish", "english", "danish", "norwegian", "finnish", "german", "french", + "the", "a", "an", "och", "und", "les", "der", "die", "das", +} + class CalibreClient: def __init__(self, cfg: CalibreConfig): @@ -26,7 +34,6 @@ class CalibreClient: def _ensure_auth(self) -> None: if self._authenticated: return - # Fetch login page first to get the CSRF token (Flask-WTF requirement) login_url = f"{self._cfg.url}/login" page = self._session.get(login_url, timeout=30) page.raise_for_status() @@ -36,36 +43,63 @@ class CalibreClient: if csrf: data["csrf_token"] = csrf - resp = self._session.post( - login_url, - data=data, - allow_redirects=True, - timeout=30, - ) + resp = self._session.post(login_url, data=data, allow_redirects=True, timeout=30) resp.raise_for_status() - # Calibre-Web redirects to / on success; landing back on /login means bad creds if resp.url.rstrip("/").endswith("/login"): raise RuntimeError("Calibre-Web authentication failed — check credentials") self._authenticated 
def _exists_in_calibre(self, filename: str) -> bool:
    """Return True if Calibre-Web already seems to hold a book matching *filename*.

    Keywords are extracted from the filename, the first six are sent to the
    Calibre-Web OPDS search endpoint, and every title in the returned feed is
    compared with the full keyword set by word overlap.

    This is a best-effort check: any failure (network error, HTTP error
    status, unparsable feed) is logged as a warning and reported as "not a
    duplicate" so the upload proceeds.

    Args:
        filename: Name of the book file about to be uploaded.

    Returns:
        True when a returned title shares at least 3 words, or at least 60%
        of the filename keywords, with the filename; False otherwise.
    """
    keywords = _keywords_from_filename(filename)
    if len(keywords) < 2:
        # Too little signal to search on; let the upload go ahead.
        return False

    # Cap the query at six words; the comparison below still uses all of them.
    query = " ".join(keywords[:6])
    search_url = f"{self._cfg.url}/opds/search/{quote(query, safe='')}"
    try:
        resp = self._session.get(search_url, timeout=15)
        if resp.status_code == 404:
            return False
        # Other error statuses (401/403/5xx) must not be parsed as a feed and
        # silently read as "no duplicate": raise so the warning below fires.
        resp.raise_for_status()

        calibre_titles = _parse_opds_titles(resp.text)
        if not calibre_titles:
            return False

        # Non-empty by construction: at least two keywords were extracted.
        our_words = set(keywords)
        for title in calibre_titles:
            overlap = len(our_words & set(_normalize_words(title)))
            # Match if 3+ words in common, or 60%+ of our keywords match.
            # NOTE(review): the absolute 3-word rule may also match different
            # volumes of one series that share a long common prefix — confirm
            # this is acceptable.
            if overlap >= 3 or overlap / len(our_words) >= 0.6:
                log.info("Duplicate found in Calibre-Web: '%s' ~ '%s'", filename, title)
                return True
    except Exception as e:
        # Deliberately broad: a failed duplicate check must never block an upload.
        log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
    return False
def _split_title_words(text: str) -> list[str]:
    """Lower-case *text* and split it into words.

    Dots, underscores and hyphens act as word separators; every other
    non-word, non-space character is removed outright.
    """
    text = re.sub(r"[._\-]", " ", text.lower())
    return re.sub(r"[^\w\s]", "", text).split()


def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The file extension is dropped, the stem is tokenized, and junk words
    (``_JUNK_WORDS``), pure numbers (which covers years) and single-character
    tokens are discarded. Word order is preserved.
    """
    return [
        w for w in _split_title_words(Path(filename).stem)
        if len(w) > 1
        and not w.isdecimal()  # strip years and other pure numbers
        and w not in _JUNK_WORDS
    ]


def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison with filename keywords.

    Uses the same tokenizer as ``_keywords_from_filename`` so that hyphenated
    or dotted words ("Spider-Man") split identically on both sides; otherwise
    the title would yield "spiderman" while the filename yields "spider",
    "man", and the overlap would be undercounted.
    """
    return [
        w for w in _split_title_words(title)
        if len(w) > 1 and w not in _JUNK_WORDS
    ]