From c0e1cb068891279f8e00d312af6f94550832840a Mon Sep 17 00:00:00 2001 From: grymphen Date: Wed, 13 May 2026 18:24:55 +0200 Subject: [PATCH] cwa import --- config.py | 23 +-- db.py | 79 ++------ docker-compose.yml | 2 + main.py | 136 +------------ requirements.txt | 1 - sync.py | 63 ++---- templates/books.html | 46 ----- templates/duplicates.html | 124 ------------ templates/index.html | 18 +- templates/settings.html | 41 +--- uploader.py | 413 -------------------------------------- 11 files changed, 60 insertions(+), 886 deletions(-) delete mode 100644 templates/books.html delete mode 100644 templates/duplicates.html delete mode 100644 uploader.py diff --git a/config.py b/config.py index 315dd51..adfeca5 100644 --- a/config.py +++ b/config.py @@ -14,18 +14,11 @@ class SFTPConfig: remote_path: str = "" -@dataclass -class CalibreConfig: - url: str = "" - user: str = "" - password: str = "" - - @dataclass class AppConfig: sftp: SFTPConfig = field(default_factory=SFTPConfig) - calibre: CalibreConfig = field(default_factory=CalibreConfig) - local_work_dir: str = "/tmp/calibresync" + work_dir: str = "/tmp/calibresync" + import_dir: str = "" def load() -> AppConfig: @@ -40,12 +33,8 @@ def load() -> AppConfig: password=s.get("sftp_password", ""), remote_path=s.get("sftp_remote_path", ""), ), - calibre=CalibreConfig( - url=s.get("calibre_url", "").rstrip("/"), - user=s.get("calibre_user", ""), - password=s.get("calibre_pass", ""), - ), - local_work_dir=s.get("local_work_dir", "/tmp/calibresync"), + work_dir=s.get("work_dir", "/tmp/calibresync"), + import_dir=s.get("import_dir", ""), ) @@ -53,8 +42,8 @@ def save(form: dict) -> None: keys = [ "sftp_host", "sftp_port", "sftp_user", "sftp_auth_method", "sftp_password", "sftp_remote_path", - "calibre_url", "calibre_user", "calibre_pass", - "local_work_dir", "scheduler_interval_minutes", "sync_batch_size", + "work_dir", "import_dir", + "scheduler_interval_minutes", "sync_batch_size", ] for key in keys: if key in form and form[key] is not None: diff --git a/db.py b/db.py index e759dc7..fc85aa9 100644 --- a/db.py +++ b/db.py @@ -47,26 +47,16 @@ def init_db() -> None: error_msg TEXT ); - CREATE TABLE IF NOT EXISTS uploaded_books ( - id INTEGER PRIMARY KEY, - filename TEXT NOT NULL, - file_hash TEXT UNIQUE NOT NULL, - zip_source TEXT, - uploaded_at TEXT, - status TEXT - ); - CREATE TABLE IF NOT EXISTS sync_runs ( - id INTEGER PRIMARY KEY, - started_at TEXT NOT NULL, - finished_at TEXT, - zips_found INTEGER DEFAULT 0, - zips_new INTEGER DEFAULT 0, - books_uploaded INTEGER DEFAULT 0, - books_skipped INTEGER DEFAULT 0, + id INTEGER PRIMARY KEY, + started_at TEXT NOT NULL, + finished_at TEXT, + zips_found INTEGER DEFAULT 0, + zips_new INTEGER DEFAULT 0, + books_imported INTEGER DEFAULT 0, books_errored INTEGER DEFAULT 0, - status TEXT DEFAULT 'running', - error_msg TEXT + status TEXT DEFAULT 'running', + error_msg TEXT ); CREATE TABLE IF NOT EXISTS remote_zip_cache ( @@ -171,40 +161,6 @@ def get_recent_zips(limit: int = 50) -> list[sqlite3.Row]: ).fetchall() -# --- Uploaded books --- - -def is_book_uploaded(file_hash: str) -> bool: - with get_db() as conn: - row = conn.execute( - "SELECT id FROM uploaded_books WHERE file_hash = ? AND status IN ('uploaded', 'skipped_duplicate')", - (file_hash,), - ).fetchone() - return row is not None - - -def record_book(filename: str, file_hash: str, zip_source: str, status: str) -> None: - with get_db() as conn: - conn.execute( - """INSERT INTO uploaded_books (filename, file_hash, zip_source, uploaded_at, status) - VALUES (?, ?, ?, ?, ?) - ON CONFLICT(file_hash) DO UPDATE SET status = excluded.status""", - (filename, file_hash, zip_source, _now(), status), - ) - - -def get_books(limit: int = 200, offset: int = 0) -> list[sqlite3.Row]: - with get_db() as conn: - return conn.execute( - "SELECT * FROM uploaded_books ORDER BY uploaded_at DESC LIMIT ? OFFSET ?", - (limit, offset), - ).fetchall() - - -def get_books_count() -> int: - with get_db() as conn: - return conn.execute("SELECT COUNT(*) FROM uploaded_books").fetchone()[0] - - # --- Sync runs --- def start_sync_run() -> int: @@ -233,35 +189,28 @@ def get_recent_runs(limit: int = 10) -> list[sqlite3.Row]: def get_stats() -> dict: with get_db() as conn: - total_books = conn.execute("SELECT COUNT(*) FROM uploaded_books").fetchone()[0] - uploaded = conn.execute( - "SELECT COUNT(*) FROM uploaded_books WHERE status = 'uploaded'" - ).fetchone()[0] - skipped = conn.execute( - "SELECT COUNT(*) FROM uploaded_books WHERE status = 'skipped_duplicate'" - ).fetchone()[0] total_zips = conn.execute("SELECT COUNT(*) FROM processed_zips").fetchone()[0] + total_imported = conn.execute( + "SELECT COALESCE(SUM(books_imported), 0) FROM sync_runs" + ).fetchone()[0] last_run = conn.execute( "SELECT started_at, status FROM sync_runs ORDER BY started_at DESC LIMIT 1" ).fetchone() return { - "total_books": total_books, - "uploaded": uploaded, - "skipped": skipped, "total_zips": total_zips, + "total_imported": total_imported, "last_run": dict(last_run) if last_run else None, } def clear_sync_data() -> dict: - """Delete all processed_zips, uploaded_books, and sync_runs rows. Settings are kept. + """Delete all processed_zips and sync_runs rows. Settings are kept. Also resets the remote scan timestamp so the next sync does a full rescan.""" with get_db() as conn: zips = conn.execute("DELETE FROM processed_zips").rowcount - books = conn.execute("DELETE FROM uploaded_books").rowcount runs = conn.execute("DELETE FROM sync_runs").rowcount conn.execute("DELETE FROM settings WHERE key = 'remote_cache_last_scan'") - return {"zips": zips, "books": books, "runs": runs} + return {"zips": zips, "runs": runs} def _now() -> str: diff --git a/docker-compose.yml b/docker-compose.yml index 3b79528..e09f12a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,8 @@ services: volumes: # Persists the SQLite database and settings across container restarts - ./data:/app/data + # CWA import folder — set the host path to match your CWA ingest directory + - /path/to/cwa-import:/cwa-import # Optional: mount your SSH private key read-only instead of pasting it in the UI # - ~/.ssh/id_rsa:/run/secrets/ssh_key:ro restart: unless-stopped diff --git a/main.py b/main.py index 9b70385..e2b15ae 100644 --- a/main.py +++ b/main.py @@ -12,8 +12,6 @@ import config import db import sftp as sftp_module import sync -import uploader -from uploader import CalibreClient, delete_book, fetch_all_books, find_duplicate_groups logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s — %(message)s") log = logging.getLogger(__name__) @@ -77,23 +75,6 @@ async def dashboard(request: Request): }) -# --- Books --- - -@app.get("/books", response_class=HTMLResponse) -async def books_page(request: Request, page: int = 1): - per_page = 50 - offset = (page - 1) * per_page - books = [dict(b) for b in db.get_books(limit=per_page, offset=offset)] - total = db.get_books_count() - pages = max(1, (total + per_page - 1) // per_page) - return templates.TemplateResponse(request, "books.html", { - "books": books, - "page": page, - "pages": pages, - "total": total, - }) - - # --- Settings --- @app.get("/settings", response_class=HTMLResponse) @@ -117,10 +98,8 @@ async def save_settings( sftp_key: str = Form(""), sftp_password: str = Form(""), sftp_remote_path: str = Form(""), - calibre_url: str = Form(""), - calibre_user: str = Form(""), - calibre_pass: str = Form(""), - local_work_dir: str = Form("/tmp/calibresync"), + work_dir: str = Form("/tmp/calibresync"), + import_dir: str = Form(""), scheduler_interval_minutes: str = Form("0"), sync_batch_size: str = Form("0"), ): @@ -132,10 +111,8 @@ async def save_settings( "sftp_key": sftp_key, "sftp_password": sftp_password, "sftp_remote_path": sftp_remote_path, - "calibre_url": calibre_url, - "calibre_user": calibre_user, - "calibre_pass": calibre_pass, - "local_work_dir": local_work_dir, + "work_dir": work_dir, + "import_dir": import_dir, "scheduler_interval_minutes": scheduler_interval_minutes, "sync_batch_size": sync_batch_size, }) @@ -179,111 +156,6 @@ async def test_ssh(): return {"ok": ok, "message": message} -@app.get("/api/test/calibre") -async def test_calibre(): - cfg = config.load() - ok, message = uploader.test_connection(cfg.calibre) - return {"ok": ok, "message": message} - - -# --- Duplicates --- - -@app.get("/duplicates", response_class=HTMLResponse) -async def duplicates_page(request: Request): - cfg = config.load() - error = None - groups: list = [] - total_books = 0 - try: - books = fetch_all_books(cfg.calibre) - total_books = len(books) - groups = find_duplicate_groups(books) - except Exception as e: - error = str(e) - return templates.TemplateResponse(request, "duplicates.html", { - "groups": groups, - "total_books": total_books, - "error": error, - }) - - -@app.post("/api/delete_book/{book_id}") -async def delete_book_api(book_id: int): - cfg = config.load() - ok, message = delete_book(cfg.calibre, book_id) - return {"ok": ok, "message": message} - - -_dedup_state: dict = {"running": False, "deleted": 0, "failed": 0, "total": 0, "done": False, "error": None} - - -def _run_dedup(): - global _dedup_state - try: - cfg = config.load() - log.info("Dedup: fetching all books ...") - client = CalibreClient(cfg.calibre) - client._ensure_auth() - books = fetch_all_books(cfg.calibre) - groups = find_duplicate_groups(books) - to_delete = [b for group in groups for b in sorted(group, key=lambda x: x.get("id", 0))[1:]] - _dedup_state.update({"total": len(to_delete), "deleted": 0, "failed": 0}) - log.info("Dedup: %d duplicate(s) to delete across %d group(s)", len(to_delete), len(groups)) - for book in to_delete: - ok, msg = delete_book(cfg.calibre, book["id"], client) - if ok: - _dedup_state["deleted"] += 1 - else: - _dedup_state["failed"] += 1 - log.warning("Dedup: failed to delete book %d: %s", book["id"], msg) - if _dedup_state["deleted"] % 10 == 0: - log.info("Dedup progress: %d / %d deleted", _dedup_state["deleted"], _dedup_state["total"]) - log.info("Dedup done: %d deleted, %d failed", _dedup_state["deleted"], _dedup_state["failed"]) - except Exception as e: - log.error("Dedup error: %s", e) - _dedup_state["error"] = str(e) - finally: - _dedup_state["running"] = False - _dedup_state["done"] = True - - -@app.post("/api/delete_duplicates") -async def delete_duplicates_api(background_tasks: BackgroundTasks): - if _dedup_state["running"]: - return {"ok": False, "message": "Already running"} - _dedup_state.update({"running": True, "deleted": 0, "failed": 0, "total": 0, "done": False, "error": None}) - background_tasks.add_task(_run_dedup) - return {"ok": True, "message": "Started"} - - -@app.get("/api/delete_duplicates/status") -async def delete_duplicates_status(): - return _dedup_state - - -@app.get("/api/debug/calibre_books") -async def debug_calibre_books(): - """Show raw Calibre-Web listbooks response shape so we can identify field names.""" - cfg = config.load() - from uploader import CalibreClient - client = CalibreClient(cfg.calibre) - client._ensure_auth() - resp = client._session.get( - f"{cfg.calibre.url}/ajax/listbooks", - params={"draw": 1, "start": 0, "length": 5, "sort": "title", "order": "asc"}, - timeout=30, - ) - data = resp.json() - non_list = {k: v for k, v in data.items() if not isinstance(v, list)} - list_keys = {k: len(v) for k, v in data.items() if isinstance(v, list)} - return { - "http_status": resp.status_code, - "top_level_keys": list(data.keys()), - "non_list_fields": non_list, - "list_fields_lengths": list_keys, - } - - # --- Data reset --- @app.post("/settings/reset-sync-data") diff --git a/requirements.txt b/requirements.txt index 305da9b..66e2534 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ jinja2 python-multipart paramiko rarfile -requests apscheduler diff --git a/sync.py b/sync.py index 84564f3..dd54e24 100644 --- a/sync.py +++ b/sync.py @@ -1,4 +1,5 @@ import logging +import shutil import threading import time from pathlib import Path @@ -7,7 +8,6 @@ import config import db import extractor import sftp as sftp_module -from uploader import CalibreClient, CalibreUnavailableError log = logging.getLogger(__name__) @@ -33,22 +33,23 @@ def run_sync(limit: int | None = None) -> None: _running = True run_id = db.start_sync_run() - counters = dict(zips_found=0, zips_new=0, books_uploaded=0, books_skipped=0, books_errored=0) + counters = dict(zips_found=0, zips_new=0, books_imported=0, books_errored=0) try: log.info("Sync started (limit=%s)", limit) cfg = config.load() _validate_config(cfg) - log.info("Config OK — work dir: %s", cfg.local_work_dir) + log.info("Config OK — work dir: %s, import dir: %s", cfg.work_dir, cfg.import_dir) - work_dir = Path(cfg.local_work_dir) + work_dir = Path(cfg.work_dir) work_dir.mkdir(parents=True, exist_ok=True) - log.info("Work dir ready: %s", work_dir) + + import_dir = Path(cfg.import_dir) + import_dir.mkdir(parents=True, exist_ok=True) log.info("Connecting to SFTP %s@%s:%s ...", cfg.sftp.user, cfg.sftp.host, cfg.sftp.port) new_zips = sftp_module.list_new_zips(cfg.sftp, max_results=limit) counters["zips_found"] = len(new_zips) - counters["zips_new"] = len(new_zips) if not new_zips: @@ -56,21 +57,11 @@ def run_sync(limit: int | None = None) -> None: db.finish_sync_run(run_id, status="success", **counters) return - # Determine chunk size; 0 means process everything in one chunk batch_size = int(db.get_setting("sync_batch_size", "0") or "0") if batch_size <= 0: batch_size = len(new_zips) - total_batches = -(-len(new_zips) // batch_size) # ceiling division - client = CalibreClient(cfg.calibre) - - # Pre-load existing book titles so duplicate detection doesn't need per-book OPDS searches - try: - from uploader import fetch_all_books - existing = fetch_all_books(cfg.calibre) - client.preload_existing_titles(existing) - except Exception as exc: - log.warning("Could not pre-load existing books (%s) — will fall back to per-book OPDS search", exc) + total_batches = -(-len(new_zips) // batch_size) for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1): chunk = new_zips[i : i + batch_size] @@ -89,34 +80,21 @@ def run_sync(limit: int | None = None) -> None: books = extractor.extract(local_zip, work_dir / "extracted") log.info("Extract done in %.1fs — %d book(s)", time.monotonic() - t1, len(books)) - books_errored_this_zip = 0 for book in books: - t2 = time.monotonic() - status = client.upload(book, zip_source=remote_zip.remote_path) - log.info("Upload '%s' → %s (%.1fs)", book.name, status, time.monotonic() - t2) - time.sleep(2) - if status == "uploaded": - counters["books_uploaded"] += 1 - elif status == "skipped_duplicate": - counters["books_skipped"] += 1 + dest = import_dir / book.name + if dest.exists(): + log.info("Skipping '%s' — already exists in import dir", book.name) else: - counters["books_errored"] += 1 - books_errored_this_zip += 1 - - if books_errored_this_zip: - zip_status = "error" - zip_error = f"{books_errored_this_zip} book upload(s) failed — will retry next sync" + shutil.move(str(book), str(dest)) + log.info("Moved '%s' → %s", book.name, import_dir) + counters["books_imported"] += 1 extractor.cleanup(work_dir / "extracted" / local_zip.stem) - except CalibreUnavailableError as e: - log.error("Calibre-Web unavailable — aborting sync run: %s", e) - db.mark_zip_processed(remote_zip.remote_path, remote_zip.file_size, "error", str(e)) - db.finish_sync_run(run_id, status="error", error_msg=str(e), **counters) - return except Exception as e: log.error("Error processing %s: %s", remote_zip.remote_path, e) zip_status = "error" zip_error = str(e) + counters["books_errored"] += 1 finally: if local_zip and local_zip.exists(): extractor.cleanup(local_zip) @@ -126,9 +104,8 @@ def run_sync(limit: int | None = None) -> None: db.finish_sync_run(run_id, status="success", **counters) log.info( - "Sync complete. Total zips: %d, Uploaded: %d, Skipped: %d, Errors: %d", - counters["zips_new"], counters["books_uploaded"], - counters["books_skipped"], counters["books_errored"], + "Sync complete. Total zips: %d, Imported: %d, Errors: %d", + counters["zips_new"], counters["books_imported"], counters["books_errored"], ) except Exception as e: log.exception("Sync run failed: %s", e) @@ -150,9 +127,7 @@ def _validate_config(cfg) -> None: missing.append("SSH private key") if cfg.sftp.auth_method == "password" and not cfg.sftp.password: missing.append("SSH password") - if not cfg.calibre.url: - missing.append("Calibre-Web URL") - if not cfg.calibre.user: - missing.append("Calibre-Web username") + if not cfg.import_dir: + missing.append("CWA import folder") if missing: raise ValueError(f"Missing configuration: {', '.join(missing)}") diff --git a/templates/books.html b/templates/books.html deleted file mode 100644 index 1546e49..0000000 --- a/templates/books.html +++ /dev/null @@ -1,46 +0,0 @@ -{% extends "base.html" %} -{% block title %}Books — CalibreSync{% endblock %} - -{% block content %} - - -{% if books %} - - - - - - - - - - - {% for b in books %} - - - - - - - {% endfor %} - -
FilenameStatusSource zipUploaded
{{ b.filename }}{{ b.status }}{{ b.zip_source or "—" }}{{ b.uploaded_at[:19].replace("T"," ") if b.uploaded_at else "—" }}
- -{% if pages > 1 %} - -{% endif %} - -{% else %} -

No books recorded yet.

-{% endif %} -{% endblock %} diff --git a/templates/duplicates.html b/templates/duplicates.html deleted file mode 100644 index 09afc6e..0000000 --- a/templates/duplicates.html +++ /dev/null @@ -1,124 +0,0 @@ -{% extends "base.html" %} -{% block title %}Duplicates — CalibreSync{% endblock %} - -{% block content %} - - - - -{% if error %} -
Could not fetch books from Calibre-Web: {{ error }}
-{% else %} -

- Scanned {{ total_books }} book(s) — - {% if groups %} - found {{ groups|length }} duplicate group(s) (same title + author). - The oldest copy (lowest ID) is kept when deleting all. - {% else %} - no duplicates found. - {% endif %} -

- - {% for group in groups %} -
-

{{ group[0].title }}

- - - - - - - - - - - - {% for book in group %} - - - - - - - - {% endfor %} - -
IDTitleAuthorsFormat
{{ book.id }}{{ book.title }}{{ book.authors }}{{ book.format or "—" }} - - -
-
- {% endfor %} -{% endif %} - - -{% endblock %} diff --git a/templates/index.html b/templates/index.html index baf41e5..59535d3 100644 --- a/templates/index.html +++ b/templates/index.html @@ -59,16 +59,8 @@
Zip archives processed
-
{{ stats.uploaded }}
-
Books uploaded
-
-
-
{{ stats.skipped }}
-
Duplicates skipped
-
-
-
{{ stats.total_books }}
-
Total book records
+
{{ stats.total_imported }}
+
Books imported
@@ -81,8 +73,7 @@ Finished Status New zips - Uploaded - Skipped + Imported Errors @@ -93,8 +84,7 @@ {{ r.finished_at[:19].replace("T"," ") if r.finished_at else "—" }} {{ r.status }} {{ r.zips_new }} - {{ r.books_uploaded }} - {{ r.books_skipped }} + {{ r.books_imported }} {{ r.books_errored }} {% endfor %} diff --git a/templates/settings.html b/templates/settings.html index 8f29c94..12f654b 100644 --- a/templates/settings.html +++ b/templates/settings.html @@ -84,39 +84,20 @@ -
-

Calibre-Web

- -
- - -
- -
- - -
- -
- - -
- -
- -

-
-
-

Local

- - + + +

Folder watched by Calibre-Web-Automated. Extracted epub/pdf files are moved here flat.

+
+ +
+ +

Temporary storage for downloaded zips and extracted files. Cleaned up after each run.

@@ -182,7 +163,7 @@ async function testConn(type, btn) { result.className = "test-result test-fail"; } finally { btn.disabled = false; - btn.textContent = type === "ssh" ? "Test SSH connection" : "Test Calibre-Web connection"; + btn.textContent = "Test SSH connection"; } } diff --git a/uploader.py b/uploader.py deleted file mode 100644 index 28c0661..0000000 --- a/uploader.py +++ /dev/null @@ -1,413 +0,0 @@ -import hashlib -import logging -import re -import time -import unicodedata -from pathlib import Path -from urllib.parse import quote - -import requests - -import db -from config import CalibreConfig - -log = logging.getLogger(__name__) - -MIME_TYPES = { - ".epub": "application/epub+zip", - ".pdf": "application/pdf", -} - -# Words stripped before comparing titles — release-group tags, language codes, format names, etc. -_JUNK_WORDS = { - "retail", "epub", "ebook", "pdf", "mobi", "azw3", "decipher", - "swedish", "english", "danish", "norwegian", "finnish", "german", "french", - "the", "a", "an", "och", "und", "les", "der", "die", "das", -} - - -class CalibreUnavailableError(RuntimeError): - """Raised when Calibre-Web returns repeated 502/503/504 — sync run should abort.""" - - -class CalibreClient: - def __init__(self, cfg: CalibreConfig): - self._cfg = cfg - self._session = requests.Session() - self._authenticated = False - self._upload_csrf: str | None = None - self._consecutive_failures = 0 - # Pre-loaded title word-sets for fast duplicate detection (set by preload_existing_titles) - self._existing_title_sets: list[frozenset[str]] | None = None - - def preload_existing_titles(self, books: list[dict]) -> None: - """Build an in-memory index of normalised title keywords from a pre-fetched book list.""" - self._existing_title_sets = [ - frozenset(_normalize_words(b.get("title", ""))) - for b in books - if b.get("title") - ] - log.info("Pre-loaded %d existing book titles for duplicate detection", len(self._existing_title_sets)) - - def _ensure_auth(self) -> None: - if self._authenticated: - return - login_url = f"{self._cfg.url}/login" - page = self._session.get(login_url, timeout=30) - page.raise_for_status() - csrf = _extract_csrf(page.text) - - data = {"username": self._cfg.user, "password": self._cfg.password} - if csrf: - data["csrf_token"] = csrf - - resp = self._session.post(login_url, data=data, allow_redirects=True, timeout=30) - resp.raise_for_status() - if resp.url.rstrip("/").endswith("/login"): - raise RuntimeError("Calibre-Web authentication failed — check credentials") - self._authenticated = True - self._upload_csrf = _extract_csrf(resp.text) or csrf - log.info("Authenticated to Calibre-Web at %s", self._cfg.url) - - def _exists_in_calibre(self, filename: str) -> bool: - """Check whether a book already exists in Calibre-Web. Returns True if likely duplicate.""" - keywords = _keywords_from_filename(filename) - if len(keywords) < 2: - return False - our_words = set(keywords) - - # Fast path: check pre-loaded title index (available when sync pre-fetches all books) - if self._existing_title_sets is not None: - for their_words in self._existing_title_sets: - if not their_words: - continue - overlap = len(our_words & their_words) - # Match if: 3+ words in common, OR 60%+ of filename keywords match the title, - # OR 60%+ of the stored title's words appear in the filename keywords. - # The third condition catches short titles drowned out by filename noise. - if (overlap >= 3 - or (overlap / len(our_words) >= 0.6) - or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)): - log.info("Duplicate (preloaded index): '%s'", filename) - return True - return False - - # Slow path fallback: OPDS search (used when no index is available) - query = " ".join(keywords[:6]) - try: - resp = self._session.get( - f"{self._cfg.url}/opds/search/{quote(query, safe='')}", - auth=(self._cfg.user, self._cfg.password), - timeout=15, - ) - if resp.status_code == 404: - return False - calibre_titles = _parse_opds_titles(resp.text) - if not calibre_titles: - return False - - for title in calibre_titles: - their_words = set(_normalize_words(title)) - if not their_words: - continue - overlap = len(our_words & their_words) - if (overlap >= 3 - or (overlap / len(our_words) >= 0.6) - or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)): - log.info("Duplicate (OPDS search): '%s'", filename) - return True - except Exception as e: - log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e) - return False - - def upload(self, book_path: Path, zip_source: str) -> str: - """Upload a book file. Returns status: 'uploaded' | 'skipped_duplicate' | 'error'.""" - file_hash = _sha256(book_path) - - # Primary guard: hash already in our DB - if db.is_book_uploaded(file_hash): - log.info("Skipping (already uploaded): %s", book_path.name) - db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate") - return "skipped_duplicate" - - try: - self._ensure_auth() - - # Secondary guard: title search in Calibre-Web (catches pre-existing books) - if self._exists_in_calibre(book_path.name): - log.info("Skipping (exists in Calibre-Web): %s", book_path.name) - db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate") - return "skipped_duplicate" - - mime = MIME_TYPES.get(book_path.suffix.lower(), "application/octet-stream") - for attempt in range(1, 4): - try: - with book_path.open("rb") as fh: - resp = self._session.post( - f"{self._cfg.url}/upload", - files={"btn-upload": (book_path.name, fh, mime)}, - data={"csrf_token": self._upload_csrf} if self._upload_csrf else {}, - timeout=120, - ) - if not resp.ok: - log.error("Upload HTTP %s (attempt %d/3) — body: %s", resp.status_code, attempt, resp.text[:300]) - resp.raise_for_status() - log.info("Uploaded: %s", book_path.name) - self._consecutive_failures = 0 - db.record_book(book_path.name, file_hash, zip_source, "uploaded") - # Add to in-session index so a later zip with the same title is skipped - if self._existing_title_sets is not None: - kw = frozenset(_keywords_from_filename(book_path.name)) - if kw: - self._existing_title_sets.append(kw) - return "uploaded" - except requests.HTTPError: - if resp.status_code in (502, 503, 504): - if attempt < 3: - log.warning("HTTP %s on attempt %d/3 — retrying in 180s ...", resp.status_code, attempt) - time.sleep(180) - continue - # All retries exhausted - self._consecutive_failures += 1 - if self._consecutive_failures >= 3: - raise CalibreUnavailableError( - f"Calibre-Web returned {resp.status_code} on {self._consecutive_failures} " - "consecutive books — aborting sync run" - ) - break - if resp.status_code == 400 and attempt == 1: - log.warning("HTTP 400 — CSRF token likely expired, re-authenticating ...") - self._authenticated = False - self._upload_csrf = None - self._ensure_auth() - continue - break - - db.record_book(book_path.name, file_hash, zip_source, "error") - return "error" - except CalibreUnavailableError: - db.record_book(book_path.name, file_hash, zip_source, "error") - raise - except Exception as e: - log.error("Upload failed for %s: %s", book_path.name, e) - db.record_book(book_path.name, file_hash, zip_source, "error") - return "error" - - -def fetch_all_books(cfg: CalibreConfig) -> list[dict]: - """Fetch every book from Calibre-Web. Tries /ajax/listbooks first; falls back to OPDS if pagination is broken.""" - client = CalibreClient(cfg) - client._ensure_auth() - all_books: list[dict] = [] - seen_ids: set = set() - page_size = 1000 - start = 0 - reported_total = 0 - while True: - resp = client._session.get( - f"{cfg.url}/ajax/listbooks", - params={ - "draw": 1, - "start": start, "length": page_size, - "iDisplayStart": start, "iDisplayLength": page_size, - }, - timeout=60, - ) - resp.raise_for_status() - data = resp.json() - if start == 0: - non_list = {k: v for k, v in data.items() if not isinstance(v, list)} - log.info("listbooks page-0 meta fields: %s", non_list) - rows = data.get("rows") or data.get("data") or [] - reported_total = ( - data.get("recordsTotal") or data.get("total_count") or - data.get("total") or data.get("totalNotFiltered") or 0 - ) - new_in_page = 0 - for b in rows: - bid = b.get("id") - if bid not in seen_ids: - seen_ids.add(bid) - all_books.append(b) - new_in_page += 1 - log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page) - if not rows or new_in_page == 0 or len(all_books) >= reported_total: - break - start += len(rows) - - # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead - if reported_total > 0 and len(all_books) < reported_total // 2: - log.warning( - "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.", - len(all_books), reported_total, - ) - return _fetch_all_books_opds(cfg) - return all_books - - -def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]: - """Fetch all books via OPDS catalog, following next-page links.""" - import xml.etree.ElementTree as ET - books: list[dict] = [] - seen_ids: set = set() - url: str | None = f"{cfg.url}/opds/new" - auth = (cfg.user, cfg.password) - session = requests.Session() - - while url: - resp = session.get(url, auth=auth, timeout=30) - if not resp.ok: - log.warning("OPDS fetch failed HTTP %s — %s", resp.status_code, url) - break - try: - root = ET.fromstring(resp.content) - except ET.ParseError as exc: - log.warning("OPDS XML parse error: %s", exc) - break - - next_url: str | None = None - entries_this_page = 0 - for elem in root: - local = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag - if local == "link" and elem.get("rel") == "next": - href = elem.get("href", "") - next_url = href if href.startswith("http") else f"{cfg.url}{href}" - elif local == "entry": - entries_this_page += 1 - title = "" - author_parts: list[str] = [] - book_id: int | None = None - for child in elem: - ctag = child.tag.split("}")[-1] if "}" in child.tag else child.tag - if ctag == "title": - title = child.text or "" - elif ctag == "author": - for gc in child: - if (gc.tag.split("}")[-1] if "}" in gc.tag else gc.tag) == "name": - author_parts.append(gc.text or "") - elif ctag == "link": - m = re.search(r"/download/(\d+)/", child.get("href", "")) - if m and book_id is None: - book_id = int(m.group(1)) - if book_id and book_id not in seen_ids: - seen_ids.add(book_id) - books.append({"id": book_id, "title": title, "authors": " & ".join(author_parts)}) - - log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page) - if not entries_this_page: - break - url = next_url - - return books - - -def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]: - """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead.""" - if client is None: - client = CalibreClient(cfg) - client._ensure_auth() - csrf = client._upload_csrf - if not csrf: - # Try to fetch a CSRF token from the book detail page - try: - page = client._session.get(f"{cfg.url}/book/{book_id}", timeout=15) - csrf = _extract_csrf(page.text) - client._upload_csrf = csrf - except Exception: - pass - for attempt in range(2): - resp = client._session.post( - f"{cfg.url}/delete/{book_id}", - data={"csrf_token": csrf} if csrf else {}, - timeout=30, - ) - if resp.ok: - return True, "Deleted" - if resp.status_code == 400 and attempt == 0: - # CSRF token likely expired; re-authenticate and retry once - log.info("delete_book: 400 on book %d — refreshing CSRF and retrying", book_id) - client._authenticated = False - client._upload_csrf = None - client._ensure_auth() - csrf = client._upload_csrf - continue - return False, f"HTTP {resp.status_code}" - return False, "HTTP 400 after re-auth retry" - - -def find_duplicate_groups(books: list[dict]) -> list[list[dict]]: - """Group books by normalised title+author; return only groups with 2+ entries.""" - from collections import defaultdict - groups: dict[str, list[dict]] = defaultdict(list) - for book in books: - title = re.sub(r"[^\w\s]", " ", book.get("title", "").lower()) - title = re.sub(r"\s+", " ", title).strip() - authors = re.sub(r"[^\w\s]", " ", book.get("authors", "").lower()) - authors = re.sub(r"\s+", " ", authors).strip() - key = f"{title}||{authors}" - if title: - groups[key].append(book) - return sorted( - [g for g in groups.values() if len(g) > 1], - key=lambda g: g[0].get("title", "").lower(), - ) - - -def test_connection(cfg: CalibreConfig) -> tuple[bool, str]: - try: - client = CalibreClient(cfg) - client._ensure_auth() - return True, f"Authenticated to {cfg.url} as '{cfg.user}'." - except Exception as e: - return False, str(e) - - -# --- Helpers --- - -def _ascii_fold(s: str) -> str: - """Strip accents: 'världens' → 'varldens', 'väg' → 'vag'.""" - return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn") - - -def _keywords_from_filename(filename: str) -> list[str]: - """Extract meaningful words from a release-style filename for OPDS search.""" - stem = _ascii_fold(Path(filename).stem.lower()) - stem = re.sub(r"[._\-]", " ", stem) - stem = re.sub(r"[^\w\s]", "", stem) - words = stem.split() - return [ - w for w in words - if w not in _JUNK_WORDS - and not re.match(r"^\d{4}$", w) - and not re.match(r"^\d+$", w) - and len(w) > 1 - ] - - -def _normalize_words(title: str) -> list[str]: - """Normalize a Calibre-Web title for comparison.""" - title = _ascii_fold(title.lower()) - title = re.sub(r"[^\w\s]", "", title) - return [w for w in title.split() if w not in _JUNK_WORDS and len(w) > 1] - - -def _parse_opds_titles(xml: str) -> list[str]: - """Extract book titles from an OPDS Atom feed, skipping the feed title itself.""" - # Grab all elements; the first is the feed title ("Search results"), rest are books - titles = re.findall(r"<title>([^<]+)", xml) - return titles[1:] if len(titles) > 1 else [] - - -def _extract_csrf(html: str) -> str | None: - m = re.search(r'name="csrf_token"\s+value="([^"]+)"', html) - if not m: - m = re.search(r'value="([^"]+)"\s+name="csrf_token"', html) - return m.group(1) if m else None - - -def _sha256(path: Path) -> str: - h = hashlib.sha256() - with path.open("rb") as f: - for chunk in iter(lambda: f.read(65536), b""): - h.update(chunk) - return h.hexdigest()