cwa import

2026-05-13 18:24:55 +02:00
parent bfa09976b7
commit c0e1cb0688
11 changed files with 60 additions and 886 deletions
@@ -14,18 +14,11 @@ class SFTPConfig:
    remote_path: str = ""
# Connection settings for the Calibre-Web server. Every field is a plain
# string that defaults to "" (i.e. "not configured").
@dataclass
 class CalibreConfig:
    url: str = ""  # base URL of the server; request paths such as /login are appended to it
    user: str = ""  # username sent with the login form
    password: str = ""  # password sent with the login form
@dataclass
 class AppConfig:
    sftp: SFTPConfig = field(default_factory=SFTPConfig)
-    calibre: CalibreConfig = field(default_factory=CalibreConfig)
+    work_dir: str = "/tmp/calibresync"
-    local_work_dir: str = "/tmp/calibresync"
+    import_dir: str = ""
 def load() -> AppConfig:
@@ -40,12 +33,8 @@ def load() -> AppConfig:
            password=s.get("sftp_password", ""),
            remote_path=s.get("sftp_remote_path", ""),
        ),
-        calibre=CalibreConfig(
+        work_dir=s.get("work_dir", "/tmp/calibresync"),
-            url=s.get("calibre_url", "").rstrip("/"),
+        import_dir=s.get("import_dir", ""),
            user=s.get("calibre_user", ""),
            password=s.get("calibre_pass", ""),
        ),
        local_work_dir=s.get("local_work_dir", "/tmp/calibresync"),
    )
@@ -53,8 +42,8 @@ def save(form: dict) -> None:
    keys = [
        "sftp_host", "sftp_port", "sftp_user", "sftp_auth_method",
        "sftp_password", "sftp_remote_path",
-        "calibre_url", "calibre_user", "calibre_pass",
+        "work_dir", "import_dir",
-        "local_work_dir", "scheduler_interval_minutes", "sync_batch_size",
+        "scheduler_interval_minutes", "sync_batch_size",
    ]
    for key in keys:
        if key in form and form[key] is not None:
@@ -47,23 +47,13 @@ def init_db() -> None:
                error_msg    TEXT
            );
            CREATE TABLE IF NOT EXISTS uploaded_books (
                id           INTEGER PRIMARY KEY,
                filename     TEXT NOT NULL,
                file_hash    TEXT UNIQUE NOT NULL,
                zip_source   TEXT,
                uploaded_at  TEXT,
                status       TEXT
            );
            CREATE TABLE IF NOT EXISTS sync_runs (
                id            INTEGER PRIMARY KEY,
                started_at    TEXT NOT NULL,
                finished_at   TEXT,
                zips_found    INTEGER DEFAULT 0,
                zips_new      INTEGER DEFAULT 0,
-                books_uploaded INTEGER DEFAULT 0,
+                books_imported INTEGER DEFAULT 0,
                books_skipped  INTEGER DEFAULT 0,
                books_errored  INTEGER DEFAULT 0,
                status        TEXT DEFAULT 'running',
                error_msg     TEXT
@@ -171,40 +161,6 @@ def get_recent_zips(limit: int = 50) -> list[sqlite3.Row]:
        ).fetchall()
 # --- Uploaded books ---
 def is_book_uploaded(file_hash: str) -> bool:
    """Report whether a book with this content hash has already been handled.

    A book counts as handled when ``uploaded_books`` holds a row for
    ``file_hash`` whose status is ``'uploaded'`` or ``'skipped_duplicate'``.
    Rows in any other status do not match the query.
    """
    lookup_sql = "SELECT id FROM uploaded_books WHERE file_hash = ? AND status IN ('uploaded', 'skipped_duplicate')"
    with get_db() as conn:
        cursor = conn.execute(lookup_sql, (file_hash,))
        match = cursor.fetchone()
    already_handled = match is not None
    return already_handled
 def record_book(filename: str, file_hash: str, zip_source: str, status: str) -> None:
    """Store the outcome for one book, keyed by its content hash.

    A new row is inserted with the current timestamp from ``_now()``. If a
    row with the same ``file_hash`` already exists, only its ``status``
    column is overwritten; filename, source zip and timestamp stay as they
    were.
    """
    upsert_sql = """INSERT INTO uploaded_books (filename, file_hash, zip_source, uploaded_at, status)
               VALUES (?, ?, ?, ?, ?)
               ON CONFLICT(file_hash) DO UPDATE SET status = excluded.status"""
    with get_db() as conn:
        values = (filename, file_hash, zip_source, _now(), status)
        conn.execute(upsert_sql, values)
 def get_books(limit: int = 200, offset: int = 0) -> list[sqlite3.Row]:
    """Return one window of ``uploaded_books`` rows, newest ``uploaded_at`` first.

    ``limit`` and ``offset`` are passed straight through as the SQL
    ``LIMIT`` / ``OFFSET`` parameters.
    """
    page_sql = "SELECT * FROM uploaded_books ORDER BY uploaded_at DESC LIMIT ? OFFSET ?"
    with get_db() as conn:
        cursor = conn.execute(page_sql, (limit, offset))
        rows = cursor.fetchall()
    return rows
 def get_books_count() -> int:
    """Return the total number of rows in ``uploaded_books``."""
    with get_db() as conn:
        cursor = conn.execute("SELECT COUNT(*) FROM uploaded_books")
        count_row = cursor.fetchone()
    return count_row[0]
 # --- Sync runs ---
 def start_sync_run() -> int:
@@ -233,35 +189,28 @@ def get_recent_runs(limit: int = 10) -> list[sqlite3.Row]:
 def get_stats() -> dict:
    """Collect the headline counters for the dashboard.

    Returns a dict with these keys:
      * ``total_books``    – number of rows in ``uploaded_books``
      * ``uploaded``       – of those, rows with status ``'uploaded'``
      * ``skipped``        – of those, rows with status ``'skipped_duplicate'``
      * ``total_zips``     – number of rows in ``processed_zips``
      * ``total_imported`` – sum of ``books_imported`` over all sync runs (0 if none)
      * ``last_run``       – ``{"started_at", "status"}`` of the most recently
                             started sync run, or ``None`` when there are no runs
    """
    # Each of these queries yields a single row with a single number. Dict
    # order fixes both the execution order and the order of the result keys.
    single_value_queries = {
        "total_books": "SELECT COUNT(*) FROM uploaded_books",
        "uploaded": "SELECT COUNT(*) FROM uploaded_books WHERE status = 'uploaded'",
        "skipped": "SELECT COUNT(*) FROM uploaded_books WHERE status = 'skipped_duplicate'",
        "total_zips": "SELECT COUNT(*) FROM processed_zips",
        "total_imported": "SELECT COALESCE(SUM(books_imported), 0) FROM sync_runs",
    }
    with get_db() as conn:
        stats = {
            key: conn.execute(sql).fetchone()[0]
            for key, sql in single_value_queries.items()
        }
        latest = conn.execute(
            "SELECT started_at, status FROM sync_runs ORDER BY started_at DESC LIMIT 1"
        ).fetchone()
    stats["last_run"] = dict(latest) if latest else None
    return stats
 def clear_sync_data() -> dict:
-    """Delete all processed_zips, uploaded_books, and sync_runs rows. Settings are kept.
+    """Delete all processed_zips and sync_runs rows. Settings are kept.
    Also resets the remote scan timestamp so the next sync does a full rescan."""
    with get_db() as conn:
        zips = conn.execute("DELETE FROM processed_zips").rowcount
        books = conn.execute("DELETE FROM uploaded_books").rowcount
        runs = conn.execute("DELETE FROM sync_runs").rowcount
        conn.execute("DELETE FROM settings WHERE key = 'remote_cache_last_scan'")
-    return {"zips": zips, "books": books, "runs": runs}
+    return {"zips": zips, "runs": runs}
 def _now() -> str:
@@ -6,6 +6,8 @@ services:
    volumes:
      # Persists the SQLite database and settings across container restarts
      - ./data:/app/data
      # CWA import folder — set the host path to match your CWA ingest directory
      - /path/to/cwa-import:/cwa-import
      # Optional: mount your SSH private key read-only instead of pasting it in the UI
      # - ~/.ssh/id_rsa:/run/secrets/ssh_key:ro
    restart: unless-stopped
@@ -12,8 +12,6 @@ import config
 import db
 import sftp as sftp_module
 import sync
 import uploader
 from uploader import CalibreClient, delete_book, fetch_all_books, find_duplicate_groups
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s — %(message)s")
 log = logging.getLogger(__name__)
@@ -77,23 +75,6 @@ async def dashboard(request: Request):
    })
 # --- Books ---
@app.get("/books", response_class=HTMLResponse)
async def books_page(request: Request, page: int = 1):
    """Render one page (50 rows) of the recorded-books table.

    Args:
        request: the incoming request, handed to the template renderer.
        page: 1-based page number from the query string. Values outside
            ``1..pages`` are clamped to the nearest valid page.

    Returns:
        The rendered ``books.html`` template with ``books``, ``page``,
        ``pages`` and ``total`` in its context.
    """
    per_page = 50
    total = db.get_books_count()
    pages = max(1, (total + per_page - 1) // per_page)  # ceiling division, at least one page
    # Clamp the requested page. page <= 0 would produce a negative OFFSET,
    # and a page past the end would render the "no books" message with no
    # pagination links to get back.
    page = min(max(page, 1), pages)
    offset = (page - 1) * per_page
    books = [dict(b) for b in db.get_books(limit=per_page, offset=offset)]
    return templates.TemplateResponse(request, "books.html", {
        "books": books,
        "page": page,
        "pages": pages,
        "total": total,
    })
 # --- Settings ---
@app.get("/settings", response_class=HTMLResponse)
@@ -117,10 +98,8 @@ async def save_settings(
    sftp_key: str = Form(""),
    sftp_password: str = Form(""),
    sftp_remote_path: str = Form(""),
-    calibre_url: str = Form(""),
+    work_dir: str = Form("/tmp/calibresync"),
-    calibre_user: str = Form(""),
+    import_dir: str = Form(""),
    calibre_pass: str = Form(""),
    local_work_dir: str = Form("/tmp/calibresync"),
    scheduler_interval_minutes: str = Form("0"),
    sync_batch_size: str = Form("0"),
 ):
@@ -132,10 +111,8 @@ async def save_settings(
        "sftp_key": sftp_key,
        "sftp_password": sftp_password,
        "sftp_remote_path": sftp_remote_path,
-        "calibre_url": calibre_url,
+        "work_dir": work_dir,
-        "calibre_user": calibre_user,
+        "import_dir": import_dir,
        "calibre_pass": calibre_pass,
        "local_work_dir": local_work_dir,
        "scheduler_interval_minutes": scheduler_interval_minutes,
        "sync_batch_size": sync_batch_size,
    })
@@ -179,111 +156,6 @@ async def test_ssh():
    return {"ok": ok, "message": message}
@app.get("/api/test/calibre")
async def test_calibre():
    """Run the uploader's connection test against the saved Calibre settings.

    Responds with ``{"ok": ..., "message": ...}``, taken from the pair
    returned by ``uploader.test_connection``.
    """
    calibre_cfg = config.load().calibre
    outcome = uploader.test_connection(calibre_cfg)
    ok, message = outcome
    return {"ok": ok, "message": message}
 # --- Duplicates ---
@app.get("/duplicates", response_class=HTMLResponse)
async def duplicates_page(request: Request):
    """Render the duplicates overview.

    Fetches the full book list, groups the duplicates and hands both to
    ``duplicates.html``. Any exception raised while fetching or grouping is
    shown on the page as ``error`` instead of failing the request.
    """
    cfg = config.load()
    context = {"groups": [], "total_books": 0, "error": None}
    try:
        library = fetch_all_books(cfg.calibre)
        # Record the count before grouping so it survives a grouping failure.
        context["total_books"] = len(library)
        context["groups"] = find_duplicate_groups(library)
    except Exception as exc:
        context["error"] = str(exc)
    return templates.TemplateResponse(request, "duplicates.html", context)
@app.post("/api/delete_book/{book_id}")
async def delete_book_api(book_id: int):
    """Delete one book by id and report the outcome as ``{"ok", "message"}``."""
    calibre_cfg = config.load().calibre
    outcome = delete_book(calibre_cfg, book_id)
    ok, message = outcome
    return {"ok": ok, "message": message}
 _dedup_state: dict = {"running": False, "deleted": 0, "failed": 0, "total": 0, "done": False, "error": None}
 def _run_dedup():
    """Background task: delete all duplicate books, keeping one copy per group.

    Within each duplicate group the book with the lowest ``id`` is kept and
    every other copy is deleted. Progress is published through the
    module-level ``_dedup_state`` dict (``total``, ``deleted``, ``failed``,
    ``error``), which the status endpoint returns to the browser. Whatever
    happens, ``running`` is cleared and ``done`` is set at the end.

    Exceptions never propagate: they are logged and stored as a string in
    ``_dedup_state["error"]``.
    """
    # No ``global`` statement needed: the dict is mutated in place, never rebound.
    try:
        cfg = config.load()
        log.info("Dedup: fetching all books ...")
        client = CalibreClient(cfg.calibre)
        client._ensure_auth()
        books = fetch_all_books(cfg.calibre)
        groups = find_duplicate_groups(books)
        # Sort each group by id and drop the first (oldest) entry; the rest get deleted.
        to_delete = [
            book
            for group in groups
            for book in sorted(group, key=lambda b: b.get("id", 0))[1:]
        ]
        _dedup_state.update({"total": len(to_delete), "deleted": 0, "failed": 0})
        log.info("Dedup: %d duplicate(s) to delete across %d group(s)", len(to_delete), len(groups))
        for processed, book in enumerate(to_delete, start=1):
            ok, msg = delete_book(cfg.calibre, book["id"], client)
            if ok:
                _dedup_state["deleted"] += 1
            else:
                _dedup_state["failed"] += 1
                log.warning("Dedup: failed to delete book %d: %s", book["id"], msg)
            # Key the progress line on books processed. The deleted counter
            # stands still on failures, so it would repeat the same line.
            if processed % 10 == 0:
                log.info("Dedup progress: %d / %d deleted", _dedup_state["deleted"], _dedup_state["total"])
        log.info("Dedup done: %d deleted, %d failed", _dedup_state["deleted"], _dedup_state["failed"])
    except Exception as e:
        # Top-level boundary of the background task: keep the traceback in the log.
        log.exception("Dedup error: %s", e)
        _dedup_state["error"] = str(e)
    finally:
        _dedup_state["running"] = False
        _dedup_state["done"] = True
@app.post("/api/delete_duplicates")
async def delete_duplicates_api(background_tasks: BackgroundTasks):
    """Start the bulk duplicate deletion unless one is already in progress.

    On start, the shared progress state is reset and ``_run_dedup`` is
    queued as a background task. The response tells the caller whether a
    new run was started.
    """
    if not _dedup_state["running"]:
        _dedup_state.update(running=True, deleted=0, failed=0, total=0, done=False, error=None)
        background_tasks.add_task(_run_dedup)
        return {"ok": True, "message": "Started"}
    return {"ok": False, "message": "Already running"}
@app.get("/api/delete_duplicates/status")
async def delete_duplicates_status():
    """Return the shared progress dict of the bulk duplicate deletion."""
    state = _dedup_state
    return state
@app.get("/api/debug/calibre_books")
async def debug_calibre_books():
    """Summarise the shape of a small ``/ajax/listbooks`` response.

    Logs in, requests the first five books sorted by title, and reports the
    HTTP status, the top-level keys, every non-list field verbatim and the
    length of every list field. Intended for working out field names.
    """
    cfg = config.load()
    from uploader import CalibreClient

    client = CalibreClient(cfg.calibre)
    client._ensure_auth()
    query = {"draw": 1, "start": 0, "length": 5, "sort": "title", "order": "asc"}
    resp = client._session.get(f"{cfg.calibre.url}/ajax/listbooks", params=query, timeout=30)
    payload = resp.json()

    # Split the payload in one pass: list fields are reported by length
    # only, everything else is echoed back as-is.
    scalar_fields = {}
    list_lengths = {}
    for key, value in payload.items():
        if isinstance(value, list):
            list_lengths[key] = len(value)
        else:
            scalar_fields[key] = value

    return {
        "http_status": resp.status_code,
        "top_level_keys": list(payload.keys()),
        "non_list_fields": scalar_fields,
        "list_fields_lengths": list_lengths,
    }
 # --- Data reset ---
@app.post("/settings/reset-sync-data")
@@ -4,5 +4,4 @@ jinja2
 python-multipart
 paramiko
 rarfile
 requests
 apscheduler
@@ -1,4 +1,5 @@
 import logging
 import shutil
 import threading
 import time
 from pathlib import Path
@@ -7,7 +8,6 @@ import config
 import db
 import extractor
 import sftp as sftp_module
 from uploader import CalibreClient, CalibreUnavailableError
 log = logging.getLogger(__name__)
@@ -33,22 +33,23 @@ def run_sync(limit: int | None = None) -> None:
    _running = True
    run_id = db.start_sync_run()
-    counters = dict(zips_found=0, zips_new=0, books_uploaded=0, books_skipped=0, books_errored=0)
+    counters = dict(zips_found=0, zips_new=0, books_imported=0, books_errored=0)
    try:
        log.info("Sync started (limit=%s)", limit)
        cfg = config.load()
        _validate_config(cfg)
-        log.info("Config OK — work dir: %s", cfg.local_work_dir)
+        log.info("Config OK — work dir: %s, import dir: %s", cfg.work_dir, cfg.import_dir)
-        work_dir = Path(cfg.local_work_dir)
+        work_dir = Path(cfg.work_dir)
        work_dir.mkdir(parents=True, exist_ok=True)
-        log.info("Work dir ready: %s", work_dir)
+
        import_dir = Path(cfg.import_dir)
        import_dir.mkdir(parents=True, exist_ok=True)
        log.info("Connecting to SFTP %s@%s:%s ...", cfg.sftp.user, cfg.sftp.host, cfg.sftp.port)
        new_zips = sftp_module.list_new_zips(cfg.sftp, max_results=limit)
        counters["zips_found"] = len(new_zips)
        counters["zips_new"] = len(new_zips)
        if not new_zips:
@@ -56,21 +57,11 @@ def run_sync(limit: int | None = None) -> None:
            db.finish_sync_run(run_id, status="success", **counters)
            return
        # Determine chunk size; 0 means process everything in one chunk
        batch_size = int(db.get_setting("sync_batch_size", "0") or "0")
        if batch_size <= 0:
            batch_size = len(new_zips)
-        total_batches = -(-len(new_zips) // batch_size)  # ceiling division
+        total_batches = -(-len(new_zips) // batch_size)
        client = CalibreClient(cfg.calibre)
        # Pre-load existing book titles so duplicate detection doesn't need per-book OPDS searches
        try:
            from uploader import fetch_all_books
            existing = fetch_all_books(cfg.calibre)
            client.preload_existing_titles(existing)
        except Exception as exc:
            log.warning("Could not pre-load existing books (%s) — will fall back to per-book OPDS search", exc)
        for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1):
            chunk = new_zips[i : i + batch_size]
@@ -89,34 +80,21 @@ def run_sync(limit: int | None = None) -> None:
                    books = extractor.extract(local_zip, work_dir / "extracted")
                    log.info("Extract done in %.1fs — %d book(s)", time.monotonic() - t1, len(books))
                    books_errored_this_zip = 0
                    for book in books:
-                        t2 = time.monotonic()
+                        dest = import_dir / book.name
-                        status = client.upload(book, zip_source=remote_zip.remote_path)
+                        if dest.exists():
-                        log.info("Upload '%s' → %s (%.1fs)", book.name, status, time.monotonic() - t2)
+                            log.info("Skipping '%s' — already exists in import dir", book.name)
                        time.sleep(2)
                        if status == "uploaded":
                            counters["books_uploaded"] += 1
                        elif status == "skipped_duplicate":
                            counters["books_skipped"] += 1
                        else:
-                            counters["books_errored"] += 1
+                            shutil.move(str(book), str(dest))
-                            books_errored_this_zip += 1
+                            log.info("Moved '%s' → %s", book.name, import_dir)
-
+                            counters["books_imported"] += 1
                    if books_errored_this_zip:
                        zip_status = "error"
                        zip_error = f"{books_errored_this_zip} book upload(s) failed — will retry next sync"
                    extractor.cleanup(work_dir / "extracted" / local_zip.stem)
                except CalibreUnavailableError as e:
                    log.error("Calibre-Web unavailable — aborting sync run: %s", e)
                    db.mark_zip_processed(remote_zip.remote_path, remote_zip.file_size, "error", str(e))
                    db.finish_sync_run(run_id, status="error", error_msg=str(e), **counters)
                    return
                except Exception as e:
                    log.error("Error processing %s: %s", remote_zip.remote_path, e)
                    zip_status = "error"
                    zip_error = str(e)
                    counters["books_errored"] += 1
                finally:
                    if local_zip and local_zip.exists():
                        extractor.cleanup(local_zip)
@@ -126,9 +104,8 @@ def run_sync(limit: int | None = None) -> None:
        db.finish_sync_run(run_id, status="success", **counters)
        log.info(
-            "Sync complete. Total zips: %d, Uploaded: %d, Skipped: %d, Errors: %d",
+            "Sync complete. Total zips: %d, Imported: %d, Errors: %d",
-            counters["zips_new"], counters["books_uploaded"],
+            counters["zips_new"], counters["books_imported"], counters["books_errored"],
            counters["books_skipped"], counters["books_errored"],
        )
    except Exception as e:
        log.exception("Sync run failed: %s", e)
@@ -150,9 +127,7 @@ def _validate_config(cfg) -> None:
        missing.append("SSH private key")
    if cfg.sftp.auth_method == "password" and not cfg.sftp.password:
        missing.append("SSH password")
-    if not cfg.calibre.url:
+    if not cfg.import_dir:
-        missing.append("Calibre-Web URL")
+        missing.append("CWA import folder")
    if not cfg.calibre.user:
        missing.append("Calibre-Web username")
    if missing:
        raise ValueError(f"Missing configuration: {', '.join(missing)}")
@@ -1,46 +0,0 @@
 {% extends "base.html" %}
 {% block title %}Books — CalibreSync{% endblock %}
 {% block content %}
 <div class="page-header">
  <h1>Books <span class="muted">({{ total }})</span></h1>
 </div>
 {% if books %}
 <table>
  <thead>
    <tr>
      <th>Filename</th>
      <th>Status</th>
      <th>Source zip</th>
      <th>Uploaded</th>
    </tr>
  </thead>
  <tbody>
    {% for b in books %}
    <tr>
      <td>{{ b.filename }}</td>
      <td><span class="badge badge-{{ b.status }}">{{ b.status }}</span></td>
      <td class="mono small muted">{{ b.zip_source or "—" }}</td>
      <td>{{ b.uploaded_at[:19].replace("T"," ") if b.uploaded_at else "—" }}</td>
    </tr>
    {% endfor %}
  </tbody>
 </table>
 {% if pages > 1 %}
 <div class="pagination">
  {% if page > 1 %}
    <a href="/books?page={{ page - 1 }}">&laquo; Prev</a>
  {% endif %}
  <span>Page {{ page }} of {{ pages }}</span>
  {% if page < pages %}
    <a href="/books?page={{ page + 1 }}">Next &raquo;</a>
  {% endif %}
 </div>
 {% endif %}
 {% else %}
  <p class="muted">No books recorded yet.</p>
 {% endif %}
 {% endblock %}
@@ -1,124 +0,0 @@
 {% extends "base.html" %}
 {% block title %}Duplicates — CalibreSync{% endblock %}
 {% block content %}
 <div class="page-header">
  <h1>Duplicate books in Calibre-Web</h1>
  {% if groups %}
  <div class="header-actions">
    <button class="btn btn-danger" onclick="deleteAll(this)">Delete all duplicates (keep oldest)</button>
  </div>
  {% endif %}
 </div>
 <div id="dedup-progress" style="display:none" class="alert alert-success"></div>
 {% if error %}
  <div class="alert alert-warning">Could not fetch books from Calibre-Web: {{ error }}</div>
 {% else %}
  <p class="muted small" style="margin-bottom:1.5rem">
    Scanned <strong>{{ total_books }}</strong> book(s) —
    {% if groups %}
      found <strong>{{ groups|length }}</strong> duplicate group(s) (same title + author).
      The oldest copy (lowest ID) is kept when deleting all.
    {% else %}
      no duplicates found.
    {% endif %}
  </p>
  {% for group in groups %}
  <div class="form-section" style="margin-bottom:1rem">
    <h3 style="margin-top:0">{{ group[0].title }}</h3>
    <table>
      <thead>
        <tr>
          <th>ID</th>
          <th>Title</th>
          <th>Authors</th>
          <th>Format</th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        {% for book in group %}
        <tr id="row-{{ book.id }}">
          <td class="mono muted">{{ book.id }}</td>
          <td>{{ book.title }}</td>
          <td>{{ book.authors }}</td>
          <td>{{ book.format or "—" }}</td>
          <td>
            <button class="btn btn-danger" style="padding:0.2rem 0.7rem;font-size:0.85rem"
                    onclick="deleteBook({{ book.id }}, this)">Delete</button>
            <span id="status-{{ book.id }}" class="muted small" style="margin-left:0.5rem"></span>
          </td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
  {% endfor %}
 {% endif %}
 <script>
 async function deleteBook(id, btn) {
  // Ask first; deleting a book cannot be undone from this page.
  const confirmed = confirm('Delete book ID ' + id + ' from Calibre-Web?');
  if (!confirmed) {
    return;
  }
  btn.disabled = true;
  btn.textContent = 'Deleting…';
  const statusEl = document.getElementById('status-' + id);
  // Make the button clickable again and show why nothing was deleted.
  const reenable = (text) => {
    btn.disabled = false;
    btn.textContent = 'Delete';
    statusEl.textContent = text;
  };
  try {
    const response = await fetch('/api/delete_book/' + id, {method: 'POST'});
    const result = await response.json();
    if (!result.ok) {
      reenable('Failed: ' + result.message);
      statusEl.style.color = 'var(--error, #f87171)';
      return;
    }
    // Success: fade the row out and leave the button disabled.
    document.getElementById('row-' + id).style.opacity = '0.35';
    btn.textContent = 'Deleted';
    statusEl.textContent = '✓';
  } catch (err) {
    reenable('Error: ' + err);
  }
 }
 // Start the server-side bulk deletion of duplicates, then poll its status
 // every two seconds and mirror the progress into the #dedup-progress banner.
 // `btn` is the button that triggered the action. It is disabled while the job
 // runs and becomes a "Reload" button once the job is done.
 async function deleteAll(btn) {
  if (!confirm('Delete all duplicates from Calibre-Web, keeping the oldest copy of each title+author? This cannot be undone.')) return;
  const idleLabel = 'Delete all duplicates (keep oldest)';
  const pollDelayMs = 2000;
  const progress = document.getElementById('dedup-progress');

  // Show an error and hand the button back so the user is never stuck
  // with a permanently disabled control.
  const fail = (message) => {
    progress.textContent = 'Error: ' + message;
    progress.className = 'alert alert-warning';
    btn.disabled = false;
    btn.textContent = idleLabel;
  };

  btn.disabled = true;
  btn.textContent = 'Starting…';
  progress.style.display = '';
  progress.textContent = 'Fetching book list from Calibre-Web…';

  try {
    const started = await fetch('/api/delete_duplicates', {method: 'POST'});
    if (!started.ok) throw new Error('HTTP ' + started.status);
    // An "Already running" answer is fine: a job is in progress, so just follow it.
  } catch (e) {
    fail((e && e.message) || e);
    return;
  }

  // Poll sequentially: the next request is scheduled only after the previous
  // one has been handled, so slow responses can never overlap.
  const poll = async () => {
    let s;
    try {
      const r = await fetch('/api/delete_duplicates/status');
      if (!r.ok) throw new Error('HTTP ' + r.status);
      s = await r.json();
    } catch (e) {
      fail(((e && e.message) || e) + ' — reload the page to check progress');
      return;
    }
    if (s.error) {
      fail(s.error);
      return;
    }
    if (s.done) {
      progress.textContent = `Done — ${s.deleted} book(s) deleted, ${s.failed} failed. Reload to refresh the list.`;
      btn.textContent = 'Reload';
      btn.disabled = false;
      btn.onclick = () => location.reload();
      return;
    }
    progress.textContent = s.total > 0
      ? `Deleting… ${s.deleted} / ${s.total} deleted, ${s.failed} failed`
      : 'Scanning for duplicates…';
    setTimeout(poll, pollDelayMs);
  };
  setTimeout(poll, pollDelayMs);
 }
 </script>
 {% endblock %}
@@ -59,16 +59,8 @@
    <div class="stat-label">Zip archives processed</div>
  </div>
  <div class="stat-card">
-    <div class="stat-value">{{ stats.uploaded }}</div>
+    <div class="stat-value">{{ stats.total_imported }}</div>
-    <div class="stat-label">Books uploaded</div>
+    <div class="stat-label">Books imported</div>
  </div>
  <div class="stat-card">
    <div class="stat-value">{{ stats.skipped }}</div>
    <div class="stat-label">Duplicates skipped</div>
  </div>
  <div class="stat-card">
    <div class="stat-value">{{ stats.total_books }}</div>
    <div class="stat-label">Total book records</div>
  </div>
 </div>
@@ -81,8 +73,7 @@
      <th>Finished</th>
      <th>Status</th>
      <th>New zips</th>
-      <th>Uploaded</th>
+      <th>Imported</th>
      <th>Skipped</th>
      <th>Errors</th>
    </tr>
  </thead>
@@ -93,8 +84,7 @@
      <td>{{ r.finished_at[:19].replace("T"," ") if r.finished_at else "—" }}</td>
      <td><span class="badge badge-{{ r.status }}">{{ r.status }}</span></td>
      <td>{{ r.zips_new }}</td>
-      <td>{{ r.books_uploaded }}</td>
+      <td>{{ r.books_imported }}</td>
      <td>{{ r.books_skipped }}</td>
      <td>{{ r.books_errored }}</td>
    </tr>
    {% endfor %}
@@ -84,39 +84,20 @@
    </div>
  </section>
  <section class="form-section">
    <h2>Calibre-Web</h2>
    <div class="form-row">
      <label for="calibre_url">URL</label>
      <input id="calibre_url" name="calibre_url" type="url" placeholder="http://localhost:8083"
             value="{{ s.get('calibre_url','') }}">
    </div>
    <div class="form-row">
      <label for="calibre_user">Username</label>
      <input id="calibre_user" name="calibre_user" type="text" value="{{ s.get('calibre_user','') }}">
    </div>
    <div class="form-row">
      <label for="calibre_pass">Password</label>
      <input id="calibre_pass" name="calibre_pass" type="password"
             value="{{ s.get('calibre_pass','') }}">
    </div>
    <div class="form-row">
      <button type="button" class="btn btn-secondary" onclick="testConn('calibre', this)">Test Calibre-Web connection</button>
      <p id="test-calibre-result" class="test-result"></p>
    </div>
  </section>
  <section class="form-section">
    <h2>Local</h2>
    <div class="form-row">
-      <label for="local_work_dir">Work directory</label>
+      <label for="import_dir">CWA import folder</label>
-      <input id="local_work_dir" name="local_work_dir" type="text" placeholder="/tmp/calibresync"
+      <input id="import_dir" name="import_dir" type="text" placeholder="/mnt/cwa-import"
-             value="{{ s.get('local_work_dir','/tmp/calibresync') }}">
+             value="{{ s.get('import_dir','') }}">
      <p class="muted small">Folder watched by Calibre-Web-Automated. Extracted epub/pdf files are moved here flat.</p>
    </div>
    <div class="form-row">
      <label for="work_dir">Temp work directory</label>
      <input id="work_dir" name="work_dir" type="text" placeholder="/tmp/calibresync"
             value="{{ s.get('work_dir','/tmp/calibresync') }}">
      <p class="muted small">Temporary storage for downloaded zips and extracted files. Cleaned up after each run.</p>
    </div>
  </section>
@@ -182,7 +163,7 @@ async function testConn(type, btn) {
    result.className = "test-result test-fail";
  } finally {
    btn.disabled = false;
-    btn.textContent = type === "ssh" ? "Test SSH connection" : "Test Calibre-Web connection";
+    btn.textContent = "Test SSH connection";
  }
 }
 </script>
@@ -1,413 +0,0 @@
 import hashlib
 import logging
 import re
 import time
 import unicodedata
 from pathlib import Path
 from urllib.parse import quote
 import requests
 import db
 from config import CalibreConfig
 # Module-level logger named after this module.
 log = logging.getLogger(__name__)
 # Content type sent with an uploaded file, keyed by lower-case file suffix.
 MIME_TYPES = {
    ".epub": "application/epub+zip",
    ".pdf": "application/pdf",
 }
 # Words stripped before comparing titles: release tags, format names, language names,
 # and common articles/conjunctions in several languages.
 _JUNK_WORDS = {
    "retail", "epub", "ebook", "pdf", "mobi", "azw3", "decipher",
    "swedish", "english", "danish", "norwegian", "finnish", "german", "french",
    "the", "a", "an", "och", "und", "les", "der", "die", "das",
 }
 class CalibreUnavailableError(RuntimeError):
    """Raised when Calibre-Web returns repeated 502/503/504 — sync run should abort.

    Subclasses RuntimeError, so a generic ``except RuntimeError`` handler also catches it.
    """
 class CalibreClient:
    """Session-based Calibre-Web client that uploads books and skips likely duplicates.

    Login is lazy: the first call to ``_ensure_auth`` performs the form login and caches
    the CSRF token used for later uploads. Duplicate detection compares normalised
    keyword sets, either against a pre-loaded in-memory index or via an OPDS search.
    """
    def __init__(self, cfg: CalibreConfig):
        """Store the configuration and create an HTTP session; no request is made here."""
        self._cfg = cfg
        self._session = requests.Session()
        self._authenticated = False
        # CSRF token sent with upload requests; filled in by _ensure_auth.
        self._upload_csrf: str | None = None
        # Number of books in a row whose upload exhausted all retries on 502/503/504.
        self._consecutive_failures = 0
        # Pre-loaded title word-sets for fast duplicate detection (set by preload_existing_titles)
        self._existing_title_sets: list[frozenset[str]] | None = None
    def preload_existing_titles(self, books: list[dict]) -> None:
        """Build an in-memory index of normalised title keywords from a pre-fetched book list.

        Each dict in ``books`` is read through its "title" key; entries with a missing or
        empty title are ignored. Once set, ``_exists_in_calibre`` uses this index instead
        of querying the server.
        """
        self._existing_title_sets = [
            frozenset(_normalize_words(b.get("title", "")))
            for b in books
            if b.get("title")
        ]
        log.info("Pre-loaded %d existing book titles for duplicate detection", len(self._existing_title_sets))
    def _ensure_auth(self) -> None:
        """Log in through the /login form unless this client is already authenticated.

        Raises:
            RuntimeError: if the login POST ends up back on the /login page.
            requests.HTTPError: if the login page or the login POST returns an error status.
        """
        if self._authenticated:
            return
        login_url = f"{self._cfg.url}/login"
        # Fetch the login page first so its CSRF token (if any) can be sent with the credentials.
        page = self._session.get(login_url, timeout=30)
        page.raise_for_status()
        csrf = _extract_csrf(page.text)
        data = {"username": self._cfg.user, "password": self._cfg.password}
        if csrf:
            data["csrf_token"] = csrf
        resp = self._session.post(login_url, data=data, allow_redirects=True, timeout=30)
        resp.raise_for_status()
        # Still being on /login after following redirects is treated as rejected credentials.
        if resp.url.rstrip("/").endswith("/login"):
            raise RuntimeError("Calibre-Web authentication failed — check credentials")
        self._authenticated = True
        # Prefer a token found on the post-login page; otherwise reuse the login-page token.
        self._upload_csrf = _extract_csrf(resp.text) or csrf
        log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
    def _exists_in_calibre(self, filename: str) -> bool:
        """Check whether a book already exists in Calibre-Web. Returns True if likely duplicate."""
        keywords = _keywords_from_filename(filename)
        # With fewer than two keywords the comparison is skipped and the book counts as new.
        if len(keywords) < 2:
            return False
        our_words = set(keywords)
        # Fast path: check pre-loaded title index (available when sync pre-fetches all books)
        if self._existing_title_sets is not None:
            for their_words in self._existing_title_sets:
                if not their_words:
                    continue
                overlap = len(our_words & their_words)
                # Match if: 3+ words in common, OR 60%+ of filename keywords match the title,
                # OR 60%+ of the stored title's words appear in the filename keywords.
                # The third condition catches short titles drowned out by filename noise.
                if (overlap >= 3
                        or (overlap / len(our_words) >= 0.6)
                        or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
                    log.info("Duplicate (preloaded index): '%s'", filename)
                    return True
            return False
        # Slow path fallback: OPDS search (used when no index is available)
        # Only the first six keywords are used for the search query.
        query = " ".join(keywords[:6])
        try:
            resp = self._session.get(
                f"{self._cfg.url}/opds/search/{quote(query, safe='')}",
                auth=(self._cfg.user, self._cfg.password),
                timeout=15,
            )
            if resp.status_code == 404:
                return False
            calibre_titles = _parse_opds_titles(resp.text)
            if not calibre_titles:
                return False
            for title in calibre_titles:
                their_words = set(_normalize_words(title))
                if not their_words:
                    continue
                overlap = len(our_words & their_words)
                # Same three-way match rule as the pre-loaded index path above.
                if (overlap >= 3
                        or (overlap / len(our_words) >= 0.6)
                        or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)):
                    log.info("Duplicate (OPDS search): '%s'", filename)
                    return True
        except Exception as e:
            # Best effort: a failed search must not block the upload, so treat it as "not found".
            log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
        return False
    def upload(self, book_path: Path, zip_source: str) -> str:
        """Upload a book file. Returns status: 'uploaded' | 'skipped_duplicate' | 'error'.

        Every outcome is recorded through ``db.record_book`` together with the file's
        SHA-256 hash and ``zip_source``.

        Raises:
            CalibreUnavailableError: when uploads for three books in a row have exhausted
                their retries on HTTP 502/503/504.
        """
        file_hash = _sha256(book_path)
        # Primary guard: hash already in our DB
        if db.is_book_uploaded(file_hash):
            log.info("Skipping (already uploaded): %s", book_path.name)
            db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
            return "skipped_duplicate"
        try:
            self._ensure_auth()
            # Secondary guard: title search in Calibre-Web (catches pre-existing books)
            if self._exists_in_calibre(book_path.name):
                log.info("Skipping (exists in Calibre-Web): %s", book_path.name)
                db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
                return "skipped_duplicate"
            mime = MIME_TYPES.get(book_path.suffix.lower(), "application/octet-stream")
            # Up to three attempts per book; the file is reopened on every attempt.
            for attempt in range(1, 4):
                try:
                    with book_path.open("rb") as fh:
                        resp = self._session.post(
                            f"{self._cfg.url}/upload",
                            files={"btn-upload": (book_path.name, fh, mime)},
                            data={"csrf_token": self._upload_csrf} if self._upload_csrf else {},
                            timeout=120,
                        )
                    if not resp.ok:
                        log.error("Upload HTTP %s (attempt %d/3) — body: %s", resp.status_code, attempt, resp.text[:300])
                    resp.raise_for_status()
                    log.info("Uploaded: %s", book_path.name)
                    self._consecutive_failures = 0
                    db.record_book(book_path.name, file_hash, zip_source, "uploaded")
                    # Add to in-session index so a later zip with the same title is skipped
                    if self._existing_title_sets is not None:
                        kw = frozenset(_keywords_from_filename(book_path.name))
                        if kw:
                            self._existing_title_sets.append(kw)
                    return "uploaded"
                except requests.HTTPError:
                    # Normally raised by raise_for_status() above, so resp is bound here.
                    if resp.status_code in (502, 503, 504):
                        if attempt < 3:
                            log.warning("HTTP %s on attempt %d/3 — retrying in 180s ...", resp.status_code, attempt)
                            time.sleep(180)
                            continue
                        # All retries exhausted
                        self._consecutive_failures += 1
                        if self._consecutive_failures >= 3:
                            raise CalibreUnavailableError(
                                f"Calibre-Web returned {resp.status_code} on {self._consecutive_failures} "
                                "consecutive books — aborting sync run"
                            )
                        break
                    # The re-auth retry is only taken on the first attempt, so it happens at most once.
                    if resp.status_code == 400 and attempt == 1:
                        log.warning("HTTP 400 — CSRF token likely expired, re-authenticating ...")
                        self._authenticated = False
                        self._upload_csrf = None
                        self._ensure_auth()
                        continue
                    # Any other HTTP error is not retried.
                    break
            # Reached after a break or after the loop runs out of attempts.
            db.record_book(book_path.name, file_hash, zip_source, "error")
            return "error"
        except CalibreUnavailableError:
            # Record this book as failed, then let the abort signal propagate to the caller.
            db.record_book(book_path.name, file_hash, zip_source, "error")
            raise
        except Exception as e:
            # Any other failure (e.g. during login or the request itself) marks only this book as failed.
            log.error("Upload failed for %s: %s", book_path.name, e)
            db.record_book(book_path.name, file_hash, zip_source, "error")
            return "error"
 def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
    """Fetch every book from Calibre-Web.

    Pages through /ajax/listbooks (1000 rows per request), de-duplicating rows by their
    "id". If fewer than half of the reported books could be retrieved, pagination is
    considered broken and the OPDS catalogue is used instead.

    Args:
        cfg: Calibre-Web connection settings (url, user, password).

    Returns:
        The book dicts as returned by the server (or by the OPDS fallback).

    Raises:
        requests.HTTPError: if a listbooks request returns an error status.
        RuntimeError: if authentication fails.
    """
    client = CalibreClient(cfg)
    client._ensure_auth()
    all_books: list[dict] = []
    seen_ids: set = set()
    page_size = 1000
    start = 0
    reported_total = 0
    while True:
        resp = client._session.get(
            f"{cfg.url}/ajax/listbooks",
            params={
                "draw": 1,
                # Both parameter spellings are sent so either paging convention is honoured.
                "start": start, "length": page_size,
                "iDisplayStart": start, "iDisplayLength": page_size,
            },
            timeout=60,
        )
        resp.raise_for_status()
        data = resp.json()
        if start == 0:
            # Log the scalar metadata once to help diagnose which total field the server uses.
            non_list = {k: v for k, v in data.items() if not isinstance(v, list)}
            log.info("listbooks page-0 meta fields: %s", non_list)
        rows = data.get("rows") or data.get("data") or []
        reported_total = (
            data.get("recordsTotal") or data.get("total_count") or
            data.get("total") or data.get("totalNotFiltered") or 0
        )
        new_in_page = 0
        for b in rows:
            bid = b.get("id")
            if bid not in seen_ids:
                seen_ids.add(bid)
                all_books.append(b)
                new_in_page += 1
        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)
        # An empty page, or a page with nothing new (server ignoring the offset), ends paging.
        if not rows or new_in_page == 0:
            break
        # Only trust the total when the server actually reported one. With a missing or
        # zero total, comparing against 0 would end the loop after the first page and
        # silently truncate large libraries.
        if reported_total and len(all_books) >= reported_total:
            break
        start += len(rows)
    # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
    if reported_total > 0 and len(all_books) < reported_total // 2:
        log.warning(
            "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
            len(all_books), reported_total,
        )
        return _fetch_all_books_opds(cfg)
    return all_books
 def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
    """Fetch all books via OPDS catalog, following next-page links.

    Starts at ``{cfg.url}/opds/new`` with HTTP basic auth and stops on an HTTP error,
    an XML parse error, a page without entries, a missing "next" link, or a "next"
    link that points at a page that was already fetched.

    Args:
        cfg: Calibre-Web connection settings (url, user, password).

    Returns:
        A list of ``{"id": int, "title": str, "authors": str}`` dicts, where authors
        are joined with " & ". Entries without a /download/<id>/ link are skipped.
    """
    import xml.etree.ElementTree as ET

    def _local(tag: str) -> str:
        # ElementTree prefixes qualified tags with "{namespace}"; keep only the local name.
        return tag.rsplit("}", 1)[-1]

    books: list[dict] = []
    seen_ids: set = set()
    visited_urls: set = set()
    url: str | None = f"{cfg.url}/opds/new"
    auth = (cfg.user, cfg.password)
    # The context manager closes the session's pooled connections on every exit path.
    with requests.Session() as session:
        # Refuse to revisit a page so a self-referencing "next" link cannot loop forever.
        while url and url not in visited_urls:
            visited_urls.add(url)
            resp = session.get(url, auth=auth, timeout=30)
            if not resp.ok:
                log.warning("OPDS fetch failed HTTP %s — %s", resp.status_code, url)
                break
            try:
                root = ET.fromstring(resp.content)
            except ET.ParseError as exc:
                log.warning("OPDS XML parse error: %s", exc)
                break
            next_url: str | None = None
            entries_this_page = 0
            for elem in root:
                local = _local(elem.tag)
                if local == "link" and elem.get("rel") == "next":
                    href = elem.get("href", "")
                    # Relative links are resolved against the configured base URL.
                    next_url = href if href.startswith("http") else f"{cfg.url}{href}"
                elif local == "entry":
                    entries_this_page += 1
                    title = ""
                    author_parts: list[str] = []
                    book_id: int | None = None
                    for child in elem:
                        ctag = _local(child.tag)
                        if ctag == "title":
                            title = child.text or ""
                        elif ctag == "author":
                            author_parts.extend(
                                gc.text or "" for gc in child if _local(gc.tag) == "name"
                            )
                        elif ctag == "link":
                            # The book id is taken from the first /download/<id>/ link.
                            m = re.search(r"/download/(\d+)/", child.get("href", ""))
                            if m and book_id is None:
                                book_id = int(m.group(1))
                    if book_id and book_id not in seen_ids:
                        seen_ids.add(book_id)
                        books.append({"id": book_id, "title": title, "authors": " & ".join(author_parts)})
            log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page)
            if not entries_this_page:
                break
            url = next_url
    return books
 def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
    """Delete a book from Calibre-Web by ID.

    Args:
        cfg: Calibre-Web connection settings; ``cfg.url`` is the base URL.
        book_id: ID of the book to delete.
        client: Optional pre-authenticated client, passed to avoid re-auth overhead.
            When omitted, a new client is created and logged in.

    Returns:
        ``(True, "Deleted")`` on success, otherwise ``(False, "HTTP <status>")``.
    """
    if client is None:
        client = CalibreClient(cfg)
        client._ensure_auth()
    csrf = client._upload_csrf
    if not csrf:
        # Try to fetch a CSRF token from the book detail page
        try:
            page = client._session.get(f"{cfg.url}/book/{book_id}", timeout=15)
            csrf = _extract_csrf(page.text)
            client._upload_csrf = csrf
        except Exception as exc:
            # Best effort: the delete is still attempted without a token, but the
            # reason for the failed prefetch is no longer silently discarded.
            log.debug("delete_book: could not prefetch CSRF token for book %d: %s", book_id, exc)
    for attempt in range(2):
        resp = client._session.post(
            f"{cfg.url}/delete/{book_id}",
            data={"csrf_token": csrf} if csrf else {},
            timeout=30,
        )
        if resp.ok:
            return True, "Deleted"
        if resp.status_code == 400 and attempt == 0:
            # CSRF token likely expired; re-authenticate and retry once
            log.info("delete_book: 400 on book %d — refreshing CSRF and retrying", book_id)
            client._authenticated = False
            client._upload_csrf = None
            client._ensure_auth()
            csrf = client._upload_csrf
            continue
        return False, f"HTTP {resp.status_code}"
    # Defensive fallback: every path through the second attempt returns above.
    return False, "HTTP 400 after re-auth retry"
 def find_duplicate_groups(books: list[dict]) -> list[list[dict]]:
    """Group books by normalised title+author; return only groups with 2+ entries.

    Normalisation lower-cases the text, replaces punctuation with spaces and collapses
    whitespace. Books whose normalised title is empty are ignored. A "title" or
    "authors" value of None is treated like a missing value instead of raising.

    Args:
        books: Book dicts read through their "title" and "authors" keys.

    Returns:
        The duplicate groups, sorted by the lower-cased title of each group's first book.
        Books keep their input order within a group.
    """
    from collections import defaultdict

    def _normalise(value) -> str:
        # `or ""` covers both a missing key and an explicit None value.
        text = re.sub(r"[^\w\s]", " ", (value or "").lower())
        return re.sub(r"\s+", " ", text).strip()

    groups: dict[str, list[dict]] = defaultdict(list)
    for book in books:
        title = _normalise(book.get("title"))
        if not title:
            continue
        authors = _normalise(book.get("authors"))
        groups[f"{title}||{authors}"].append(book)
    return sorted(
        (g for g in groups.values() if len(g) > 1),
        key=lambda g: (g[0].get("title") or "").lower(),
    )
 def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
    """Try to log in to Calibre-Web with ``cfg`` and report the outcome.

    Returns:
        ``(True, message)`` when authentication succeeds, otherwise ``(False, error text)``.
        No exception escapes this function.
    """
    try:
        CalibreClient(cfg)._ensure_auth()
        outcome = (True, f"Authenticated to {cfg.url} as '{cfg.user}'.")
    except Exception as exc:
        outcome = (False, str(exc))
    return outcome
 # --- Helpers ---
 def _ascii_fold(s: str) -> str:
    """Strip accents: 'världens' → 'varldens', 'väg' → 'vag'."""
    return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn")
 def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The file stem is lower-cased and accent-folded, dots/underscores/hyphens become
    spaces and remaining punctuation is removed. Junk words, purely numeric tokens and
    single-character tokens are dropped; the remaining words keep their order.
    """
    text = _ascii_fold(Path(filename).stem.lower())
    text = re.sub(r"[._\-]", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    keywords: list[str] = []
    for token in text.split():
        if token in _JUNK_WORDS:
            continue
        # Years and any other all-digit tokens carry no title information.
        if re.match(r"^\d{4}$", token) or re.match(r"^\d+$", token):
            continue
        if len(token) > 1:
            keywords.append(token)
    return keywords
 def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison.

    Lower-cases and accent-folds the title, removes punctuation, then returns the
    words that are longer than one character and not in the junk-word set.
    """
    folded = _ascii_fold(title.lower())
    cleaned = re.sub(r"[^\w\s]", "", folded)
    words: list[str] = []
    for word in cleaned.split():
        if len(word) > 1 and word not in _JUNK_WORDS:
            words.append(word)
    return words
 def _parse_opds_titles(xml: str) -> list[str]:
    """Extract book titles from an OPDS Atom feed, skipping the feed title itself."""
    # Grab all <title> elements; the first is the feed title ("Search results"), rest are books
    titles = re.findall(r"<title>([^<]+)</title>", xml)
    return titles[1:] if len(titles) > 1 else []
 def _extract_csrf(html: str) -> str | None:
    m = re.search(r'name="csrf_token"\s+value="([^"]+)"', html)
    if not m:
        m = re.search(r'value="([^"]+)"\s+name="csrf_token"', html)
    return m.group(1) if m else None
 def _sha256(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    return h.hexdigest()