cwa import

This commit is contained in:
2026-05-13 18:24:55 +02:00
parent bfa09976b7
commit c0e1cb0688
11 changed files with 60 additions and 886 deletions
+6 -17
View File
@@ -14,18 +14,11 @@ class SFTPConfig:
remote_path: str = ""
@dataclass
class CalibreConfig:
    """Connection settings for the Calibre-Web server; every field defaults to empty."""

    url: str = ""  # base URL of the server
    user: str = ""  # login username
    password: str = ""  # login password
@dataclass
class AppConfig:
sftp: SFTPConfig = field(default_factory=SFTPConfig)
calibre: CalibreConfig = field(default_factory=CalibreConfig)
local_work_dir: str = "/tmp/calibresync"
work_dir: str = "/tmp/calibresync"
import_dir: str = ""
def load() -> AppConfig:
@@ -40,12 +33,8 @@ def load() -> AppConfig:
password=s.get("sftp_password", ""),
remote_path=s.get("sftp_remote_path", ""),
),
calibre=CalibreConfig(
url=s.get("calibre_url", "").rstrip("/"),
user=s.get("calibre_user", ""),
password=s.get("calibre_pass", ""),
),
local_work_dir=s.get("local_work_dir", "/tmp/calibresync"),
work_dir=s.get("work_dir", "/tmp/calibresync"),
import_dir=s.get("import_dir", ""),
)
@@ -53,8 +42,8 @@ def save(form: dict) -> None:
keys = [
"sftp_host", "sftp_port", "sftp_user", "sftp_auth_method",
"sftp_password", "sftp_remote_path",
"calibre_url", "calibre_user", "calibre_pass",
"local_work_dir", "scheduler_interval_minutes", "sync_batch_size",
"work_dir", "import_dir",
"scheduler_interval_minutes", "sync_batch_size",
]
for key in keys:
if key in form and form[key] is not None:
+14 -65
View File
@@ -47,26 +47,16 @@ def init_db() -> None:
error_msg TEXT
);
CREATE TABLE IF NOT EXISTS uploaded_books (
id INTEGER PRIMARY KEY,
filename TEXT NOT NULL,
file_hash TEXT UNIQUE NOT NULL,
zip_source TEXT,
uploaded_at TEXT,
status TEXT
);
CREATE TABLE IF NOT EXISTS sync_runs (
id INTEGER PRIMARY KEY,
started_at TEXT NOT NULL,
finished_at TEXT,
zips_found INTEGER DEFAULT 0,
zips_new INTEGER DEFAULT 0,
books_uploaded INTEGER DEFAULT 0,
books_skipped INTEGER DEFAULT 0,
id INTEGER PRIMARY KEY,
started_at TEXT NOT NULL,
finished_at TEXT,
zips_found INTEGER DEFAULT 0,
zips_new INTEGER DEFAULT 0,
books_imported INTEGER DEFAULT 0,
books_errored INTEGER DEFAULT 0,
status TEXT DEFAULT 'running',
error_msg TEXT
status TEXT DEFAULT 'running',
error_msg TEXT
);
CREATE TABLE IF NOT EXISTS remote_zip_cache (
@@ -171,40 +161,6 @@ def get_recent_zips(limit: int = 50) -> list[sqlite3.Row]:
).fetchall()
# --- Uploaded books ---
def is_book_uploaded(file_hash: str) -> bool:
    """Return True when *file_hash* is already stored with a terminal status.

    Only rows whose status is 'uploaded' or 'skipped_duplicate' count; any other
    status is treated as "not uploaded".
    """
    lookup = "SELECT id FROM uploaded_books WHERE file_hash = ? AND status IN ('uploaded', 'skipped_duplicate')"
    with get_db() as conn:
        hit = conn.execute(lookup, (file_hash,)).fetchone()
    return hit is not None
def record_book(filename: str, file_hash: str, zip_source: str, status: str) -> None:
    """Insert a row for the book, or update only its status if the hash already exists."""
    upsert_sql = """INSERT INTO uploaded_books (filename, file_hash, zip_source, uploaded_at, status)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT(file_hash) DO UPDATE SET status = excluded.status"""
    with get_db() as conn:
        row_values = (filename, file_hash, zip_source, _now(), status)
        conn.execute(upsert_sql, row_values)
def get_books(limit: int = 200, offset: int = 0) -> list[sqlite3.Row]:
    """Return one page of uploaded_books rows, most recently uploaded first."""
    page_query = "SELECT * FROM uploaded_books ORDER BY uploaded_at DESC LIMIT ? OFFSET ?"
    with get_db() as conn:
        cursor = conn.execute(page_query, (limit, offset))
        return cursor.fetchall()
def get_books_count() -> int:
    """Return the total number of rows in uploaded_books."""
    with get_db() as conn:
        (count,) = conn.execute("SELECT COUNT(*) FROM uploaded_books").fetchone()
        return count
# --- Sync runs ---
def start_sync_run() -> int:
@@ -233,35 +189,28 @@ def get_recent_runs(limit: int = 10) -> list[sqlite3.Row]:
def get_stats() -> dict:
with get_db() as conn:
total_books = conn.execute("SELECT COUNT(*) FROM uploaded_books").fetchone()[0]
uploaded = conn.execute(
"SELECT COUNT(*) FROM uploaded_books WHERE status = 'uploaded'"
).fetchone()[0]
skipped = conn.execute(
"SELECT COUNT(*) FROM uploaded_books WHERE status = 'skipped_duplicate'"
).fetchone()[0]
total_zips = conn.execute("SELECT COUNT(*) FROM processed_zips").fetchone()[0]
total_imported = conn.execute(
"SELECT COALESCE(SUM(books_imported), 0) FROM sync_runs"
).fetchone()[0]
last_run = conn.execute(
"SELECT started_at, status FROM sync_runs ORDER BY started_at DESC LIMIT 1"
).fetchone()
return {
"total_books": total_books,
"uploaded": uploaded,
"skipped": skipped,
"total_zips": total_zips,
"total_imported": total_imported,
"last_run": dict(last_run) if last_run else None,
}
def clear_sync_data() -> dict:
"""Delete all processed_zips, uploaded_books, and sync_runs rows. Settings are kept.
"""Delete all processed_zips and sync_runs rows. Settings are kept.
Also resets the remote scan timestamp so the next sync does a full rescan."""
with get_db() as conn:
zips = conn.execute("DELETE FROM processed_zips").rowcount
books = conn.execute("DELETE FROM uploaded_books").rowcount
runs = conn.execute("DELETE FROM sync_runs").rowcount
conn.execute("DELETE FROM settings WHERE key = 'remote_cache_last_scan'")
return {"zips": zips, "books": books, "runs": runs}
return {"zips": zips, "runs": runs}
def _now() -> str:
+2
View File
@@ -6,6 +6,8 @@ services:
volumes:
# Persists the SQLite database and settings across container restarts
- ./data:/app/data
# CWA import folder — set the host path to match your CWA ingest directory
- /path/to/cwa-import:/cwa-import
# Optional: mount your SSH private key read-only instead of pasting it in the UI
# - ~/.ssh/id_rsa:/run/secrets/ssh_key:ro
restart: unless-stopped
+4 -132
View File
@@ -12,8 +12,6 @@ import config
import db
import sftp as sftp_module
import sync
import uploader
from uploader import CalibreClient, delete_book, fetch_all_books, find_duplicate_groups
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s%(message)s")
log = logging.getLogger(__name__)
@@ -77,23 +75,6 @@ async def dashboard(request: Request):
})
# --- Books ---
@app.get("/books", response_class=HTMLResponse)
async def books_page(request: Request, page: int = 1):
    """Render one page (50 rows) of the recorded books table.

    ``page`` comes straight from the query string, so it is clamped to the valid
    range ``1..pages``; otherwise ``?page=0`` would yield a negative offset and a
    "Page 0" pager, and an oversized page would show an empty table.
    """
    per_page = 50
    total = db.get_books_count()
    pages = max(1, (total + per_page - 1) // per_page)
    page = min(max(page, 1), pages)
    offset = (page - 1) * per_page
    books = [dict(b) for b in db.get_books(limit=per_page, offset=offset)]
    return templates.TemplateResponse(request, "books.html", {
        "books": books,
        "page": page,
        "pages": pages,
        "total": total,
    })
# --- Settings ---
@app.get("/settings", response_class=HTMLResponse)
@@ -117,10 +98,8 @@ async def save_settings(
sftp_key: str = Form(""),
sftp_password: str = Form(""),
sftp_remote_path: str = Form(""),
calibre_url: str = Form(""),
calibre_user: str = Form(""),
calibre_pass: str = Form(""),
local_work_dir: str = Form("/tmp/calibresync"),
work_dir: str = Form("/tmp/calibresync"),
import_dir: str = Form(""),
scheduler_interval_minutes: str = Form("0"),
sync_batch_size: str = Form("0"),
):
@@ -132,10 +111,8 @@ async def save_settings(
"sftp_key": sftp_key,
"sftp_password": sftp_password,
"sftp_remote_path": sftp_remote_path,
"calibre_url": calibre_url,
"calibre_user": calibre_user,
"calibre_pass": calibre_pass,
"local_work_dir": local_work_dir,
"work_dir": work_dir,
"import_dir": import_dir,
"scheduler_interval_minutes": scheduler_interval_minutes,
"sync_batch_size": sync_batch_size,
})
@@ -179,111 +156,6 @@ async def test_ssh():
return {"ok": ok, "message": message}
@app.get("/api/test/calibre")
async def test_calibre():
    """Run the Calibre-Web connection test against the saved settings and report the result."""
    calibre_cfg = config.load().calibre
    succeeded, detail = uploader.test_connection(calibre_cfg)
    return {"ok": succeeded, "message": detail}
# --- Duplicates ---
@app.get("/duplicates", response_class=HTMLResponse)
async def duplicates_page(request: Request):
    """Render the duplicates page from a fresh scan of the Calibre-Web library.

    Any failure while fetching or grouping is shown on the page as ``error``
    instead of propagating.
    """
    cfg = config.load()
    context: dict = {"groups": [], "total_books": 0, "error": None}
    try:
        library = fetch_all_books(cfg.calibre)
        context["total_books"] = len(library)
        context["groups"] = find_duplicate_groups(library)
    except Exception as exc:
        context["error"] = str(exc)
    return templates.TemplateResponse(request, "duplicates.html", context)
@app.post("/api/delete_book/{book_id}")
async def delete_book_api(book_id: int):
    """Delete a single book from Calibre-Web and report whether it worked."""
    calibre_cfg = config.load().calibre
    succeeded, detail = delete_book(calibre_cfg, book_id)
    return {"ok": succeeded, "message": detail}
# Progress record of the background de-duplication job. It is mutated in place by
# _run_dedup and returned as-is by the status endpoint.
_dedup_state: dict = {"running": False, "deleted": 0, "failed": 0, "total": 0, "done": False, "error": None}


def _run_dedup():
    """Delete every duplicate book, keeping the lowest-ID copy of each group.

    Intended to run as a background task. Progress and the final outcome are
    published through ``_dedup_state``; exceptions are recorded there (and logged
    with a traceback) rather than raised.
    """
    # No ``global`` needed: the dict is only mutated, never rebound.
    try:
        cfg = config.load()
        log.info("Dedup: fetching all books ...")
        # One authenticated client is shared by all deletions to avoid a login per book.
        client = CalibreClient(cfg.calibre)
        client._ensure_auth()
        books = fetch_all_books(cfg.calibre)
        groups = find_duplicate_groups(books)
        # Within each group the first book after sorting by id is the keeper.
        to_delete = [
            dup
            for group in groups
            for dup in sorted(group, key=lambda x: x.get("id", 0))[1:]
        ]
        _dedup_state.update({"total": len(to_delete), "deleted": 0, "failed": 0})
        log.info("Dedup: %d duplicate(s) to delete across %d group(s)", len(to_delete), len(groups))
        for processed, book in enumerate(to_delete, start=1):
            ok, msg = delete_book(cfg.calibre, book["id"], client)
            if ok:
                _dedup_state["deleted"] += 1
            else:
                _dedup_state["failed"] += 1
                log.warning("Dedup: failed to delete book %d: %s", book["id"], msg)
            # Key the progress line on books processed, not on the deleted counter,
            # so a run of failures does not repeat the same line on every iteration.
            if processed % 10 == 0:
                log.info("Dedup progress: %d / %d deleted", _dedup_state["deleted"], _dedup_state["total"])
        log.info("Dedup done: %d deleted, %d failed", _dedup_state["deleted"], _dedup_state["failed"])
    except Exception as e:
        log.exception("Dedup error: %s", e)
        _dedup_state["error"] = str(e)
    finally:
        _dedup_state["running"] = False
        _dedup_state["done"] = True
@app.post("/api/delete_duplicates")
async def delete_duplicates_api(background_tasks: BackgroundTasks):
    """Start the background de-duplication job unless one is already running."""
    if _dedup_state["running"]:
        return {"ok": False, "message": "Already running"}

    # Reset every counter before scheduling so status pollers never see stale numbers.
    fresh_state = dict(running=True, deleted=0, failed=0, total=0, done=False, error=None)
    _dedup_state.update(fresh_state)
    background_tasks.add_task(_run_dedup)
    return {"ok": True, "message": "Started"}
@app.get("/api/delete_duplicates/status")
async def delete_duplicates_status():
    """Return the current de-duplication progress record (the shared ``_dedup_state`` dict)."""
    return _dedup_state
@app.get("/api/debug/calibre_books")
async def debug_calibre_books():
    """Summarise the shape of a raw /ajax/listbooks response to help identify field names.

    Requests the first five books and reports the HTTP status, the top-level keys,
    the non-list fields verbatim, and the length of every list field.
    """
    from uploader import CalibreClient

    cfg = config.load()
    client = CalibreClient(cfg.calibre)
    client._ensure_auth()
    query = {"draw": 1, "start": 0, "length": 5, "sort": "title", "order": "asc"}
    resp = client._session.get(f"{cfg.calibre.url}/ajax/listbooks", params=query, timeout=30)
    payload = resp.json()

    # Split the payload in one pass: list fields are reported by length only.
    scalar_fields: dict = {}
    list_lengths: dict = {}
    for key, value in payload.items():
        if isinstance(value, list):
            list_lengths[key] = len(value)
        else:
            scalar_fields[key] = value

    return {
        "http_status": resp.status_code,
        "top_level_keys": list(payload.keys()),
        "non_list_fields": scalar_fields,
        "list_fields_lengths": list_lengths,
    }
# --- Data reset ---
@app.post("/settings/reset-sync-data")
-1
View File
@@ -4,5 +4,4 @@ jinja2
python-multipart
paramiko
rarfile
requests
apscheduler
+19 -44
View File
@@ -1,4 +1,5 @@
import logging
import shutil
import threading
import time
from pathlib import Path
@@ -7,7 +8,6 @@ import config
import db
import extractor
import sftp as sftp_module
from uploader import CalibreClient, CalibreUnavailableError
log = logging.getLogger(__name__)
@@ -33,22 +33,23 @@ def run_sync(limit: int | None = None) -> None:
_running = True
run_id = db.start_sync_run()
counters = dict(zips_found=0, zips_new=0, books_uploaded=0, books_skipped=0, books_errored=0)
counters = dict(zips_found=0, zips_new=0, books_imported=0, books_errored=0)
try:
log.info("Sync started (limit=%s)", limit)
cfg = config.load()
_validate_config(cfg)
log.info("Config OK — work dir: %s", cfg.local_work_dir)
log.info("Config OK — work dir: %s, import dir: %s", cfg.work_dir, cfg.import_dir)
work_dir = Path(cfg.local_work_dir)
work_dir = Path(cfg.work_dir)
work_dir.mkdir(parents=True, exist_ok=True)
log.info("Work dir ready: %s", work_dir)
import_dir = Path(cfg.import_dir)
import_dir.mkdir(parents=True, exist_ok=True)
log.info("Connecting to SFTP %s@%s:%s ...", cfg.sftp.user, cfg.sftp.host, cfg.sftp.port)
new_zips = sftp_module.list_new_zips(cfg.sftp, max_results=limit)
counters["zips_found"] = len(new_zips)
counters["zips_new"] = len(new_zips)
if not new_zips:
@@ -56,21 +57,11 @@ def run_sync(limit: int | None = None) -> None:
db.finish_sync_run(run_id, status="success", **counters)
return
# Determine chunk size; 0 means process everything in one chunk
batch_size = int(db.get_setting("sync_batch_size", "0") or "0")
if batch_size <= 0:
batch_size = len(new_zips)
total_batches = -(-len(new_zips) // batch_size) # ceiling division
client = CalibreClient(cfg.calibre)
# Pre-load existing book titles so duplicate detection doesn't need per-book OPDS searches
try:
from uploader import fetch_all_books
existing = fetch_all_books(cfg.calibre)
client.preload_existing_titles(existing)
except Exception as exc:
log.warning("Could not pre-load existing books (%s) — will fall back to per-book OPDS search", exc)
total_batches = -(-len(new_zips) // batch_size)
for batch_num, i in enumerate(range(0, len(new_zips), batch_size), start=1):
chunk = new_zips[i : i + batch_size]
@@ -89,34 +80,21 @@ def run_sync(limit: int | None = None) -> None:
books = extractor.extract(local_zip, work_dir / "extracted")
log.info("Extract done in %.1fs — %d book(s)", time.monotonic() - t1, len(books))
books_errored_this_zip = 0
for book in books:
t2 = time.monotonic()
status = client.upload(book, zip_source=remote_zip.remote_path)
log.info("Upload '%s' %s (%.1fs)", book.name, status, time.monotonic() - t2)
time.sleep(2)
if status == "uploaded":
counters["books_uploaded"] += 1
elif status == "skipped_duplicate":
counters["books_skipped"] += 1
dest = import_dir / book.name
if dest.exists():
log.info("Skipping '%s' — already exists in import dir", book.name)
else:
counters["books_errored"] += 1
books_errored_this_zip += 1
if books_errored_this_zip:
zip_status = "error"
zip_error = f"{books_errored_this_zip} book upload(s) failed — will retry next sync"
shutil.move(str(book), str(dest))
log.info("Moved '%s'%s", book.name, import_dir)
counters["books_imported"] += 1
extractor.cleanup(work_dir / "extracted" / local_zip.stem)
except CalibreUnavailableError as e:
log.error("Calibre-Web unavailable — aborting sync run: %s", e)
db.mark_zip_processed(remote_zip.remote_path, remote_zip.file_size, "error", str(e))
db.finish_sync_run(run_id, status="error", error_msg=str(e), **counters)
return
except Exception as e:
log.error("Error processing %s: %s", remote_zip.remote_path, e)
zip_status = "error"
zip_error = str(e)
counters["books_errored"] += 1
finally:
if local_zip and local_zip.exists():
extractor.cleanup(local_zip)
@@ -126,9 +104,8 @@ def run_sync(limit: int | None = None) -> None:
db.finish_sync_run(run_id, status="success", **counters)
log.info(
"Sync complete. Total zips: %d, Uploaded: %d, Skipped: %d, Errors: %d",
counters["zips_new"], counters["books_uploaded"],
counters["books_skipped"], counters["books_errored"],
"Sync complete. Total zips: %d, Imported: %d, Errors: %d",
counters["zips_new"], counters["books_imported"], counters["books_errored"],
)
except Exception as e:
log.exception("Sync run failed: %s", e)
@@ -150,9 +127,7 @@ def _validate_config(cfg) -> None:
missing.append("SSH private key")
if cfg.sftp.auth_method == "password" and not cfg.sftp.password:
missing.append("SSH password")
if not cfg.calibre.url:
missing.append("Calibre-Web URL")
if not cfg.calibre.user:
missing.append("Calibre-Web username")
if not cfg.import_dir:
missing.append("CWA import folder")
if missing:
raise ValueError(f"Missing configuration: {', '.join(missing)}")
-46
View File
@@ -1,46 +0,0 @@
{% extends "base.html" %}
{% block title %}Books — CalibreSync{% endblock %}
{% block content %}
<div class="page-header">
<h1>Books <span class="muted">({{ total }})</span></h1>
</div>
{% if books %}
<table>
<thead>
<tr>
<th>Filename</th>
<th>Status</th>
<th>Source zip</th>
<th>Uploaded</th>
</tr>
</thead>
<tbody>
{% for b in books %}
<tr>
<td>{{ b.filename }}</td>
<td><span class="badge badge-{{ b.status }}">{{ b.status }}</span></td>
<td class="mono small muted">{{ b.zip_source or "—" }}</td>
<td>{{ b.uploaded_at[:19].replace("T"," ") if b.uploaded_at else "—" }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% if pages > 1 %}
<div class="pagination">
{% if page > 1 %}
<a href="/books?page={{ page - 1 }}">&laquo; Prev</a>
{% endif %}
<span>Page {{ page }} of {{ pages }}</span>
{% if page < pages %}
<a href="/books?page={{ page + 1 }}">Next &raquo;</a>
{% endif %}
</div>
{% endif %}
{% else %}
<p class="muted">No books recorded yet.</p>
{% endif %}
{% endblock %}
-124
View File
@@ -1,124 +0,0 @@
{% extends "base.html" %}
{% block title %}Duplicates — CalibreSync{% endblock %}
{% block content %}
<div class="page-header">
<h1>Duplicate books in Calibre-Web</h1>
{% if groups %}
<div class="header-actions">
<button class="btn btn-danger" onclick="deleteAll(this)">Delete all duplicates (keep oldest)</button>
</div>
{% endif %}
</div>
<div id="dedup-progress" style="display:none" class="alert alert-success"></div>
{% if error %}
<div class="alert alert-warning">Could not fetch books from Calibre-Web: {{ error }}</div>
{% else %}
<p class="muted small" style="margin-bottom:1.5rem">
Scanned <strong>{{ total_books }}</strong> book(s) —
{% if groups %}
found <strong>{{ groups|length }}</strong> duplicate group(s) (same title + author).
The oldest copy (lowest ID) is kept when deleting all.
{% else %}
no duplicates found.
{% endif %}
</p>
{% for group in groups %}
<div class="form-section" style="margin-bottom:1rem">
<h3 style="margin-top:0">{{ group[0].title }}</h3>
<table>
<thead>
<tr>
<th>ID</th>
<th>Title</th>
<th>Authors</th>
<th>Format</th>
<th></th>
</tr>
</thead>
<tbody>
{% for book in group %}
<tr id="row-{{ book.id }}">
<td class="mono muted">{{ book.id }}</td>
<td>{{ book.title }}</td>
<td>{{ book.authors }}</td>
<td>{{ book.format or "—" }}</td>
<td>
<button class="btn btn-danger" style="padding:0.2rem 0.7rem;font-size:0.85rem"
onclick="deleteBook({{ book.id }}, this)">Delete</button>
<span id="status-{{ book.id }}" class="muted small" style="margin-left:0.5rem"></span>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endfor %}
{% endif %}
<script>
// Delete one book via the API after confirmation and reflect the outcome in its table row.
async function deleteBook(id, btn) {
  if (!confirm('Delete book ID ' + id + ' from Calibre-Web?')) return;

  const status = document.getElementById('status-' + id);
  // Re-arm the button and show why the deletion did not go through.
  const giveUp = (text) => {
    btn.disabled = false;
    btn.textContent = 'Delete';
    status.textContent = text;
  };

  btn.disabled = true;
  btn.textContent = 'Deleting…';
  try {
    const response = await fetch('/api/delete_book/' + id, {method: 'POST'});
    const data = await response.json();
    if (!data.ok) {
      giveUp('Failed: ' + data.message);
      status.style.color = 'var(--error, #f87171)';
      return;
    }
    document.getElementById('row-' + id).style.opacity = '0.35';
    btn.textContent = 'Deleted';
    status.textContent = '✓';
  } catch (e) {
    giveUp('Error: ' + e);
  }
}
// Start the server-side bulk de-duplication job and poll its status every 2 s until done.
async function deleteAll(btn) {
  if (!confirm('Delete all duplicates from Calibre-Web, keeping the oldest copy of each title+author? This cannot be undone.')) return;
  btn.disabled = true;
  btn.textContent = 'Starting…';
  const progress = document.getElementById('dedup-progress');
  progress.style.display = '';
  progress.textContent = 'Fetching book list from Calibre-Web…';

  // Show an error and hand the button back to the user.
  const fail = (message) => {
    progress.textContent = 'Error: ' + message;
    progress.className = 'alert alert-warning';
    btn.disabled = false;
    btn.textContent = 'Delete all duplicates (keep oldest)';
  };

  try {
    // An "Already running" reply is fine: polling below simply follows the job in flight.
    await fetch('/api/delete_duplicates', {method: 'POST'});
  } catch (e) {
    // Without this the button stayed disabled forever when the request failed.
    fail(e);
    return;
  }

  const poll = setInterval(async () => {
    let s;
    try {
      const r = await fetch('/api/delete_duplicates/status');
      s = await r.json();
    } catch (e) {
      // A failed poll does not mean the job failed; report it and try again next tick.
      progress.textContent = 'Status check failed, retrying… (' + e + ')';
      return;
    }
    if (s.error) {
      clearInterval(poll);
      fail(s.error);
      return;
    }
    if (s.total > 0) {
      progress.textContent = `Deleting… ${s.deleted} / ${s.total} deleted, ${s.failed} failed`;
    } else {
      progress.textContent = 'Scanning for duplicates…';
    }
    if (s.done) {
      clearInterval(poll);
      progress.textContent = `Done — ${s.deleted} book(s) deleted, ${s.failed} failed. Reload to refresh the list.`;
      btn.textContent = 'Reload';
      btn.disabled = false;
      btn.onclick = () => location.reload();
    }
  }, 2000);
}
</script>
{% endblock %}
+4 -14
View File
@@ -59,16 +59,8 @@
<div class="stat-label">Zip archives processed</div>
</div>
<div class="stat-card">
<div class="stat-value">{{ stats.uploaded }}</div>
<div class="stat-label">Books uploaded</div>
</div>
<div class="stat-card">
<div class="stat-value">{{ stats.skipped }}</div>
<div class="stat-label">Duplicates skipped</div>
</div>
<div class="stat-card">
<div class="stat-value">{{ stats.total_books }}</div>
<div class="stat-label">Total book records</div>
<div class="stat-value">{{ stats.total_imported }}</div>
<div class="stat-label">Books imported</div>
</div>
</div>
@@ -81,8 +73,7 @@
<th>Finished</th>
<th>Status</th>
<th>New zips</th>
<th>Uploaded</th>
<th>Skipped</th>
<th>Imported</th>
<th>Errors</th>
</tr>
</thead>
@@ -93,8 +84,7 @@
<td>{{ r.finished_at[:19].replace("T"," ") if r.finished_at else "—" }}</td>
<td><span class="badge badge-{{ r.status }}">{{ r.status }}</span></td>
<td>{{ r.zips_new }}</td>
<td>{{ r.books_uploaded }}</td>
<td>{{ r.books_skipped }}</td>
<td>{{ r.books_imported }}</td>
<td>{{ r.books_errored }}</td>
</tr>
{% endfor %}
+11 -30
View File
@@ -84,39 +84,20 @@
</div>
</section>
<section class="form-section">
<h2>Calibre-Web</h2>
<div class="form-row">
<label for="calibre_url">URL</label>
<input id="calibre_url" name="calibre_url" type="url" placeholder="http://localhost:8083"
value="{{ s.get('calibre_url','') }}">
</div>
<div class="form-row">
<label for="calibre_user">Username</label>
<input id="calibre_user" name="calibre_user" type="text" value="{{ s.get('calibre_user','') }}">
</div>
<div class="form-row">
<label for="calibre_pass">Password</label>
<input id="calibre_pass" name="calibre_pass" type="password"
value="{{ s.get('calibre_pass','') }}">
</div>
<div class="form-row">
<button type="button" class="btn btn-secondary" onclick="testConn('calibre', this)">Test Calibre-Web connection</button>
<p id="test-calibre-result" class="test-result"></p>
</div>
</section>
<section class="form-section">
<h2>Local</h2>
<div class="form-row">
<label for="local_work_dir">Work directory</label>
<input id="local_work_dir" name="local_work_dir" type="text" placeholder="/tmp/calibresync"
value="{{ s.get('local_work_dir','/tmp/calibresync') }}">
<label for="import_dir">CWA import folder</label>
<input id="import_dir" name="import_dir" type="text" placeholder="/mnt/cwa-import"
value="{{ s.get('import_dir','') }}">
<p class="muted small">Folder watched by Calibre-Web-Automated. Extracted epub/pdf files are moved here flat.</p>
</div>
<div class="form-row">
<label for="work_dir">Temp work directory</label>
<input id="work_dir" name="work_dir" type="text" placeholder="/tmp/calibresync"
value="{{ s.get('work_dir','/tmp/calibresync') }}">
<p class="muted small">Temporary storage for downloaded zips and extracted files. Cleaned up after each run.</p>
</div>
</section>
@@ -182,7 +163,7 @@ async function testConn(type, btn) {
result.className = "test-result test-fail";
} finally {
btn.disabled = false;
btn.textContent = type === "ssh" ? "Test SSH connection" : "Test Calibre-Web connection";
btn.textContent = "Test SSH connection";
}
}
</script>
-413
View File
@@ -1,413 +0,0 @@
import hashlib
import logging
import re
import time
import unicodedata
from pathlib import Path
from urllib.parse import quote
import requests
import db
from config import CalibreConfig
log = logging.getLogger(__name__)
# Content type sent with an upload, keyed by lower-cased file suffix.
MIME_TYPES = {
    ".epub": "application/epub+zip",
    ".pdf": "application/pdf",
}

# Words stripped before comparing titles — release-group tags, language codes, format names, etc.
# Both filename keywords and stored titles are filtered through this set.
_JUNK_WORDS = {
    "retail", "epub", "ebook", "pdf", "mobi", "azw3", "decipher",
    "swedish", "english", "danish", "norwegian", "finnish", "german", "french",
    "the", "a", "an", "och", "und", "les", "der", "die", "das",
}
class CalibreUnavailableError(RuntimeError):
    """Raised when Calibre-Web returns repeated 502/503/504 — sync run should abort.

    CalibreClient.upload raises it once three consecutive books have exhausted
    their retries on one of those status codes.
    """
class CalibreClient:
    """Authenticated Calibre-Web session that uploads books and skips likely duplicates.

    An instance keeps one ``requests.Session``, the CSRF token scraped at login and
    a counter of consecutive gateway failures, so it is meant to be reused across
    all uploads of a run.
    """

    def __init__(self, cfg: CalibreConfig):
        """Remember *cfg* and create a session; login is deferred to ``_ensure_auth``."""
        self._cfg = cfg
        self._session = requests.Session()
        self._authenticated = False
        self._upload_csrf: str | None = None
        self._consecutive_failures = 0
        # Pre-loaded title word-sets for fast duplicate detection (set by preload_existing_titles)
        self._existing_title_sets: list[frozenset[str]] | None = None

    def preload_existing_titles(self, books: list[dict]) -> None:
        """Build an in-memory index of normalised title keywords from a pre-fetched book list."""
        self._existing_title_sets = [
            frozenset(_normalize_words(b.get("title", "")))
            for b in books
            if b.get("title")
        ]
        log.info("Pre-loaded %d existing book titles for duplicate detection", len(self._existing_title_sets))

    def _ensure_auth(self) -> None:
        """Log in through the HTML form once per client; later calls are no-ops.

        Raises:
            RuntimeError: the login POST ended on the login page again.
            requests.HTTPError: the login page or the POST returned an error status.
        """
        if self._authenticated:
            return
        login_url = f"{self._cfg.url}/login"
        page = self._session.get(login_url, timeout=30)
        page.raise_for_status()
        csrf = _extract_csrf(page.text)
        data = {"username": self._cfg.user, "password": self._cfg.password}
        if csrf:
            data["csrf_token"] = csrf
        resp = self._session.post(login_url, data=data, allow_redirects=True, timeout=30)
        resp.raise_for_status()
        # A failed login is detected by the final URL still being the login page.
        if resp.url.rstrip("/").endswith("/login"):
            raise RuntimeError("Calibre-Web authentication failed — check credentials")
        self._authenticated = True
        # Prefer a token from the post-login page; fall back to the login-page token.
        self._upload_csrf = _extract_csrf(resp.text) or csrf
        log.info("Authenticated to Calibre-Web at %s", self._cfg.url)

    @staticmethod
    def _is_title_match(our_words: set[str], their_words) -> bool:
        """Return True when filename keywords and a stored title look like the same book.

        Match if: 3+ words in common, OR 60%+ of the filename keywords match the title,
        OR 60%+ of the stored title's words (for titles of 2+ words) appear in the
        filename keywords. The third condition catches short titles drowned out by
        filename noise. *our_words* must be non-empty; an empty *their_words* never matches.
        """
        if not their_words:
            return False
        overlap = len(our_words & their_words)
        return (
            overlap >= 3
            or (overlap / len(our_words) >= 0.6)
            or (len(their_words) >= 2 and overlap / len(their_words) >= 0.6)
        )

    def _exists_in_calibre(self, filename: str) -> bool:
        """Check whether a book already exists in Calibre-Web. Returns True if likely duplicate."""
        keywords = _keywords_from_filename(filename)
        # Fewer than two keywords is too little to compare on.
        if len(keywords) < 2:
            return False
        our_words = set(keywords)

        # Fast path: check pre-loaded title index (available when sync pre-fetches all books)
        if self._existing_title_sets is not None:
            for their_words in self._existing_title_sets:
                if self._is_title_match(our_words, their_words):
                    log.info("Duplicate (preloaded index): '%s'", filename)
                    return True
            return False

        # Slow path fallback: OPDS search (used when no index is available)
        query = " ".join(keywords[:6])
        try:
            resp = self._session.get(
                f"{self._cfg.url}/opds/search/{quote(query, safe='')}",
                auth=(self._cfg.user, self._cfg.password),
                timeout=15,
            )
            if resp.status_code == 404:
                return False
            calibre_titles = _parse_opds_titles(resp.text)
            if not calibre_titles:
                return False
            for title in calibre_titles:
                if self._is_title_match(our_words, set(_normalize_words(title))):
                    log.info("Duplicate (OPDS search): '%s'", filename)
                    return True
        except Exception as e:
            # Best effort: a failed search must not block the upload.
            log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
        return False

    def upload(self, book_path: Path, zip_source: str) -> str:
        """Upload a book file. Returns status: 'uploaded' | 'skipped_duplicate' | 'error'.

        Every outcome is recorded through ``db.record_book``. A 502/503/504 is
        retried up to three times with a 180 s pause; a 400 on the first attempt
        triggers one re-authentication and retry.

        Raises:
            CalibreUnavailableError: three consecutive books ran out of retries on
                502/503/504.
        """
        file_hash = _sha256(book_path)

        # Primary guard: hash already in our DB
        if db.is_book_uploaded(file_hash):
            log.info("Skipping (already uploaded): %s", book_path.name)
            db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
            return "skipped_duplicate"

        try:
            self._ensure_auth()

            # Secondary guard: title search in Calibre-Web (catches pre-existing books)
            if self._exists_in_calibre(book_path.name):
                log.info("Skipping (exists in Calibre-Web): %s", book_path.name)
                db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
                return "skipped_duplicate"

            mime = MIME_TYPES.get(book_path.suffix.lower(), "application/octet-stream")
            for attempt in range(1, 4):
                try:
                    with book_path.open("rb") as fh:
                        resp = self._session.post(
                            f"{self._cfg.url}/upload",
                            files={"btn-upload": (book_path.name, fh, mime)},
                            data={"csrf_token": self._upload_csrf} if self._upload_csrf else {},
                            timeout=120,
                        )
                    if not resp.ok:
                        log.error("Upload HTTP %s (attempt %d/3) — body: %s", resp.status_code, attempt, resp.text[:300])
                    resp.raise_for_status()
                    log.info("Uploaded: %s", book_path.name)
                    self._consecutive_failures = 0
                    db.record_book(book_path.name, file_hash, zip_source, "uploaded")
                    # Add to in-session index so a later zip with the same title is skipped
                    if self._existing_title_sets is not None:
                        kw = frozenset(_keywords_from_filename(book_path.name))
                        if kw:
                            self._existing_title_sets.append(kw)
                    return "uploaded"
                except requests.HTTPError:
                    if resp.status_code in (502, 503, 504):
                        if attempt < 3:
                            log.warning("HTTP %s on attempt %d/3 — retrying in 180s ...", resp.status_code, attempt)
                            time.sleep(180)
                            continue
                        # All retries exhausted
                        self._consecutive_failures += 1
                        if self._consecutive_failures >= 3:
                            raise CalibreUnavailableError(
                                f"Calibre-Web returned {resp.status_code} on {self._consecutive_failures} "
                                "consecutive books — aborting sync run"
                            )
                        break
                    if resp.status_code == 400 and attempt == 1:
                        log.warning("HTTP 400 — CSRF token likely expired, re-authenticating ...")
                        self._authenticated = False
                        self._upload_csrf = None
                        self._ensure_auth()
                        continue
                    break

            # Reached only when the retry loop gave up without a successful upload.
            db.record_book(book_path.name, file_hash, zip_source, "error")
            return "error"

        except CalibreUnavailableError:
            # Record the failure, then let the caller abort the run.
            db.record_book(book_path.name, file_hash, zip_source, "error")
            raise
        except Exception as e:
            log.error("Upload failed for %s: %s", book_path.name, e)
            db.record_book(book_path.name, file_hash, zip_source, "error")
            return "error"
def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
    """Fetch every book from Calibre-Web. Tries /ajax/listbooks first; falls back to OPDS if pagination is broken.

    Rows are de-duplicated by ``id`` across pages. Paging stops on an empty page,
    on a page that adds nothing new, or once the server-reported total is reached.
    When the server reports no total at all, paging continues until one of the
    first two conditions holds instead of stopping after the first page.

    Raises:
        requests.HTTPError: a listbooks request returned an error status.
    """
    client = CalibreClient(cfg)
    client._ensure_auth()

    all_books: list[dict] = []
    seen_ids: set = set()
    page_size = 1000
    start = 0
    reported_total = 0

    while True:
        resp = client._session.get(
            f"{cfg.url}/ajax/listbooks",
            params={
                "draw": 1,
                # Both parameter spellings are sent for the offset and page size.
                "start": start, "length": page_size,
                "iDisplayStart": start, "iDisplayLength": page_size,
            },
            timeout=60,
        )
        resp.raise_for_status()
        data = resp.json()

        if start == 0:
            non_list = {k: v for k, v in data.items() if not isinstance(v, list)}
            log.info("listbooks page-0 meta fields: %s", non_list)

        rows = data.get("rows") or data.get("data") or []
        reported_total = (
            data.get("recordsTotal") or data.get("total_count") or
            data.get("total") or data.get("totalNotFiltered") or 0
        )

        new_in_page = 0
        for b in rows:
            bid = b.get("id")
            if bid not in seen_ids:
                seen_ids.add(bid)
                all_books.append(b)
                new_in_page += 1

        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)

        if not rows or new_in_page == 0:
            break
        # Only trust the total when one was reported; with reported_total == 0 the
        # old ">= total" test was always true and truncated the list to one page.
        if reported_total and len(all_books) >= reported_total:
            break
        start += len(rows)

    # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
    if reported_total > 0 and len(all_books) < reported_total // 2:
        log.warning(
            "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
            len(all_books), reported_total,
        )
        return _fetch_all_books_opds(cfg)

    return all_books
def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
    """Fetch all books via OPDS catalog, following next-page links.

    Starts at ``/opds/new`` and returns dicts with ``id``, ``title`` and ``authors``
    (author names joined with " & "). The book id is taken from the first
    ``/download/<id>/`` link of an entry; entries without one are skipped. Fetching
    stops quietly (returning what was collected) on an HTTP error, an XML parse
    error, a page with no entries, or a "next" link that was already visited.
    """
    import xml.etree.ElementTree as ET

    def local_name(tag: str) -> str:
        # Drop the "{namespace}" prefix ElementTree puts in front of tag names.
        return tag.split("}")[-1]

    books: list[dict] = []
    seen_ids: set = set()
    visited: set[str] = set()
    url: str | None = f"{cfg.url}/opds/new"
    auth = (cfg.user, cfg.password)

    # The context manager closes the session's connections when paging ends.
    with requests.Session() as session:
        # The visited check prevents an endless loop on a self-referential "next" link.
        while url and url not in visited:
            visited.add(url)
            resp = session.get(url, auth=auth, timeout=30)
            if not resp.ok:
                log.warning("OPDS fetch failed HTTP %s for %s", resp.status_code, url)
                break
            try:
                root = ET.fromstring(resp.content)
            except ET.ParseError as exc:
                log.warning("OPDS XML parse error: %s", exc)
                break

            next_url: str | None = None
            entries_this_page = 0
            for elem in root:
                local = local_name(elem.tag)
                if local == "link" and elem.get("rel") == "next":
                    href = elem.get("href", "")
                    next_url = href if href.startswith("http") else f"{cfg.url}{href}"
                elif local == "entry":
                    entries_this_page += 1
                    title = ""
                    author_parts: list[str] = []
                    book_id: int | None = None
                    for child in elem:
                        ctag = local_name(child.tag)
                        if ctag == "title":
                            title = child.text or ""
                        elif ctag == "author":
                            for gc in child:
                                if local_name(gc.tag) == "name":
                                    author_parts.append(gc.text or "")
                        elif ctag == "link":
                            m = re.search(r"/download/(\d+)/", child.get("href", ""))
                            if m and book_id is None:
                                book_id = int(m.group(1))
                    if book_id and book_id not in seen_ids:
                        seen_ids.add(book_id)
                        books.append({"id": book_id, "title": title, "authors": " & ".join(author_parts)})

            log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page)
            if not entries_this_page:
                break
            url = next_url

    return books
def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
    """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead.

    Returns ``(True, "Deleted")`` on success, otherwise ``(False, reason)``. A 400
    on the first attempt is treated as an expired CSRF token: the client
    re-authenticates and the delete is retried once.
    """
    if client is None:
        client = CalibreClient(cfg)
    client._ensure_auth()

    csrf = client._upload_csrf
    if not csrf:
        # Try to fetch a CSRF token from the book detail page
        try:
            page = client._session.get(f"{cfg.url}/book/{book_id}", timeout=15)
            csrf = _extract_csrf(page.text)
            client._upload_csrf = csrf
        except Exception as exc:
            # Best effort: the delete is still attempted without a token, but the
            # reason is logged instead of being silently discarded.
            log.debug("delete_book: could not fetch CSRF token for book %d: %s", book_id, exc)

    for attempt in range(2):
        resp = client._session.post(
            f"{cfg.url}/delete/{book_id}",
            data={"csrf_token": csrf} if csrf else {},
            timeout=30,
        )
        if resp.ok:
            return True, "Deleted"
        if resp.status_code == 400 and attempt == 0:
            # CSRF token likely expired; re-authenticate and retry once
            log.info("delete_book: 400 on book %d — refreshing CSRF and retrying", book_id)
            client._authenticated = False
            client._upload_csrf = None
            client._ensure_auth()
            csrf = client._upload_csrf
            continue
        return False, f"HTTP {resp.status_code}"
    return False, "HTTP 400 after re-auth retry"
def find_duplicate_groups(books: list[dict]) -> list[list[dict]]:
    """Group books by normalised title+author; return only groups with 2+ entries.

    Normalisation lower-cases the text, turns punctuation into spaces and
    collapses whitespace. Books whose normalised title is empty are ignored. A
    ``title`` or ``authors`` key that is present but ``None`` is treated like a
    missing key instead of raising ``AttributeError``. Groups are sorted by the
    lower-cased title of their first book.
    """
    from collections import defaultdict

    def normalise(value) -> str:
        # "or ''" covers both a missing key and an explicit None value.
        text = re.sub(r"[^\w\s]", " ", (value or "").lower())
        return re.sub(r"\s+", " ", text).strip()

    groups: dict[str, list[dict]] = defaultdict(list)
    for book in books:
        title = normalise(book.get("title"))
        if not title:
            continue
        authors = normalise(book.get("authors"))
        groups[f"{title}||{authors}"].append(book)

    return sorted(
        (g for g in groups.values() if len(g) > 1),
        key=lambda g: (g[0].get("title") or "").lower(),
    )
def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
    """Try to log in with *cfg*; return ``(ok, message)`` and never raise."""
    try:
        probe = CalibreClient(cfg)
        probe._ensure_auth()
        success_message = f"Authenticated to {cfg.url} as '{cfg.user}'."
        return True, success_message
    except Exception as exc:
        # The exception text is what the settings page shows to the user.
        return False, str(exc)
# --- Helpers ---
def _ascii_fold(s: str) -> str:
"""Strip accents: 'världens''varldens', 'väg''vag'."""
return "".join(c for c in unicodedata.normalize("NFKD", s) if unicodedata.category(c) != "Mn")
def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The stem is lower-cased and accent-folded, ``.``, ``_`` and ``-`` become
    spaces, and remaining punctuation is removed. Junk words, purely numeric
    tokens (years included) and single-character tokens are dropped.
    """
    folded = _ascii_fold(Path(filename).stem.lower())
    spaced = re.sub(r"[._\-]", " ", folded)
    cleaned = re.sub(r"[^\w\s]", "", spaced)

    numeric_patterns = (r"^\d{4}$", r"^\d+$")
    keywords: list[str] = []
    for token in cleaned.split():
        if token in _JUNK_WORDS:
            continue
        if any(re.match(pattern, token) for pattern in numeric_patterns):
            continue
        if len(token) <= 1:
            continue
        keywords.append(token)
    return keywords
def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison.

    Lower-cases and accent-folds the title, removes punctuation, then drops junk
    words and single-character tokens.
    """
    cleaned = re.sub(r"[^\w\s]", "", _ascii_fold(title.lower()))
    words: list[str] = []
    for token in cleaned.split():
        if len(token) > 1 and token not in _JUNK_WORDS:
            words.append(token)
    return words
def _parse_opds_titles(xml: str) -> list[str]:
"""Extract book titles from an OPDS Atom feed, skipping the feed title itself."""
# Grab all <title> elements; the first is the feed title ("Search results"), rest are books
titles = re.findall(r"<title>([^<]+)</title>", xml)
return titles[1:] if len(titles) > 1 else []
def _extract_csrf(html: str) -> str | None:
m = re.search(r'name="csrf_token"\s+value="([^"]+)"', html)
if not m:
m = re.search(r'value="([^"]+)"\s+name="csrf_token"', html)
return m.group(1) if m else None
def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()