check for doubles

This commit is contained in:
2026-05-10 23:02:59 +02:00
parent 2034bd5f2b
commit 09afe40f04
3 changed files with 97 additions and 8 deletions
+45
View File
@@ -214,6 +214,51 @@ async def delete_book_api(book_id: int):
return {"ok": ok, "message": message}
# Shared progress record for the bulk de-duplication job. It is mutated in
# place by `_run_dedup` and by the `/api/delete_duplicates` endpoints.
_dedup_state: dict = {
    "running": False,  # True while a job is queued or executing
    "deleted": 0,      # books deleted successfully so far
    "failed": 0,       # delete attempts that reported failure
    "total": 0,        # number of duplicates selected for deletion
    "done": False,     # set once a job has finished (with or without error)
    "error": None,     # message of the exception that aborted the job, if any
}
def _run_dedup():
    """Delete every duplicate book, keeping the lowest-ID copy of each group.

    Loads the configuration, fetches all books, groups them with
    `find_duplicate_groups`, and deletes every member of each group except the
    one with the smallest ``id``. Progress is published through the
    module-level `_dedup_state` dict so the status endpoint can report it.

    Exceptions never propagate: they are logged with their traceback and their
    message is stored in ``_dedup_state["error"]``. Whatever happens, the state
    ends with ``running=False`` and ``done=True``.
    """
    try:
        cfg = config.load()
        log.info("Dedup: fetching all books ...")
        books = fetch_all_books(cfg.calibre)
        groups = find_duplicate_groups(books)
        # Sort each group by ID and drop its first entry: the lowest ID is the
        # copy that survives, everything after it is queued for deletion.
        to_delete = [
            book
            for group in groups
            for book in sorted(group, key=lambda b: b.get("id", 0))[1:]
        ]
        _dedup_state.update({"total": len(to_delete), "deleted": 0, "failed": 0})
        log.info("Dedup: %d duplicate(s) to delete across %d group(s)", len(to_delete), len(groups))
        for processed, book in enumerate(to_delete, start=1):
            ok, msg = delete_book(cfg.calibre, book["id"])
            if ok:
                _dedup_state["deleted"] += 1
            else:
                _dedup_state["failed"] += 1
                log.warning("Dedup: failed to delete book %d: %s", book["id"], msg)
            # Report every 10 processed books. Keying this on the deleted
            # counter would log on every iteration while that counter sits at
            # 0 or a multiple of 10 (e.g. during a run of failures).
            if processed % 10 == 0:
                log.info(
                    "Dedup progress: %d / %d processed (%d deleted, %d failed)",
                    processed,
                    _dedup_state["total"],
                    _dedup_state["deleted"],
                    _dedup_state["failed"],
                )
        log.info("Dedup done: %d deleted, %d failed", _dedup_state["deleted"], _dedup_state["failed"])
    except Exception as e:
        # Top-level boundary of the background job: keep the traceback in the
        # log and expose only the message to the status endpoint.
        log.exception("Dedup error: %s", e)
        _dedup_state["error"] = str(e)
    finally:
        # Flip both flags in one update so a reader never sees the job as
        # neither running nor done.
        _dedup_state.update({"running": False, "done": True})
@app.post("/api/delete_duplicates")
async def delete_duplicates_api(background_tasks: BackgroundTasks):
    """Start the bulk de-duplication job unless one is already in progress.

    Returns ``{"ok": True, "message": "Started"}`` after resetting
    `_dedup_state` and queueing `_run_dedup` through
    ``background_tasks.add_task``, or
    ``{"ok": False, "message": "Already running"}`` when the state is still
    flagged as running.
    """
    if not _dedup_state["running"]:
        # Clear the counters and flags of any previous run before queueing, so
        # a status poll never reports stale results for the new job.
        _dedup_state.update(
            running=True,
            deleted=0,
            failed=0,
            total=0,
            done=False,
            error=None,
        )
        background_tasks.add_task(_run_dedup)
        return {"ok": True, "message": "Started"}
    return {"ok": False, "message": "Already running"}
@app.get("/api/delete_duplicates/status")
async def delete_duplicates_status():
    """Return the current progress of the bulk de-duplication job.

    The response carries the keys of `_dedup_state`: ``running``, ``deleted``,
    ``failed``, ``total``, ``done`` and ``error``.
    """
    # Hand back a snapshot, not the live dict: `_run_dedup` keeps mutating
    # `_dedup_state` while the response is being built.
    return dict(_dedup_state)
@app.get("/api/debug/calibre_books")
async def debug_calibre_books():
"""Show raw Calibre-Web listbooks response shape so we can identify field names."""
+45 -2
View File
@@ -4,7 +4,14 @@
{% block content %}
<div class="page-header">
<h1>Duplicate books in Calibre-Web</h1>
{% if groups %}
<div class="header-actions">
<button class="btn btn-danger" onclick="deleteAll(this)">Delete all duplicates (keep oldest)</button>
</div>
{% endif %}
</div>
<div id="dedup-progress" style="display:none" class="alert alert-success"></div>
{% if error %}
<div class="alert alert-warning">Could not fetch books from Calibre-Web: {{ error }}</div>
@@ -12,8 +19,8 @@
<p class="muted small" style="margin-bottom:1.5rem">
Scanned <strong>{{ total_books }}</strong> book(s) —
{% if groups %}
found <strong>{{ groups|length }}</strong> duplicate group(s).
Books are grouped by normalised title. Keep the one you want and delete the rest.
found <strong>{{ groups|length }}</strong> duplicate group(s) (same title + author).
The oldest copy (lowest ID) is kept when deleting all.
{% else %}
no duplicates found.
{% endif %}
@@ -77,5 +84,41 @@ async function deleteBook(id, btn) {
status.textContent = 'Error: ' + e;
}
}
// Start the server-side "delete all duplicates" job and poll its status every
// two seconds until it reports an error or completion.
//   btn: the button that triggered the action. It is disabled while the job
//        runs, restored on failure, and turned into a "Reload" button on success.
async function deleteAll(btn) {
  if (!confirm('Delete all duplicates from Calibre-Web, keeping the oldest copy of each title+author? This cannot be undone.')) return;

  const idleLabel = 'Delete all duplicates (keep oldest)';
  const progress = document.getElementById('dedup-progress');

  // Show why the run stopped and put the button back into a retryable state.
  const fail = (reason) => {
    progress.textContent = 'Error: ' + reason;
    progress.className = 'alert alert-warning';
    btn.disabled = false;
    btn.textContent = idleLabel;
  };

  btn.disabled = true;
  btn.textContent = 'Starting…';
  // Reset the styling in case an earlier attempt left the warning class behind.
  progress.className = 'alert alert-success';
  progress.style.display = '';
  progress.textContent = 'Fetching book list from Calibre-Web…';

  // Only start polling once the server has accepted the request; otherwise the
  // status endpoint may still describe a previous, already finished run.
  try {
    const started = await fetch('/api/delete_duplicates', {method: 'POST'});
    if (!started.ok) {
      fail('HTTP ' + started.status);
      return;
    }
    // A JSON reply with ok=false means a run is already in progress; polling
    // below then simply follows that run.
  } catch (e) {
    fail(e);
    return;
  }

  let inFlight = false;  // prevents overlapping polls when a response is slow
  const poll = setInterval(async () => {
    if (inFlight) return;
    inFlight = true;
    try {
      const r = await fetch('/api/delete_duplicates/status');
      if (!r.ok) throw new Error('HTTP ' + r.status);
      const s = await r.json();

      if (s.error) {
        clearInterval(poll);
        fail(s.error);
        return;
      }

      if (s.total > 0) {
        progress.textContent = `Deleting… ${s.deleted} / ${s.total} deleted, ${s.failed} failed`;
      } else {
        progress.textContent = 'Scanning for duplicates…';
      }

      if (s.done) {
        clearInterval(poll);
        progress.textContent = `Done — ${s.deleted} book(s) deleted, ${s.failed} failed. Reload to refresh the list.`;
        btn.textContent = 'Reload';
        btn.disabled = false;
        btn.onclick = () => location.reload();
      }
    } catch (e) {
      // Network or parse failure: stop polling instead of leaving an interval
      // that rejects forever. Clicking the button again re-attaches to the run.
      clearInterval(poll);
      fail(e);
    } finally {
      inFlight = false;
    }
  }, 2000);
}
</script>
{% endblock %}
+7 -6
View File
@@ -198,15 +198,16 @@ def delete_book(cfg: CalibreConfig, book_id: int) -> tuple[bool, str]:
def find_duplicate_groups(books: list[dict]) -> list[list[dict]]:
"""Group books by normalised title; return only groups with 2+ entries."""
"""Group books by normalised title+author; return only groups with 2+ entries."""
from collections import defaultdict
groups: dict[str, list[dict]] = defaultdict(list)
for book in books:
title = book.get("title", "")
# Normalise: lowercase, strip punctuation and extra whitespace — no word removal
key = re.sub(r"[^\w\s]", " ", title.lower())
key = re.sub(r"\s+", " ", key).strip()
if key:
title = re.sub(r"[^\w\s]", " ", book.get("title", "").lower())
title = re.sub(r"\s+", " ", title).strip()
authors = re.sub(r"[^\w\s]", " ", book.get("authors", "").lower())
authors = re.sub(r"\s+", " ", authors).strip()
key = f"{title}||{authors}"
if title:
groups[key].append(book)
return sorted(
[g for g in groups.values() if len(g) > 1],