check for doubles

This commit is contained in:
2026-05-12 12:54:43 +02:00
parent 3cf3e07059
commit 3ee5b0c24d
+69 -8
View File
@@ -158,21 +158,20 @@ class CalibreClient:
def fetch_all_books(cfg: CalibreConfig) -> list[dict]: def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
"""Fetch every book from Calibre-Web via /ajax/listbooks. Returns raw row dicts.""" """Fetch every book from Calibre-Web. Tries /ajax/listbooks first; falls back to OPDS if pagination is broken."""
client = CalibreClient(cfg) client = CalibreClient(cfg)
client._ensure_auth() client._ensure_auth()
all_books: list[dict] = [] all_books: list[dict] = []
seen_ids: set = set() seen_ids: set = set()
page_size = 1000 page_size = 1000
start = 0 start = 0
reported_total = 0
while True: while True:
resp = client._session.get( resp = client._session.get(
f"{cfg.url}/ajax/listbooks", f"{cfg.url}/ajax/listbooks",
params={ params={
"draw": 1, "draw": 1,
# DataTables 1.10+ names
"start": start, "length": page_size, "start": start, "length": page_size,
# DataTables 1.9.x names (older Calibre-Web)
"iDisplayStart": start, "iDisplayLength": page_size, "iDisplayStart": start, "iDisplayLength": page_size,
}, },
timeout=60, timeout=60,
@@ -182,9 +181,8 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
if start == 0: if start == 0:
non_list = {k: v for k, v in data.items() if not isinstance(v, list)} non_list = {k: v for k, v in data.items() if not isinstance(v, list)}
log.info("listbooks page-0 meta fields: %s", non_list) log.info("listbooks page-0 meta fields: %s", non_list)
# Calibre-Web uses DataTables format: "data"/"recordsTotal", older versions use "rows"/"total_count"
rows = data.get("rows") or data.get("data") or [] rows = data.get("rows") or data.get("data") or []
total = ( reported_total = (
data.get("recordsTotal") or data.get("total_count") or data.get("recordsTotal") or data.get("total_count") or
data.get("total") or data.get("totalNotFiltered") or 0 data.get("total") or data.get("totalNotFiltered") or 0
) )
@@ -195,14 +193,77 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
seen_ids.add(bid) seen_ids.add(bid)
all_books.append(b) all_books.append(b)
new_in_page += 1 new_in_page += 1
log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), total, new_in_page) log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)
# Stop when: empty page, no new books (start is being ignored), or we've seen everything if not rows or new_in_page == 0 or len(all_books) >= reported_total:
if not rows or new_in_page == 0 or len(all_books) >= total:
break break
start += len(rows) start += len(rows)
# If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
if reported_total > 0 and len(all_books) < reported_total // 2:
log.warning(
"listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
len(all_books), reported_total,
)
return _fetch_all_books_opds(cfg)
return all_books return all_books
def _opds_local_name(tag: str) -> str:
    """Return an ElementTree tag name without its ``{namespace}`` prefix."""
    return tag.rsplit("}", 1)[-1]


def _parse_opds_entry(entry) -> "dict | None":
    """Extract ``id``, ``title`` and ``authors`` from one OPDS ``<entry>`` element.

    The book ID is taken from the first child ``<link>`` whose ``href`` contains
    ``/download/<digits>/``. Returns ``None`` when no such link yields a usable
    (non-zero) ID, because the entry then cannot be tied to a book.
    """
    title = ""
    author_names: list[str] = []
    book_id: int | None = None
    for child in entry:
        name = _opds_local_name(child.tag)
        if name == "title":
            title = child.text or ""
        elif name == "author":
            author_names.extend(
                gc.text or "" for gc in child if _opds_local_name(gc.tag) == "name"
            )
        elif name == "link" and book_id is None:
            # Only the first matching download link decides the ID.
            match = re.search(r"/download/(\d+)/", child.get("href", ""))
            if match:
                book_id = int(match.group(1))
    if not book_id:
        return None
    return {"id": book_id, "title": title, "authors": " & ".join(author_names)}


def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
    """Fetch all books via the OPDS catalog, following ``rel="next"`` links.

    Starts at ``{cfg.url}/opds/new`` using HTTP basic auth built from
    ``cfg.user`` / ``cfg.password`` and walks the feed page by page.

    Returns:
        A list of ``{"id", "title", "authors"}`` dicts, one per distinct book ID.
        Entries without a recognisable download link are counted for logging but
        not returned.

    Paging stops (returning whatever was collected so far) when a page has no
    ``next`` link, has no entries, answers with a non-OK HTTP status, cannot be
    parsed as XML, or links to a page that was already fetched.
    """
    import xml.etree.ElementTree as ET
    from urllib.parse import urljoin

    books: list[dict] = []
    seen_ids: set = set()
    visited_urls: set = set()
    url: str | None = f"{cfg.url}/opds/new"
    auth = (cfg.user, cfg.password)

    # The context manager closes the session's pooled connections on every exit path.
    with requests.Session() as session:
        # The visited check protects against feeds whose "next" link loops back.
        while url and url not in visited_urls:
            visited_urls.add(url)
            resp = session.get(url, auth=auth, timeout=30)
            if not resp.ok:
                log.warning("OPDS fetch failed: HTTP %s for %s", resp.status_code, url)
                break
            try:
                root = ET.fromstring(resp.content)
            except ET.ParseError as exc:
                log.warning("OPDS XML parse error: %s", exc)
                break

            next_url: str | None = None
            entries_this_page = 0
            for elem in root:
                local = _opds_local_name(elem.tag)
                if local == "link" and elem.get("rel") == "next":
                    # Resolve relative hrefs against the page they came from;
                    # absolute hrefs pass through urljoin unchanged.
                    next_url = urljoin(url, elem.get("href", ""))
                elif local == "entry":
                    entries_this_page += 1
                    book = _parse_opds_entry(elem)
                    if book is not None and book["id"] not in seen_ids:
                        seen_ids.add(book["id"])
                        books.append(book)

            log.info(
                "OPDS fetched: %d books total (page had %d entries)",
                len(books), entries_this_page,
            )
            if not entries_this_page:
                break
            url = next_url
    return books
def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]: def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
"""Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead.""" """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead."""
if client is None: if client is None: