check for doubles

2026-05-12 12:54:43 +02:00
parent 3cf3e07059
commit 3ee5b0c24d
1 changed files with 69 additions and 8 deletions
@@ -158,21 +158,20 @@ class CalibreClient:


 def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
-    """Fetch every book from Calibre-Web via /ajax/listbooks. Returns raw row dicts."""
+    """Fetch every book from Calibre-Web. Tries /ajax/listbooks first; falls back to OPDS if pagination is broken."""
    client = CalibreClient(cfg)
    client._ensure_auth()
    all_books: list[dict] = []
    seen_ids: set = set()
    page_size = 1000
    start = 0
+    reported_total = 0
    while True:
        resp = client._session.get(
            f"{cfg.url}/ajax/listbooks",
            params={
                "draw": 1,
-                # DataTables 1.10+ names
                "start": start, "length": page_size,
-                # DataTables 1.9.x names (older Calibre-Web)
                "iDisplayStart": start, "iDisplayLength": page_size,
            },
            timeout=60,
@@ -182,9 +181,8 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
        if start == 0:
            non_list = {k: v for k, v in data.items() if not isinstance(v, list)}
            log.info("listbooks page-0 meta fields: %s", non_list)
-        # Calibre-Web uses DataTables format: "data"/"recordsTotal", older versions use "rows"/"total_count"
        rows = data.get("rows") or data.get("data") or []
-        total = (
+        reported_total = (
            data.get("recordsTotal") or data.get("total_count") or
            data.get("total") or data.get("totalNotFiltered") or 0
        )
@@ -195,14 +193,77 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
                seen_ids.add(bid)
                all_books.append(b)
                new_in_page += 1
-        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), total, new_in_page)
-        # Stop when: empty page, no new books (start is being ignored), or we've seen everything
-        if not rows or new_in_page == 0 or len(all_books) >= total:
+        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)
+        if not rows or new_in_page == 0 or len(all_books) >= reported_total:
            break
        start += len(rows)
+
+    # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
+    if reported_total > 0 and len(all_books) < reported_total // 2:
+        log.warning(
+            "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
+            len(all_books), reported_total,
+        )
+        return _fetch_all_books_opds(cfg)
    return all_books


+def _opds_local_name(tag: str) -> str:
+    """Return an ElementTree tag with any leading ``{namespace}`` part removed."""
+    return tag.rsplit("}", 1)[-1]
+
+
+def _parse_opds_entry(entry) -> "dict | None":
+    """Turn one OPDS ``<entry>`` element into a ``{"id", "title", "authors"}`` dict.
+
+    The book id is taken from the first child ``<link>`` whose href contains
+    ``/download/<digits>/``. Returns None when the entry has no such link.
+    """
+    title = ""
+    author_parts: list[str] = []
+    book_id: int | None = None
+    for child in entry:
+        ctag = _opds_local_name(child.tag)
+        if ctag == "title":
+            title = child.text or ""
+        elif ctag == "author":
+            author_parts.extend(
+                gc.text or "" for gc in child if _opds_local_name(gc.tag) == "name"
+            )
+        elif ctag == "link" and book_id is None:
+            # Only the first download link decides the id; later ones are ignored.
+            m = re.search(r"/download/(\d+)/", child.get("href", ""))
+            if m:
+                book_id = int(m.group(1))
+    if book_id is None:
+        return None
+    return {"id": book_id, "title": title, "authors": " & ".join(author_parts)}
+
+
+def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
+    """Fetch all books via OPDS catalog, following next-page links.
+
+    Starts at ``{cfg.url}/opds/new`` and authenticates every request with
+    ``(cfg.user, cfg.password)``. Paging stops on a non-OK HTTP status, an XML
+    parse error, a page without entries, a missing next link, or a next link
+    that leads back to a page that was already fetched. Whatever was collected
+    up to that point is returned.
+
+    Returns:
+        One dict per distinct book id with the keys ``id``, ``title`` and
+        ``authors`` (author names joined with " & "). Entries without a
+        ``/download/<id>/`` link are skipped.
+
+    Exceptions raised by ``requests`` itself (for example on timeout) are not
+    caught here and propagate to the caller.
+    """
+    import xml.etree.ElementTree as ET
+    from urllib.parse import urljoin
+
+    books: list[dict] = []
+    seen_ids: set = set()
+    visited_urls: set = set()
+    url: str | None = f"{cfg.url}/opds/new"
+    auth = (cfg.user, cfg.password)
+
+    # The context manager closes the session's pooled connections on every exit path.
+    with requests.Session() as session:
+        while url:
+            # A feed whose next link points at an earlier page would otherwise
+            # loop forever, because each such page still contains entries.
+            if url in visited_urls:
+                log.warning("OPDS next link loops back to %s; stopping", url)
+                break
+            visited_urls.add(url)
+
+            resp = session.get(url, auth=auth, timeout=30)
+            if not resp.ok:
+                log.warning("OPDS fetch failed HTTP %s — %s", resp.status_code, url)
+                break
+            try:
+                root = ET.fromstring(resp.content)
+            except ET.ParseError as exc:
+                log.warning("OPDS XML parse error: %s", exc)
+                break
+
+            next_url: str | None = None
+            entries_this_page = 0
+            for elem in root:
+                local = _opds_local_name(elem.tag)
+                if local == "link" and elem.get("rel") == "next":
+                    href = elem.get("href", "")
+                    if not href:
+                        # An empty href carries no target; treat it as "no next page".
+                        continue
+                    if href.startswith("http"):
+                        next_url = href
+                    elif href.startswith("/"):
+                        # Root-relative links are appended to the configured base URL.
+                        next_url = f"{cfg.url}{href}"
+                    else:
+                        # Other relative links are resolved against the current page.
+                        next_url = urljoin(url, href)
+                elif local == "entry":
+                    entries_this_page += 1
+                    book = _parse_opds_entry(elem)
+                    if book is not None and book["id"] not in seen_ids:
+                        seen_ids.add(book["id"])
+                        books.append(book)
+
+            log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page)
+            if not entries_this_page:
+                break
+            url = next_url
+
+    return books
+
+
 def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
    """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead."""
    if client is None: