check for doubles

2026-05-12 12:54:43 +02:00
parent 3cf3e07059
commit 3ee5b0c24d
1 changed files with 69 additions and 8 deletions
@@ -158,21 +158,20 @@ class CalibreClient:
 def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
-    """Fetch every book from Calibre-Web via /ajax/listbooks. Returns raw row dicts."""
+    """Fetch every book from Calibre-Web. Tries /ajax/listbooks first; falls back to OPDS if pagination is broken."""
    client = CalibreClient(cfg)
    client._ensure_auth()
    all_books: list[dict] = []
    seen_ids: set = set()
    page_size = 1000
    start = 0
    reported_total = 0
    while True:
        resp = client._session.get(
            f"{cfg.url}/ajax/listbooks",
            params={
                "draw": 1,
                # DataTables 1.10+ names
                "start": start, "length": page_size,
                # DataTables 1.9.x names (older Calibre-Web)
                "iDisplayStart": start, "iDisplayLength": page_size,
            },
            timeout=60,
@@ -182,9 +181,8 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
        if start == 0:
            non_list = {k: v for k, v in data.items() if not isinstance(v, list)}
            log.info("listbooks page-0 meta fields: %s", non_list)
        # Calibre-Web uses DataTables format: "data"/"recordsTotal", older versions use "rows"/"total_count"
        rows = data.get("rows") or data.get("data") or []
-        total = (
+        reported_total = (
            data.get("recordsTotal") or data.get("total_count") or
            data.get("total") or data.get("totalNotFiltered") or 0
        )
@@ -195,14 +193,77 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
                seen_ids.add(bid)
                all_books.append(b)
                new_in_page += 1
-        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), total, new_in_page)
+        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)
-        # Stop when: empty page, no new books (start is being ignored), or we've seen everything
+        if not rows or new_in_page == 0 or len(all_books) >= reported_total:
        if not rows or new_in_page == 0 or len(all_books) >= total:
            break
        start += len(rows)
    # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
    if reported_total > 0 and len(all_books) < reported_total // 2:
        log.warning(
            "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
            len(all_books), reported_total,
        )
        return _fetch_all_books_opds(cfg)
    return all_books
 def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
    """Fetch all books via the OPDS catalog, following next-page links.

    Starts at ``{cfg.url}/opds/new`` and walks the feed's ``rel="next"`` links,
    sending ``cfg.user`` / ``cfg.password`` as HTTP basic auth on every request.

    Returns:
        One dict per unique book with the keys ``id`` (int, taken from the
        first ``/download/<id>/`` link found in the entry), ``title`` and
        ``authors`` (author names joined with " & "). Entries without such a
        download link are skipped.

    Best effort: a non-OK HTTP status or unparseable XML is logged and ends the
    walk; the books collected up to that point are still returned.
    """
    import xml.etree.ElementTree as ET

    def local_name(tag: str) -> str:
        # ElementTree prefixes qualified tags with "{namespace}"; keep the bare name.
        return tag.rsplit("}", 1)[-1]

    def parse_entry(entry) -> dict | None:
        # Turn one <entry> element into a book dict, or None when it has no book id.
        title = ""
        author_parts: list[str] = []
        book_id: int | None = None
        for child in entry:
            ctag = local_name(child.tag)
            if ctag == "title":
                title = child.text or ""
            elif ctag == "author":
                author_parts.extend(
                    gc.text or "" for gc in child if local_name(gc.tag) == "name"
                )
            elif ctag == "link" and book_id is None:
                # Only the first download link determines the id.
                m = re.search(r"/download/(\d+)/", child.get("href", ""))
                if m:
                    book_id = int(m.group(1))
        if not book_id:
            return None
        return {"id": book_id, "title": title, "authors": " & ".join(author_parts)}

    books: list[dict] = []
    seen_ids: set[int] = set()
    visited_urls: set[str] = set()
    url: str | None = f"{cfg.url}/opds/new"
    auth = (cfg.user, cfg.password)
    # Context manager so pooled connections are released on every exit path.
    with requests.Session() as session:
        while url:
            # A feed whose "next" link points back to a page we already fetched
            # would otherwise keep this loop running forever.
            if url in visited_urls:
                log.warning("OPDS next link loops back to %s — stopping", url)
                break
            visited_urls.add(url)
            resp = session.get(url, auth=auth, timeout=30)
            if not resp.ok:
                log.warning("OPDS fetch failed HTTP %s — %s", resp.status_code, url)
                break
            try:
                root = ET.fromstring(resp.content)
            except ET.ParseError as exc:
                log.warning("OPDS XML parse error: %s", exc)
                break
            next_url: str | None = None
            entries_this_page = 0
            for elem in root:
                local = local_name(elem.tag)
                if local == "link" and elem.get("rel") == "next":
                    href = elem.get("href", "")
                    # An empty href carries no target; treat it as "no next page"
                    # instead of requesting the bare base URL.
                    if href:
                        next_url = href if href.startswith("http") else f"{cfg.url}{href}"
                elif local == "entry":
                    entries_this_page += 1
                    book = parse_entry(elem)
                    if book is not None and book["id"] not in seen_ids:
                        seen_ids.add(book["id"])
                        books.append(book)
            log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page)
            if not entries_this_page:
                break
            url = next_url
    return books
 def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
    """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead."""
    if client is None: