From 3ee5b0c24d789c473933141e55126b66608480ae Mon Sep 17 00:00:00 2001
From: grymphen
Date: Tue, 12 May 2026 12:54:43 +0200
Subject: [PATCH] check for doubles

---
 uploader.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 4 deletions(-)

diff --git a/uploader.py b/uploader.py
index 2583611..6efdaf5 100644
--- a/uploader.py
+++ b/uploader.py
@@ -158,13 +158,21 @@ class CalibreClient:
 
 
 def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
-    """Fetch every book from Calibre-Web via /ajax/listbooks. Returns raw row dicts."""
+    """Fetch every book from Calibre-Web.
+
+    Pages through /ajax/listbooks first. If that yields fewer than half of the total
+    the server reports, pagination is considered broken and the OPDS catalog is
+    walked instead; the OPDS result is used unless it is smaller than the listbooks one.
+
+    Returns raw listbooks row dicts, or {"id", "title", "authors"} dicts from OPDS.
+    """
     client = CalibreClient(cfg)
     client._ensure_auth()
     all_books: list[dict] = []
     seen_ids: set = set()
     page_size = 1000
     start = 0
+    reported_total = 0
     while True:
         resp = client._session.get(
             f"{cfg.url}/ajax/listbooks",
@@ -184,7 +192,7 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
             log.info("listbooks page-0 meta fields: %s", non_list)
         # Calibre-Web uses DataTables format: "data"/"recordsTotal", older versions use "rows"/"total_count"
         rows = data.get("rows") or data.get("data") or []
-        total = (
+        reported_total = (
             data.get("recordsTotal") or data.get("total_count")
             or data.get("total") or data.get("totalNotFiltered") or 0
         )
@@ -195,14 +203,115 @@ def fetch_all_books(cfg: CalibreConfig) -> list[dict]:
                 seen_ids.add(bid)
                 all_books.append(b)
                 new_in_page += 1
-        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), total, new_in_page)
+        log.info("Books fetched: %d / %d (page gave %d new)", len(all_books), reported_total, new_in_page)
         # Stop when: empty page, no new books (start is being ignored), or we've seen everything
-        if not rows or new_in_page == 0 or len(all_books) >= total:
+        # (a missing/zero total means "unknown", so it must not end the loop after one page)
+        if not rows or new_in_page == 0 or 0 < reported_total <= len(all_books):
             break
         start += len(rows)
+
+    # If we got far fewer books than reported, listbooks pagination is broken — use OPDS instead
+    if reported_total > 0 and len(all_books) < reported_total // 2:
+        log.warning(
+            "listbooks pagination broken (%d/%d books retrieved). Falling back to OPDS.",
+            len(all_books), reported_total,
+        )
+        opds_books = _fetch_all_books_opds(cfg)
+        # A failed or truncated OPDS walk must not discard what listbooks already returned.
+        if len(opds_books) >= len(all_books):
+            return opds_books
+        log.warning(
+            "OPDS fallback returned fewer books (%d) than listbooks (%d). Keeping listbooks result.",
+            len(opds_books), len(all_books),
+        )
     return all_books
 
 
+def _opds_local_name(tag: str) -> str:
+    """Return an XML tag name without the "{namespace}" prefix ElementTree adds."""
+    return tag.rsplit("}", 1)[-1]
+
+
+def _parse_opds_entry(entry) -> "dict | None":
+    """Convert one OPDS <entry> element into an {"id", "title", "authors"} dict.
+
+    Returns None when the entry has no /download/<id>/ link to take the book ID from.
+    """
+    title = ""
+    author_parts: list[str] = []
+    book_id: int | None = None
+    for child in entry:
+        ctag = _opds_local_name(child.tag)
+        if ctag == "title":
+            title = child.text or ""
+        elif ctag == "author":
+            author_parts.extend(
+                gc.text or "" for gc in child if _opds_local_name(gc.tag) == "name"
+            )
+        elif ctag == "link" and book_id is None:
+            # Only the first download link decides the ID.
+            m = re.search(r"/download/(\d+)/", child.get("href", ""))
+            if m:
+                book_id = int(m.group(1))
+    if book_id is None:
+        return None
+    return {"id": book_id, "title": title, "authors": " & ".join(author_parts)}
+
+
+def _fetch_all_books_opds(cfg: CalibreConfig) -> list[dict]:
+    """Fetch all books via the OPDS catalog, following rel="next" page links.
+
+    Starts at {cfg.url}/opds/new with HTTP basic auth and returns one dict per unique
+    book ID. Stops on an HTTP error, unparsable XML, a page without entries, or a
+    "next" link that points at a page already fetched; books found so far are returned.
+    """
+    import xml.etree.ElementTree as ET
+    from urllib.parse import urljoin
+
+    books: list[dict] = []
+    seen_ids: set = set()
+    visited_urls: set = set()
+    url: str | None = f"{cfg.url}/opds/new"
+    auth = (cfg.user, cfg.password)
+
+    # The context manager closes the session's pooled connections when paging ends.
+    with requests.Session() as session:
+        while url and url not in visited_urls:
+            # Track fetched pages so a feed whose "next" link loops cannot spin forever.
+            visited_urls.add(url)
+            resp = session.get(url, auth=auth, timeout=30)
+            if not resp.ok:
+                log.warning("OPDS fetch failed HTTP %s — %s", resp.status_code, url)
+                break
+            try:
+                # NOTE: stdlib ElementTree is not hardened against malicious XML; use a trusted server only.
+                root = ET.fromstring(resp.content)
+            except ET.ParseError as exc:
+                log.warning("OPDS XML parse error: %s", exc)
+                break
+
+            next_url: str | None = None
+            entries_this_page = 0
+            for elem in root:
+                local = _opds_local_name(elem.tag)
+                if local == "link" and elem.get("rel") == "next":
+                    # Resolve against the page URL: handles absolute, root-relative and relative hrefs.
+                    next_url = urljoin(resp.url, elem.get("href", ""))
+                elif local == "entry":
+                    entries_this_page += 1
+                    row = _parse_opds_entry(elem)
+                    if row is not None and row["id"] not in seen_ids:
+                        seen_ids.add(row["id"])
+                        books.append(row)
+
+            log.info("OPDS fetched: %d books total (page had %d entries)", len(books), entries_this_page)
+            if not entries_this_page:
+                break
+            url = next_url
+
+    return books
+
+
 def delete_book(cfg: CalibreConfig, book_id: int, client: "CalibreClient | None" = None) -> tuple[bool, str]:
     """Delete a book from Calibre-Web by ID. Pass a pre-authenticated client to avoid re-auth overhead."""
     if client is None: