calibre upload errors
This commit is contained in:
@@ -14,8 +14,7 @@ import sftp as sftp_module
|
|||||||
import sync
|
import sync
|
||||||
import uploader
|
import uploader
|
||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s — %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s — %(message)s")
|
||||||
logging.getLogger("paramiko").setLevel(logging.INFO) # paramiko debug is too noisy
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
_scheduler = BackgroundScheduler(timezone="UTC")
|
_scheduler = BackgroundScheduler(timezone="UTC")
|
||||||
|
|||||||
+79
-14
@@ -2,6 +2,7 @@ import hashlib
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@@ -15,6 +16,13 @@ MIME_TYPES = {
|
|||||||
".pdf": "application/pdf",
|
".pdf": "application/pdf",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Tokens ignored when comparing titles. They show up in release-style
# filenames and in ordinary titles but say nothing about which book it is.
# A frozenset is used because this is a shared module-level constant that the
# helpers below only ever use for membership tests.
_JUNK_WORDS = frozenset({
    # release / file-format tags
    "retail", "epub", "ebook", "pdf", "mobi", "azw3", "decipher",
    # language names
    "swedish", "english", "danish", "norwegian", "finnish", "german", "french",
    # articles and conjunctions
    "the", "a", "an", "och", "und", "les", "der", "die", "das",
})
|
||||||
|
|
||||||
|
|
||||||
class CalibreClient:
|
class CalibreClient:
|
||||||
def __init__(self, cfg: CalibreConfig):
|
def __init__(self, cfg: CalibreConfig):
|
||||||
@@ -26,7 +34,6 @@ class CalibreClient:
|
|||||||
def _ensure_auth(self) -> None:
|
def _ensure_auth(self) -> None:
|
||||||
if self._authenticated:
|
if self._authenticated:
|
||||||
return
|
return
|
||||||
# Fetch login page first to get the CSRF token (Flask-WTF requirement)
|
|
||||||
login_url = f"{self._cfg.url}/login"
|
login_url = f"{self._cfg.url}/login"
|
||||||
page = self._session.get(login_url, timeout=30)
|
page = self._session.get(login_url, timeout=30)
|
||||||
page.raise_for_status()
|
page.raise_for_status()
|
||||||
@@ -36,36 +43,63 @@ class CalibreClient:
|
|||||||
if csrf:
|
if csrf:
|
||||||
data["csrf_token"] = csrf
|
data["csrf_token"] = csrf
|
||||||
|
|
||||||
resp = self._session.post(
|
resp = self._session.post(login_url, data=data, allow_redirects=True, timeout=30)
|
||||||
login_url,
|
|
||||||
data=data,
|
|
||||||
allow_redirects=True,
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
# Calibre-Web redirects to / on success; landing back on /login means bad creds
|
|
||||||
if resp.url.rstrip("/").endswith("/login"):
|
if resp.url.rstrip("/").endswith("/login"):
|
||||||
raise RuntimeError("Calibre-Web authentication failed — check credentials")
|
raise RuntimeError("Calibre-Web authentication failed — check credentials")
|
||||||
self._authenticated = True
|
self._authenticated = True
|
||||||
# The CSRF token is session-scoped in Flask-WTF — reuse the login token for uploads.
|
|
||||||
# Also try to extract a fresh one from the landing page (/).
|
|
||||||
self._upload_csrf = _extract_csrf(resp.text) or csrf
|
self._upload_csrf = _extract_csrf(resp.text) or csrf
|
||||||
log.debug("Upload CSRF token from login: %s",
|
|
||||||
self._upload_csrf[:12] + "…" if self._upload_csrf else "NOT FOUND")
|
|
||||||
log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
|
log.info("Authenticated to Calibre-Web at %s", self._cfg.url)
|
||||||
|
|
||||||
|
def _exists_in_calibre(self, filename: str) -> bool:
    """Search Calibre-Web OPDS for a title matching this filename.

    This is a best-effort duplicate guard: any failure while searching is
    logged as a warning and reported as "not found", so the caller proceeds
    with the upload.

    Args:
        filename: Name of the book file; keywords are derived from it.

    Returns:
        True if a title in the search results shares at least three words,
        or at least 60% of the filename's keywords, with the filename.
        False otherwise, including when fewer than two keywords can be
        extracted or the search fails.
    """
    keywords = _keywords_from_filename(filename)
    # A single keyword is too weak a signal to call something a duplicate.
    if len(keywords) < 2:
        return False

    # Only the first six keywords go into the query; the comparison below
    # still uses every keyword.
    query = " ".join(keywords[:6])
    try:
        resp = self._session.get(
            f"{self._cfg.url}/opds/search/{quote(query, safe='')}",
            timeout=15,
        )
        if resp.status_code == 404:
            return False
        # Any other HTTP error (401, 500, ...) must not be parsed as if it
        # were a feed; raise so it is reported by the warning below.
        resp.raise_for_status()

        our_words = set(keywords)  # non-empty: at least two keywords here
        for title in _parse_opds_titles(resp.text):
            overlap = len(our_words & set(_normalize_words(title)))
            # Match if 3+ words in common, or 60%+ of our keywords match
            if overlap >= 3 or overlap / len(our_words) >= 0.6:
                log.info("Duplicate found in Calibre-Web: '%s' ~ '%s'", filename, title)
                return True
    except Exception as e:
        # Deliberately broad: a failed lookup must never block an upload.
        log.warning("OPDS search failed for '%s': %s — proceeding with upload", filename, e)
    return False
|
||||||
|
|
||||||
def upload(self, book_path: Path, zip_source: str) -> str:
|
def upload(self, book_path: Path, zip_source: str) -> str:
|
||||||
"""Upload a book file. Returns status: 'uploaded' | 'skipped_duplicate' | 'error'."""
|
"""Upload a book file. Returns status: 'uploaded' | 'skipped_duplicate' | 'error'."""
|
||||||
file_hash = _sha256(book_path)
|
file_hash = _sha256(book_path)
|
||||||
|
|
||||||
|
# Primary guard: hash already in our DB
|
||||||
if db.is_book_uploaded(file_hash):
|
if db.is_book_uploaded(file_hash):
|
||||||
log.info("Skipping duplicate: %s (hash %s)", book_path.name, file_hash[:8])
|
log.info("Skipping (already uploaded): %s", book_path.name)
|
||||||
db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
|
db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
|
||||||
return "skipped_duplicate"
|
return "skipped_duplicate"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._ensure_auth()
|
self._ensure_auth()
|
||||||
|
|
||||||
|
# Secondary guard: title search in Calibre-Web (catches pre-existing books)
|
||||||
|
if self._exists_in_calibre(book_path.name):
|
||||||
|
log.info("Skipping (exists in Calibre-Web): %s", book_path.name)
|
||||||
|
db.record_book(book_path.name, file_hash, zip_source, "skipped_duplicate")
|
||||||
|
return "skipped_duplicate"
|
||||||
|
|
||||||
mime = MIME_TYPES.get(book_path.suffix.lower(), "application/octet-stream")
|
mime = MIME_TYPES.get(book_path.suffix.lower(), "application/octet-stream")
|
||||||
with book_path.open("rb") as fh:
|
with book_path.open("rb") as fh:
|
||||||
resp = self._session.post(
|
resp = self._session.post(
|
||||||
@@ -75,7 +109,7 @@ class CalibreClient:
|
|||||||
timeout=120,
|
timeout=120,
|
||||||
)
|
)
|
||||||
if not resp.ok:
|
if not resp.ok:
|
||||||
log.error("Upload HTTP %s — response body: %s", resp.status_code, resp.text[:500])
|
log.error("Upload HTTP %s — body: %s", resp.status_code, resp.text[:300])
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
log.info("Uploaded: %s", book_path.name)
|
log.info("Uploaded: %s", book_path.name)
|
||||||
db.record_book(book_path.name, file_hash, zip_source, "uploaded")
|
db.record_book(book_path.name, file_hash, zip_source, "uploaded")
|
||||||
@@ -98,6 +132,37 @@ def test_connection(cfg: CalibreConfig) -> tuple[bool, str]:
|
|||||||
return False, str(e)
|
return False, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helpers ---
|
||||||
|
|
||||||
|
def _keywords_from_filename(filename: str) -> list[str]:
    """Extract meaningful words from a release-style filename for OPDS search.

    The file extension is dropped, the name is lowercased, ``.``, ``_`` and
    ``-`` become word breaks, and remaining punctuation is removed.

    Args:
        filename: File name (or path) of the book.

    Returns:
        The remaining words in their original order, excluding junk words,
        purely numeric tokens (years, volume numbers) and single characters.
    """
    stem = Path(filename).stem.lower()
    stem = re.sub(r"[._\-]", " ", stem)
    stem = re.sub(r"[^\w\s]", "", stem)
    return [
        word
        for word in stem.split()
        if len(word) > 1
        # One check covers years and every other all-digit token.
        and not word.isdecimal()
        and word not in _JUNK_WORDS
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_words(title: str) -> list[str]:
    """Normalize a Calibre-Web title for comparison.

    Args:
        title: Book title as returned by Calibre-Web.

    Returns:
        Lowercased words of the title with punctuation removed, excluding
        junk words and single characters.
    """
    text = title.lower()
    # Use the same word breaks as _keywords_from_filename. Otherwise a title
    # like "Half-Blood" collapses to "halfblood" and never overlaps with the
    # filename's "half" + "blood".
    text = re.sub(r"[._\-]", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    return [word for word in text.split() if len(word) > 1 and word not in _JUNK_WORDS]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_opds_titles(xml: str) -> list[str]:
    """Extract book titles from an OPDS Atom feed, skipping the feed title itself.

    Args:
        xml: Raw text of the OPDS response.

    Returns:
        The text of every ``<title>`` element after the first one, with
        character entities (``&amp;``, ``&#39;``, ...) decoded. Empty when
        the feed contains no title beyond its own.
    """
    # Function-scope import: only this helper needs it.
    from html import unescape

    # Tolerate attributes on the tag (e.g. <title type="text">), which a bare
    # "<title>" pattern would silently skip.
    titles = re.findall(r"<title\b[^>]*>([^<]+)</title>", xml)
    # The first <title> is the feed title ("Search results"); the rest are books.
    return [unescape(raw) for raw in titles[1:]]
|
||||||
|
|
||||||
|
|
||||||
def _extract_csrf(html: str) -> str | None:
|
def _extract_csrf(html: str) -> str | None:
|
||||||
m = re.search(r'name="csrf_token"\s+value="([^"]+)"', html)
|
m = re.search(r'name="csrf_token"\s+value="([^"]+)"', html)
|
||||||
if not m:
|
if not m:
|
||||||
|
|||||||
Reference in New Issue
Block a user