diff --git a/README.md b/README.md index 5d868d7..71a7385 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,10 @@ This project was 90% vibe coded just to illustrate how one can very easily [read ## Usage -The project uses [uv](https://docs.astral.sh/uv/). So for example, download [Dracula EPUB3](https://www.gutenberg.org/ebooks/345) to this directory as `dracula.epub`, then: +The project uses [uv](https://docs.astral.sh/uv/). So for example, download [Dracula EPUB3](https://www.gutenberg.org/ebooks/345) to this directory as `books/dracula.epub`, then: ```bash -uv run reader3.py dracula.epub +uv run reader3.py ``` This creates the directory `dracula_data`, which registers the book to your local library. We can then run the server: diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/books/.gitkeep b/books/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/delete_data_folders.py b/delete_data_folders.py new file mode 100644 index 0000000..a51cbbb --- /dev/null +++ b/delete_data_folders.py @@ -0,0 +1,17 @@ +import os +import shutil + +BOOKS_DIR = "books" + +def delete_data_folders(): + count = 0 + for item in os.listdir(BOOKS_DIR): + item_path = os.path.join(BOOKS_DIR, item) + if item.endswith("_data") and os.path.isdir(item_path): + print(f"Deleting {item_path} ...") + shutil.rmtree(item_path) + count += 1 + print(f"Deleted {count} data folders.") + +if __name__ == "__main__": + delete_data_folders() diff --git a/reader3.py b/reader3.py index d0b9d3f..c9eb984 100644 --- a/reader3.py +++ b/reader3.py @@ -6,7 +6,7 @@ import pickle import shutil from dataclasses import dataclass, field -from typing import List, Dict, Optional, Any +from typing import List, Dict, Optional from datetime import datetime from urllib.parse import unquote @@ -16,33 +16,37 @@ # --- Data structures --- + @dataclass class ChapterContent: """ Represents a physical file in the EPUB (Spine Item). A single file might contain multiple logical chapters (TOC entries). """ - id: str # Internal ID (e.g., 'item_1') - href: str # Filename (e.g., 'part01.html') - title: str # Best guess title from file - content: str # Cleaned HTML with rewritten image paths - text: str # Plain text for search/LLM context - order: int # Linear reading order + + id: str # Internal ID (e.g., 'item_1') + href: str # Filename (e.g., 'part01.html') + title: str # Best guess title from file + content: str # Cleaned HTML with rewritten image paths + text: str # Plain text for search/LLM context + order: int # Linear reading order @dataclass class TOCEntry: """Represents a logical entry in the navigation sidebar.""" + title: str - href: str # original href (e.g., 'part01.html#chapter1') - file_href: str # just the filename (e.g., 'part01.html') - anchor: str # just the anchor (e.g., 'chapter1'), empty if none - children: List['TOCEntry'] = field(default_factory=list) + href: str # original href (e.g., 'part01.html#chapter1') + file_href: str # just the filename (e.g., 'part01.html') + anchor: str # just the anchor (e.g., 'chapter1'), empty if none + children: List["TOCEntry"] = field(default_factory=list) @dataclass class BookMetadata: """Metadata""" + title: str language: str authors: List[str] = field(default_factory=list) @@ -51,15 +55,17 @@ class BookMetadata: date: Optional[str] = None identifiers: List[str] = field(default_factory=list) subjects: List[str] = field(default_factory=list) + cover: Optional[str] = None # Filename of cover image @dataclass class Book: """The Master Object to be pickled.""" + metadata: BookMetadata spine: List[ChapterContent] # The actual content (linear files) - toc: List[TOCEntry] # The navigation tree - images: Dict[str, str] # Map: original_path -> local_path + toc: List[TOCEntry] # The navigation tree + images: Dict[str, str] # Map: original_path -> local_path # Meta info source_file: str @@ -67,12 +73,17 @@ class Book: version: str = "3.0" +# Ensure pickles always point to the importable module name, even when run as a script. +for _cls in (ChapterContent, TOCEntry, BookMetadata, Book): + _cls.__module__ = "reader3" + + # --- Utilities --- -def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup: +def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup: # Remove dangerous/useless tags - for tag in soup(['script', 'style', 'iframe', 'video', 'nav', 'form', 'button']): + for tag in soup(["script", "style", "iframe", "video", "nav", "form", "button"]): tag.decompose() # Remove HTML comments @@ -80,7 +91,7 @@ def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup: comment.extract() # Remove input tags - for tag in soup.find_all('input'): + for tag in soup.find_all("input"): tag.decompose() return soup @@ -88,9 +99,9 @@ def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup: def extract_plain_text(soup: BeautifulSoup) -> str: """Extract clean text for LLM/Search usage.""" - text = soup.get_text(separator=' ') + text = soup.get_text(separator=" ") # Collapse whitespace - return ' '.join(text.split()) + return " ".join(text.split()) def parse_toc_recursive(toc_list, depth=0) -> List[TOCEntry]: @@ -106,28 +117,28 @@ def parse_toc_recursive(toc_list, depth=0) -> List[TOCEntry]: entry = TOCEntry( title=section.title, href=section.href, - file_href=section.href.split('#')[0], - anchor=section.href.split('#')[1] if '#' in section.href else "", - children=parse_toc_recursive(children, depth + 1) + file_href=section.href.split("#")[0], + anchor=section.href.split("#")[1] if "#" in section.href else "", + children=parse_toc_recursive(children, depth + 1), ) result.append(entry) elif isinstance(item, epub.Link): entry = TOCEntry( title=item.title, href=item.href, - file_href=item.href.split('#')[0], - anchor=item.href.split('#')[1] if '#' in item.href else "" + file_href=item.href.split("#")[0], + anchor=item.href.split("#")[1] if "#" in item.href else "", ) result.append(entry) # Note: ebooklib sometimes returns direct Section objects without children elif isinstance(item, epub.Section): - entry = TOCEntry( + entry = TOCEntry( title=item.title, href=item.href, - file_href=item.href.split('#')[0], - anchor=item.href.split('#')[1] if '#' in item.href else "" + file_href=item.href.split("#")[0], + anchor=item.href.split("#")[1] if "#" in item.href else "", ) - result.append(entry) + result.append(entry) return result @@ -141,7 +152,13 @@ def get_fallback_toc(book_obj) -> List[TOCEntry]: if item.get_type() == ebooklib.ITEM_DOCUMENT: name = item.get_name() # Try to guess a title from the content or ID - title = item.get_name().replace('.html', '').replace('.xhtml', '').replace('_', ' ').title() + title = ( + item.get_name() + .replace(".html", "") + .replace(".xhtml", "") + .replace("_", " ") + .title() + ) toc.append(TOCEntry(title=title, href=name, file_href=name, anchor="")) return toc @@ -150,30 +167,32 @@ def extract_metadata_robust(book_obj) -> BookMetadata: """ Extracts metadata handling both single and list values. """ + def get_list(key): - data = book_obj.get_metadata('DC', key) + data = book_obj.get_metadata("DC", key) return [x[0] for x in data] if data else [] def get_one(key): - data = book_obj.get_metadata('DC', key) + data = book_obj.get_metadata("DC", key) return data[0][0] if data else None return BookMetadata( - title=get_one('title') or "Untitled", - language=get_one('language') or "en", - authors=get_list('creator'), - description=get_one('description'), - publisher=get_one('publisher'), - date=get_one('date'), - identifiers=get_list('identifier'), - subjects=get_list('subject') + title=get_one("title") or "Untitled", + language=get_one("language") or "en", + authors=get_list("creator"), + description=get_one("description"), + publisher=get_one("publisher"), + date=get_one("date"), + identifiers=get_list("identifier"), + subjects=get_list("subject"), + cover=None, # Will be set during image processing ) # --- Main Conversion Logic --- -def process_epub(epub_path: str, output_dir: str) -> Book: +def process_epub(epub_path: str, output_dir: str) -> Book: # 1. Load Book print(f"Loading {epub_path}...") book = epub.read_epub(epub_path) @@ -184,23 +203,60 @@ def process_epub(epub_path: str, output_dir: str) -> Book: # 3. Prepare Output Directories if os.path.exists(output_dir): shutil.rmtree(output_dir) - images_dir = os.path.join(output_dir, 'images') + images_dir = os.path.join(output_dir, "images") os.makedirs(images_dir, exist_ok=True) # 4. Extract Images & Build Map print("Extracting images...") - image_map = {} # Key: internal_path, Value: local_relative_path + image_map = {} # Key: internal_path, Value: local_relative_path + cover_filename = None # Track cover image + + # Try to find cover image from metadata + cover_id = None + for meta in book.get_metadata("OPF", "cover"): + if meta and meta[0]: + cover_id = meta[0] + break + + # Also check for item with 'cover' in properties or id, or ISBN pattern + import re + + isbn_pattern = re.compile(r"^978.*\.(jpg|jpeg)$", re.IGNORECASE) + + for item in book.get_items(): + if item.get_type() == ebooklib.ITEM_IMAGE: + item_id = item.get_id() + item_name = os.path.basename(item.get_name()) + item_name_lower = item_name.lower() + + # Priority 1: Cover ID from metadata + if cover_id and item_id == cover_id: + cover_filename = item_name + # Priority 2: Filename is exactly "cover.png" or similar + elif ( + item_name_lower in ("cover.png", "cover.jpg", "cover.jpeg") + and not cover_filename + ): + cover_filename = item_name + # Priority 3: Filename contains "cover" + elif "cover" in item_name_lower and not cover_filename: + cover_filename = item_name + # Priority 4: ISBN pattern (978*.jpg or 978*.jpeg) + elif isbn_pattern.match(item_name) and not cover_filename: + cover_filename = item_name for item in book.get_items(): if item.get_type() == ebooklib.ITEM_IMAGE: # Normalize filename original_fname = os.path.basename(item.get_name()) # Sanitize filename for OS - safe_fname = "".join([c for c in original_fname if c.isalpha() or c.isdigit() or c in '._-']).strip() + safe_fname = "".join( + [c for c in original_fname if c.isalpha() or c.isdigit() or c in "._-"] + ).strip() # Save to disk local_path = os.path.join(images_dir, safe_fname) - with open(local_path, 'wb') as f: + with open(local_path, "wb") as f: f.write(item.get_content()) # Map keys: We try both the full internal path and just the basename @@ -209,6 +265,15 @@ def process_epub(epub_path: str, output_dir: str) -> Book: image_map[item.get_name()] = rel_path image_map[original_fname] = rel_path + # Track sanitized cover filename + if cover_filename and original_fname == cover_filename: + cover_filename = safe_fname + + # Set cover in metadata + if cover_filename: + metadata.cover = cover_filename + print(f"Cover image found: {cover_filename}") + # 5. Process TOC print("Parsing Table of Contents...") toc_structure = parse_toc_recursive(book.toc) @@ -230,13 +295,14 @@ def process_epub(epub_path: str, output_dir: str) -> Book: if item.get_type() == ebooklib.ITEM_DOCUMENT: # Raw content - raw_content = item.get_content().decode('utf-8', errors='ignore') - soup = BeautifulSoup(raw_content, 'html.parser') + raw_content = item.get_content().decode("utf-8", errors="ignore") + soup = BeautifulSoup(raw_content, "html.parser") # A. Fix Images - for img in soup.find_all('img'): - src = img.get('src', '') - if not src: continue + for img in soup.find_all("img"): + src = img.get("src", "") + if not src: + continue # Decode URL (part01/image%201.jpg -> part01/image 1.jpg) src_decoded = unquote(src) @@ -244,15 +310,15 @@ def process_epub(epub_path: str, output_dir: str) -> Book: # Try to find in map if src_decoded in image_map: - img['src'] = image_map[src_decoded] + img["src"] = image_map[src_decoded] elif filename in image_map: - img['src'] = image_map[filename] + img["src"] = image_map[filename] # B. Clean HTML soup = clean_html_content(soup) # C. Extract Body Content only - body = soup.find('body') + body = soup.find("body") if body: # Extract inner HTML of body final_html = "".join([str(x) for x in body.contents]) @@ -262,11 +328,11 @@ def process_epub(epub_path: str, output_dir: str) -> Book: # D. Create Object chapter = ChapterContent( id=item_id, - href=item.get_name(), # Important: This links TOC to Content - title=f"Section {i+1}", # Fallback, real titles come from TOC + href=item.get_name(), # Important: This links TOC to Content + title=f"Section {i + 1}", # Fallback, real titles come from TOC content=final_html, text=extract_plain_text(soup), - order=i + order=i, ) spine_chapters.append(chapter) @@ -277,37 +343,40 @@ def process_epub(epub_path: str, output_dir: str) -> Book: toc=toc_structure, images=image_map, source_file=os.path.basename(epub_path), - processed_at=datetime.now().isoformat() + processed_at=datetime.now().isoformat(), ) return final_book def save_to_pickle(book: Book, output_dir: str): - p_path = os.path.join(output_dir, 'book.pkl') - with open(p_path, 'wb') as f: + p_path = os.path.join(output_dir, "book.pkl") + with open(p_path, "wb") as f: pickle.dump(book, f) print(f"Saved structured data to {p_path}") + # --- CLI --- -if __name__ == "__main__": +def main(): + """Process all EPUB files in the books/ directory and save pickles.""" + for epub_file in [ + os.path.join("books", f) for f in os.listdir("books/") if f.endswith(".epub") + ]: + assert os.path.exists(epub_file), "File not found." + out_dir = os.path.splitext(epub_file)[0] + "_data" - import sys - if len(sys.argv) < 2: - print("Usage: python reader3.py ") - sys.exit(1) - - epub_file = sys.argv[1] - assert os.path.exists(epub_file), "File not found." - out_dir = os.path.splitext(epub_file)[0] + "_data" - - book_obj = process_epub(epub_file, out_dir) - save_to_pickle(book_obj, out_dir) - print("\n--- Summary ---") - print(f"Title: {book_obj.metadata.title}") - print(f"Authors: {', '.join(book_obj.metadata.authors)}") - print(f"Physical Files (Spine): {len(book_obj.spine)}") - print(f"TOC Root Items: {len(book_obj.toc)}") - print(f"Images extracted: {len(book_obj.images)}") + book_obj = process_epub(epub_file, out_dir) + save_to_pickle(book_obj, out_dir) + print("\n--- Summary ---") + print(f"Title: {book_obj.metadata.title}") + print(f"Authors: {', '.join(book_obj.metadata.authors)}") + print(f"Physical Files (Spine): {len(book_obj.spine)}") + print(f"TOC Root Items: {len(book_obj.toc)}") + print(f"Images extracted: {len(book_obj.images)}") + + +if __name__ == "__main__": + # Recommend running as: python -m reader3 + main() diff --git a/server.py b/server.py index 9c870dc..fe7cbff 100644 --- a/server.py +++ b/server.py @@ -5,16 +5,28 @@ from fastapi import FastAPI, Request, HTTPException from fastapi.responses import HTMLResponse, FileResponse -from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates -from reader3 import Book, BookMetadata, ChapterContent, TOCEntry +from reader3 import Book, BookMetadata, ChapterContent, TOCEntry # noqa: F401 - needed for pickle app = FastAPI() templates = Jinja2Templates(directory="templates") # Where are the book folders located? -BOOKS_DIR = "." +BOOKS_DIR = "books" + + +class ReaderUnpickler(pickle.Unpickler): + """ + Allow loading pickle files that were created when reader3.py + was executed as a script (module name becomes '__main__'). + """ + + def find_class(self, module, name): + if module == "__main__": + module = "reader3" + return super().find_class(module, name) + @lru_cache(maxsize=10) def load_book_cached(folder_name: str) -> Optional[Book]: @@ -28,12 +40,13 @@ def load_book_cached(folder_name: str) -> Optional[Book]: try: with open(file_path, "rb") as f: - book = pickle.load(f) + book = ReaderUnpickler(f).load() return book except Exception as e: print(f"Error loading book {folder_name}: {e}") return None + @app.get("/", response_class=HTMLResponse) async def library_view(request: Request): """Lists all available processed books.""" @@ -42,24 +55,32 @@ async def library_view(request: Request): # Scan directory for folders ending in '_data' that have a book.pkl if os.path.exists(BOOKS_DIR): for item in os.listdir(BOOKS_DIR): - if item.endswith("_data") and os.path.isdir(item): + item_path = os.path.join(BOOKS_DIR, item) + if item.endswith("_data") and os.path.isdir(item_path): # Try to load it to get the title book = load_book_cached(item) if book: - books.append({ - "id": item, - "title": book.metadata.title, - "author": ", ".join(book.metadata.authors), - "chapters": len(book.spine) - }) + books.append( + { + "id": item, + "title": book.metadata.title, + "author": ", ".join(book.metadata.authors), + "chapters": len(book.spine), + "cover": book.metadata.cover, + } + ) + + return templates.TemplateResponse( + "library.html", {"request": request, "books": books} + ) - return templates.TemplateResponse("library.html", {"request": request, "books": books}) @app.get("/read/{book_id}", response_class=HTMLResponse) async def redirect_to_first_chapter(book_id: str): """Helper to just go to chapter 0.""" return await read_chapter(book_id=book_id, chapter_index=0) + @app.get("/read/{book_id}/{chapter_index}", response_class=HTMLResponse) async def read_chapter(request: Request, book_id: str, chapter_index: int): """The main reader interface.""" @@ -76,15 +97,19 @@ async def read_chapter(request: Request, book_id: str, chapter_index: int): prev_idx = chapter_index - 1 if chapter_index > 0 else None next_idx = chapter_index + 1 if chapter_index < len(book.spine) - 1 else None - return templates.TemplateResponse("reader.html", { - "request": request, - "book": book, - "current_chapter": current_chapter, - "chapter_index": chapter_index, - "book_id": book_id, - "prev_idx": prev_idx, - "next_idx": next_idx - }) + return templates.TemplateResponse( + "reader.html", + { + "request": request, + "book": book, + "current_chapter": current_chapter, + "chapter_index": chapter_index, + "book_id": book_id, + "prev_idx": prev_idx, + "next_idx": next_idx, + }, + ) + @app.get("/read/{book_id}/images/{image_name}") async def serve_image(book_id: str, image_name: str): @@ -104,7 +129,23 @@ async def serve_image(book_id: str, image_name: str): return FileResponse(img_path) + +@app.get("/cover/{book_id}/{image_name}") +async def serve_cover(book_id: str, image_name: str): + """Serves cover images for the library view.""" + safe_book_id = os.path.basename(book_id) + safe_image_name = os.path.basename(image_name) + + img_path = os.path.join(BOOKS_DIR, safe_book_id, "images", safe_image_name) + + if not os.path.exists(img_path): + raise HTTPException(status_code=404, detail="Cover not found") + + return FileResponse(img_path) + + if __name__ == "__main__": import uvicorn + print("Starting server at http://127.0.0.1:8123") uvicorn.run(app, host="127.0.0.1", port=8123) diff --git a/templates/library.html b/templates/library.html index e7d094d..60584da 100644 --- a/templates/library.html +++ b/templates/library.html @@ -6,12 +6,16 @@ My Library @@ -27,12 +31,19 @@

Library

{% for book in books %}
-
{{ book.title }}
-
- {{ book.author }}
- {{ book.chapters }} sections + {% if book.cover %} + {{ book.title }} + {% else %} +
{{ book.title[0] }}
+ {% endif %} +
+
{{ book.title }}
+
+ {{ book.author }}
+ {{ book.chapters }} sections +
+ Read Book
- Read Book
{% endfor %}
diff --git a/templates/reader.html b/templates/reader.html index c012edc..a1aaf23 100644 --- a/templates/reader.html +++ b/templates/reader.html @@ -36,6 +36,27 @@ .nav-btn:hover { background: #3498db; color: white; } .nav-btn.disabled { opacity: 0.5; pointer-events: none; border-color: #ccc; color: #ccc; } + /* Copy Button */ + .copy-btn { + position: fixed; + top: 20px; + right: 20px; + background: #2ecc71; + color: white; + border: none; + padding: 10px 18px; + border-radius: 6px; + cursor: pointer; + font-family: -apple-system, sans-serif; + font-size: 0.9em; + font-weight: 500; + box-shadow: 0 2px 8px rgba(0,0,0,0.15); + transition: all 0.2s; + z-index: 1000; + } + .copy-btn:hover { background: #27ae60; transform: translateY(-1px); box-shadow: 0 4px 12px rgba(0,0,0,0.2); } + .copy-btn.copied { background: #3498db; } + @@ -96,6 +117,7 @@
+
{{ current_chapter.content | safe }} @@ -122,6 +144,28 @@