From ee7d4f0167a5a3ec9233cf8d78fd3a8088e268a4 Mon Sep 17 00:00:00 2001 From: Paul Iyobo <34284755+pauliyobo@users.noreply.github.com> Date: Sun, 15 Dec 2024 03:55:31 +0100 Subject: [PATCH] fix(epub, word): Update the documents cache whenever the file is modified (#289) * Added failing test * Implemented modification detection --- bookworm/document/cache_utils.py | 23 +++++++++++++ bookworm/document/formats/epub.py | 7 ++-- bookworm/document/formats/word.py | 5 ++- tests/test_epub.py | 55 +++++++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 bookworm/document/cache_utils.py diff --git a/bookworm/document/cache_utils.py b/bookworm/document/cache_utils.py new file mode 100644 index 00000000..81cfec3a --- /dev/null +++ b/bookworm/document/cache_utils.py @@ -0,0 +1,23 @@ +"""Caching utilities""" +from pathlib import Path + +from diskcache import Cache + + +def is_document_modified(key: str, path: Path, cache: Cache) -> bool: + """ + Checks whether a particular document was modified + We can currently afford to naively just stat() the file_path in order to determine it + """ + mtime = cache.get(f"{key}_meta") + if not mtime: + # No information for the book was found, so return True just to be safe + # TODO: Is this acceptable? + return True + stat_mtime = path.stat().st_mtime + return mtime != stat_mtime + +def set_document_modified_time(key: str, path: Path, cache: Cache) -> bool: + key = f"{key}_meta" + cache.set(key, path.stat().st_mtime) + \ No newline at end of file diff --git a/bookworm/document/formats/epub.py b/bookworm/document/formats/epub.py index e6a99dac..3a55f6f5 100644 --- a/bookworm/document/formats/epub.py +++ b/bookworm/document/formats/epub.py @@ -21,6 +21,7 @@ from lxml import html as lxml_html from selectolax.parser import HTMLParser +from bookworm.document import cache_utils from bookworm.i18n import LocaleInfo from bookworm.image_io import ImageIO from bookworm.logger import logger @@ -190,7 +191,7 @@ def epub_html_items(self) -> tuple[str]: # As reported in issue 243 # We will now sort the items obtained earlier based on the position that the chapter itself occupies in the TOC spine = [x[0].split('/')[-1] for x in self.epub.spine] - log.info(spine) + log.debug(spine) try: items = sorted(items, key=lambda x: spine.index(x.id)) except ValueError: @@ -310,7 +311,8 @@ def html_content(self): self._get_cache_directory(), eviction_policy="least-frequently-used" ) cache_key = self.uri.to_uri_string() - if cached_html_content := cache.get(cache_key): + document_path = self.get_file_system_path() + if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, document_path, cache): return cached_html_content.decode("utf-8") html_content_gen = ( (item.file_name, item.content) for item in self.epub_html_items @@ -323,6 +325,7 @@ def html_content(self): title=self.epub.title, body_content=buf.getvalue() ) cache.set(cache_key, html_content.encode("utf-8")) + cache_utils.set_document_modified_time(cache_key, document_path, cache) return html_content def prefix_html_ids(self, filename, html): diff --git a/bookworm/document/formats/word.py b/bookworm/document/formats/word.py index 94df1093..af0bb3f2 100644 --- a/bookworm/document/formats/word.py +++ b/bookworm/document/formats/word.py @@ -19,6 +19,7 @@ from bookworm import app from bookworm.concurrency import process_worker, threaded_worker +from bookworm.document import cache_utils from bookworm.document.uri import DocumentUri from bookworm.logger import logger from bookworm.paths import app_path, home_data_path @@ -85,17 +86,19 @@ def try_decrypt(self, data_buf, decryption_key): def _get_html_content_from_docx(self, data_buf, is_encrypted_document): data_buf.seek(0) + doc_path = self.get_file_system_path cache = Cache( self._get_cache_directory(), eviction_policy="least-frequently-used" ) cache_key = self.uri.to_uri_string() - if cached_html_content := cache.get(cache_key): + if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, doc_path, cache): return cached_html_content.decode("utf-8") result = mammoth.convert_to_html(data_buf, include_embedded_style_map=False) data_buf.seek(0) html_content = self.make_proper_html(result.value, data_buf) if not is_encrypted_document: cache.set(cache_key, html_content.encode("utf-8")) + cache_utils.set_document_modified_time(cache_key, doc_path, cache) return html_content def make_proper_html(self, html_string, data_buf): diff --git a/tests/test_epub.py b/tests/test_epub.py index 442f9607..a298692f 100644 --- a/tests/test_epub.py +++ b/tests/test_epub.py @@ -1,14 +1,57 @@ +from pathlib import Path + +from ebooklib import epub import pytest from bookworm.document.uri import DocumentUri from bookworm.document.formats.epub import EpubDocument +def temp_book(title: str = 'Sample book') -> epub.EpubBook: + book = epub.EpubBook() + book.set_title("test book") + book.set_language('en') + c1 = epub.EpubHtml(title="Intro", file_name="chap_01.xhtml", lang="en") + c1.content = ( + "

This is a test

" + ) + book.add_item(c1) + book.toc = ( + epub.Link("chap_01.xhtml", "Introduction", "intro"), + (epub.Section("Simple book"), (c1,)), + ) + + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + book.spine = ["nav", c1] + return book + def test_chapter_order_is_unchanged_with_roman_numbers(asset): - epub = EpubDocument(DocumentUri.from_filename(asset('roman.epub'))) - epub.read() - spine = [x[0] for x in epub.epub.spine] - items = [x.file_name.split('/')[-1] for x in epub.epub_html_items] - print(items) - print(spine) + doc = EpubDocument(DocumentUri.from_filename(asset('roman.epub'))) + doc.read() + spine = [x[0] for x in doc.epub.spine] + items = [x.file_name.split('/')[-1] for x in doc.epub_html_items] assert spine == items + +def test_modified_epub_modifies_cache(asset): + book = temp_book() + epub.write_epub(asset("test.epub"), book, {}) + doc = EpubDocument(DocumentUri.from_filename(asset('test.epub'))) + doc.read() + content = doc.html_content + + # Let's now add a second chapter, and see whether the document read modifies its cache + c2 = epub.EpubHtml(title="Second chapter", file_name="chap_02.xhtml", lang="en") + c2.content = ( + "

This is another test

" + ) + book.add_item(c2) + book.spine.append(c2) + epub.write_epub(asset("test.epub"), book, {}) + + # read the book once more, and verify that the content is different + doc = EpubDocument(DocumentUri.from_filename(asset('test.epub'))) + doc.read() + new_content = doc.html_content + Path(asset("test.epub")).unlink() + assert content != new_content