fix(epub, word): Update the documents cache whenever the file is modi…

…fied (#289) * Added failing test * Implemented modification detection
blindpandas · Dec 15, 2024 · ee7d4f0 · ee7d4f0
1 parent 8aade8f
commit ee7d4f0
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 9 deletions.
diff --git a/bookworm/document/cache_utils.py b/bookworm/document/cache_utils.py
@@ -0,0 +1,23 @@
+"""Caching utilities"""
+from pathlib import Path
+
+from diskcache import Cache
+
+
+def is_document_modified(key: str, path: Path, cache: Cache) -> bool:
+    """
+    Checks whether a particular document was modified
+    We can currently afford to naively just stat() the file_path in order to determine it
+    """
+    mtime = cache.get(f"{key}_meta")
+    if not mtime:
+        # No information for the book was found, so return True just to be safe
+        # TODO: Is this acceptable?
+        return True
+    stat_mtime = path.stat().st_mtime
+    return mtime != stat_mtime
+
+def set_document_modified_time(key: str, path: Path, cache: Cache) -> bool:
+    key = f"{key}_meta"
+    cache.set(key, path.stat().st_mtime)
+
diff --git a/bookworm/document/formats/epub.py b/bookworm/document/formats/epub.py
@@ -21,6 +21,7 @@
 from lxml import html as lxml_html
 from selectolax.parser import HTMLParser
 
+from bookworm.document import cache_utils
 from bookworm.i18n import LocaleInfo
 from bookworm.image_io import ImageIO
 from bookworm.logger import logger
@@ -190,7 +191,7 @@ def epub_html_items(self) -> tuple[str]:
         # As reported in issue 243
         # We will now sort the items obtained earlier based on the position that the chapter itself occupies in the TOC
         spine = [x[0].split('/')[-1] for x in self.epub.spine]
-        log.info(spine)
+        log.debug(spine)
         try:
             items = sorted(items, key=lambda x: spine.index(x.id))
         except ValueError:
@@ -310,7 +311,8 @@ def html_content(self):
             self._get_cache_directory(), eviction_policy="least-frequently-used"
         )
         cache_key = self.uri.to_uri_string()
-        if cached_html_content := cache.get(cache_key):
+        document_path = self.get_file_system_path()
+        if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, document_path, cache):
             return cached_html_content.decode("utf-8")
         html_content_gen = (
             (item.file_name, item.content) for item in self.epub_html_items
@@ -323,6 +325,7 @@ def html_content(self):
             title=self.epub.title, body_content=buf.getvalue()
         )
         cache.set(cache_key, html_content.encode("utf-8"))
+        cache_utils.set_document_modified_time(cache_key, document_path, cache)
         return html_content
 
     def prefix_html_ids(self, filename, html):

diff --git a/bookworm/document/formats/word.py b/bookworm/document/formats/word.py
@@ -19,6 +19,7 @@
 
 from bookworm import app
 from bookworm.concurrency import process_worker, threaded_worker
+from bookworm.document import cache_utils
 from bookworm.document.uri import DocumentUri
 from bookworm.logger import logger
 from bookworm.paths import app_path, home_data_path
@@ -85,17 +86,19 @@ def try_decrypt(self, data_buf, decryption_key):
 
     def _get_html_content_from_docx(self, data_buf, is_encrypted_document):
         data_buf.seek(0)
+        doc_path = self.get_file_system_path
         cache = Cache(
             self._get_cache_directory(), eviction_policy="least-frequently-used"
         )
         cache_key = self.uri.to_uri_string()
-        if cached_html_content := cache.get(cache_key):
+        if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, doc_path, cache):
             return cached_html_content.decode("utf-8")
         result = mammoth.convert_to_html(data_buf, include_embedded_style_map=False)
         data_buf.seek(0)
         html_content = self.make_proper_html(result.value, data_buf)
         if not is_encrypted_document:
             cache.set(cache_key, html_content.encode("utf-8"))
+            cache_utils.set_document_modified_time(cache_key, doc_path, cache)
         return html_content
 
     def make_proper_html(self, html_string, data_buf):

diff --git a/tests/test_epub.py b/tests/test_epub.py
@@ -1,14 +1,57 @@
+from pathlib import Path
+
+from ebooklib import epub
 import pytest
 
 from bookworm.document.uri import DocumentUri
 from bookworm.document.formats.epub import EpubDocument
 
+def temp_book(title: str = 'Sample book') -> epub.EpubBook:
+    book = epub.EpubBook()
+    book.set_title("test book")
+    book.set_language('en')
+    c1 = epub.EpubHtml(title="Intro", file_name="chap_01.xhtml", lang="en")
+    c1.content = (
+        "<h1>This is a test</h1>"
+    )
+    book.add_item(c1)
+    book.toc = (
+        epub.Link("chap_01.xhtml", "Introduction", "intro"),
+        (epub.Section("Simple book"), (c1,)),
+    )
+
+    book.add_item(epub.EpubNcx())
+    book.add_item(epub.EpubNav())
+    book.spine = ["nav", c1]
+    return book
+
 
 def test_chapter_order_is_unchanged_with_roman_numbers(asset):
-    epub = EpubDocument(DocumentUri.from_filename(asset('roman.epub')))
-    epub.read()
-    spine = [x[0] for x in epub.epub.spine]
-    items = [x.file_name.split('/')[-1] for x in epub.epub_html_items]
-    print(items)
-    print(spine)
+    doc = EpubDocument(DocumentUri.from_filename(asset('roman.epub')))
+    doc.read()
+    spine = [x[0] for x in doc.epub.spine]
+    items = [x.file_name.split('/')[-1] for x in doc.epub_html_items]
     assert spine == items
+
+def test_modified_epub_modifies_cache(asset):
+    book = temp_book()
+    epub.write_epub(asset("test.epub"), book, {})
+    doc = EpubDocument(DocumentUri.from_filename(asset('test.epub')))
+    doc.read()
+    content = doc.html_content
+
+    # Let's now add a second chapter, and see whether the document read modifies its cache
+    c2 = epub.EpubHtml(title="Second chapter", file_name="chap_02.xhtml", lang="en")
+    c2.content = (
+        "<h1>This is another test</h1>"
+    )
+    book.add_item(c2)
+    book.spine.append(c2)
+    epub.write_epub(asset("test.epub"), book, {})
+
+    # read the book once more, and verify that the content is different
+    doc = EpubDocument(DocumentUri.from_filename(asset('test.epub')))
+    doc.read()
+    new_content = doc.html_content
+    Path(asset("test.epub")).unlink()
+    assert content != new_content