Skip to content

Commit

Permalink
fix(epub, word): Update the documents cache whenever the file is modi…
Browse files Browse the repository at this point in the history
…fied (#289)

* Added failing test

* Implemented modification detection
  • Loading branch information
pauliyobo authored Dec 15, 2024
1 parent 8aade8f commit ee7d4f0
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 9 deletions.
23 changes: 23 additions & 0 deletions bookworm/document/cache_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Caching utilities"""
from pathlib import Path

from diskcache import Cache


def is_document_modified(key: str, path: Path, cache: Cache) -> bool:
"""
Checks whether a particular document was modified
We can currently afford to naively just stat() the file_path in order to determine it
"""
mtime = cache.get(f"{key}_meta")
if not mtime:
# No information for the book was found, so return True just to be safe
# TODO: Is this acceptable?
return True
stat_mtime = path.stat().st_mtime
return mtime != stat_mtime

def set_document_modified_time(key: str, path: Path, cache: Cache) -> bool:
key = f"{key}_meta"
cache.set(key, path.stat().st_mtime)

7 changes: 5 additions & 2 deletions bookworm/document/formats/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lxml import html as lxml_html
from selectolax.parser import HTMLParser

from bookworm.document import cache_utils
from bookworm.i18n import LocaleInfo
from bookworm.image_io import ImageIO
from bookworm.logger import logger
Expand Down Expand Up @@ -190,7 +191,7 @@ def epub_html_items(self) -> tuple[str]:
# As reported in issue 243
# We will now sort the items obtained earlier based on the position that the chapter itself occupies in the TOC
spine = [x[0].split('/')[-1] for x in self.epub.spine]
log.info(spine)
log.debug(spine)
try:
items = sorted(items, key=lambda x: spine.index(x.id))
except ValueError:
Expand Down Expand Up @@ -310,7 +311,8 @@ def html_content(self):
self._get_cache_directory(), eviction_policy="least-frequently-used"
)
cache_key = self.uri.to_uri_string()
if cached_html_content := cache.get(cache_key):
document_path = self.get_file_system_path()
if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, document_path, cache):
return cached_html_content.decode("utf-8")
html_content_gen = (
(item.file_name, item.content) for item in self.epub_html_items
Expand All @@ -323,6 +325,7 @@ def html_content(self):
title=self.epub.title, body_content=buf.getvalue()
)
cache.set(cache_key, html_content.encode("utf-8"))
cache_utils.set_document_modified_time(cache_key, document_path, cache)
return html_content

def prefix_html_ids(self, filename, html):
Expand Down
5 changes: 4 additions & 1 deletion bookworm/document/formats/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from bookworm import app
from bookworm.concurrency import process_worker, threaded_worker
from bookworm.document import cache_utils
from bookworm.document.uri import DocumentUri
from bookworm.logger import logger
from bookworm.paths import app_path, home_data_path
Expand Down Expand Up @@ -85,17 +86,19 @@ def try_decrypt(self, data_buf, decryption_key):

def _get_html_content_from_docx(self, data_buf, is_encrypted_document):
data_buf.seek(0)
doc_path = self.get_file_system_path
cache = Cache(
self._get_cache_directory(), eviction_policy="least-frequently-used"
)
cache_key = self.uri.to_uri_string()
if cached_html_content := cache.get(cache_key):
if (cached_html_content := cache.get(cache_key)) and not cache_utils.is_document_modified(cache_key, doc_path, cache):
return cached_html_content.decode("utf-8")
result = mammoth.convert_to_html(data_buf, include_embedded_style_map=False)
data_buf.seek(0)
html_content = self.make_proper_html(result.value, data_buf)
if not is_encrypted_document:
cache.set(cache_key, html_content.encode("utf-8"))
cache_utils.set_document_modified_time(cache_key, doc_path, cache)
return html_content

def make_proper_html(self, html_string, data_buf):
Expand Down
55 changes: 49 additions & 6 deletions tests/test_epub.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,57 @@
from pathlib import Path

from ebooklib import epub
import pytest

from bookworm.document.uri import DocumentUri
from bookworm.document.formats.epub import EpubDocument

def temp_book(title: str = 'Sample book') -> epub.EpubBook:
book = epub.EpubBook()
book.set_title("test book")
book.set_language('en')
c1 = epub.EpubHtml(title="Intro", file_name="chap_01.xhtml", lang="en")
c1.content = (
"<h1>This is a test</h1>"
)
book.add_item(c1)
book.toc = (
epub.Link("chap_01.xhtml", "Introduction", "intro"),
(epub.Section("Simple book"), (c1,)),
)

book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
book.spine = ["nav", c1]
return book


def test_chapter_order_is_unchanged_with_roman_numbers(asset):
epub = EpubDocument(DocumentUri.from_filename(asset('roman.epub')))
epub.read()
spine = [x[0] for x in epub.epub.spine]
items = [x.file_name.split('/')[-1] for x in epub.epub_html_items]
print(items)
print(spine)
doc = EpubDocument(DocumentUri.from_filename(asset('roman.epub')))
doc.read()
spine = [x[0] for x in doc.epub.spine]
items = [x.file_name.split('/')[-1] for x in doc.epub_html_items]
assert spine == items

def test_modified_epub_modifies_cache(asset):
book = temp_book()
epub.write_epub(asset("test.epub"), book, {})
doc = EpubDocument(DocumentUri.from_filename(asset('test.epub')))
doc.read()
content = doc.html_content

# Let's now add a second chapter, and see whether the document read modifies its cache
c2 = epub.EpubHtml(title="Second chapter", file_name="chap_02.xhtml", lang="en")
c2.content = (
"<h1>This is another test</h1>"
)
book.add_item(c2)
book.spine.append(c2)
epub.write_epub(asset("test.epub"), book, {})

# read the book once more, and verify that the content is different
doc = EpubDocument(DocumentUri.from_filename(asset('test.epub')))
doc.read()
new_content = doc.html_content
Path(asset("test.epub")).unlink()
assert content != new_content

0 comments on commit ee7d4f0

Please sign in to comment.