Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v6
with:
token: ${{ secrets.RELEASE_PAT }}
submodules: recursive
fetch-depth: 0

Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## [Unreleased]

### Added
- EPUB layout now supports CJK book text with SD-card fonts, including character-level wrapping and basic CJK punctuation line-break rules.

### Changed

Expand Down
177 changes: 118 additions & 59 deletions lib/Epub/Epub/ParsedText.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,102 @@ bool isWordCharacter(uint32_t cp) {

} // namespace

// Makes sure `words` can hold at least `requiredSize` tokens without reallocating.
//
// Nothing happens when `words` already has enough capacity. Otherwise the new
// capacity is the largest of: twice the current capacity, `requiredSize`, and a
// floor of 16. The same value is then requested for every parallel per-token
// vector so they grow together.
void ParsedText::reserveWordCapacity(const size_t requiredSize) {
  const size_t currentCapacity = words.capacity();
  if (currentCapacity < requiredSize) {
    constexpr size_t MIN_TOKEN_CAPACITY = 16;

    // Start from geometric growth, then clamp upwards to the request and the floor.
    size_t grownCapacity = currentCapacity * 2;
    grownCapacity = (grownCapacity < requiredSize) ? requiredSize : grownCapacity;
    grownCapacity = (grownCapacity < MIN_TOKEN_CAPACITY) ? MIN_TOKEN_CAPACITY : grownCapacity;

    words.reserve(grownCapacity);
    wordStyles.reserve(grownCapacity);
    wordContinues.reserve(grownCapacity);
    wordNoBreakBefore.reserve(grownCapacity);
    wordIsBionicSuffix.reserve(grownCapacity);
    wordIsGuideDot.reserve(grownCapacity);
    wordBackgroundBlack.reserve(grownCapacity);
  }
}

// Appends a single token together with its per-token attributes, keeping all
// parallel vectors the same length. An empty `token` is silently ignored.
//
//   token           - text of the token (copied into `words`)
//   style           - font style stored for the token
//   continues       - stored in `wordContinues`
//   noBreakBefore   - stored in `wordNoBreakBefore`
//   bionicSuffix    - stored in `wordIsBionicSuffix`
//   guideDot        - stored in `wordIsGuideDot`
//   backgroundBlack - stored in `wordBackgroundBlack`
void ParsedText::pushToken(std::string_view token, const EpdFontFamily::Style style, const bool continues,
                           const bool noBreakBefore, const bool bionicSuffix, const bool guideDot,
                           const bool backgroundBlack) {
  if (!token.empty()) {
    words.emplace_back(token.data(), token.size());
    wordStyles.push_back(style);
    wordContinues.push_back(continues);
    wordNoBreakBefore.push_back(noBreakBefore);
    wordIsBionicSuffix.push_back(bionicSuffix);
    wordIsGuideDot.push_back(guideDot);
    wordBackgroundBlack.push_back(backgroundBlack);
  }
}

// Tokenizes `word` for CJK line breaking.
//
// Returns false, without pushing anything, when no codepoint in `word` satisfies
// utf8IsCjkBreakable(); the caller is then expected to handle the word itself.
// Otherwise returns true after splitting `word` into tokens:
//   - every breakable codepoint becomes its own token;
//   - each run of other bytes between breakable codepoints becomes one token.
//
// Flags on the emitted tokens:
//   - continues:     `attachToPrevious` for the first token, true for all later ones.
//   - noBreakBefore: true when the token is first and `attachToPrevious` is set, when
//                    the previous breakable codepoint satisfied
//                    utf8IsCjkOpeningPunctuation(), or (breakable tokens only) when the
//                    codepoint itself satisfies utf8IsCjkClosingPunctuation().
//   - bionicSuffix and guideDot are always false.
bool ParsedText::addCjkAwareWord(std::string_view word, const EpdFontFamily::Style baseStyle,
                                 const bool attachToPrevious, const bool backgroundBlack) {
  const auto* const wordBegin = reinterpret_cast<const unsigned char*>(word.data());
  const auto* const wordEnd = wordBegin + word.size();

  // Pass 1: stop at the first breakable codepoint; bail out if there is none.
  bool foundBreakable = false;
  for (const unsigned char* scan = wordBegin; scan < wordEnd && !foundBreakable;) {
    foundBreakable = utf8IsCjkBreakable(utf8NextCodepoint(&scan));
  }
  if (!foundBreakable) {
    return false;
  }

  // The byte length is used as a generous upper bound on the number of tokens produced.
  reserveWordCapacity(words.size() + word.size());

  const unsigned char* runStart = wordBegin;  // start of the pending non-breakable run
  bool firstTokenPending = true;              // no token has been emitted for this word yet
  bool afterOpeningPunctuation = false;       // previous breakable codepoint was opening punctuation

  // Flags shared by both kinds of token.
  const auto tokenContinues = [&]() { return !firstTokenPending || attachToPrevious; };
  const auto inheritedNoBreak = [&]() { return (firstTokenPending && attachToPrevious) || afterOpeningPunctuation; };
  const auto slice = [](const unsigned char* from, const unsigned char* to) {
    return std::string_view(reinterpret_cast<const char*>(from), to - from);
  };

  // Emits the pending non-breakable run [runStart, runEnd) as one token, if it is non-empty.
  const auto emitPendingRun = [&](const unsigned char* runEnd) {
    if (runEnd > runStart) {
      const bool continues = tokenContinues();
      const bool noBreakBefore = inheritedNoBreak();
      pushToken(slice(runStart, runEnd), baseStyle, continues, noBreakBefore, false, false, backgroundBlack);
      firstTokenPending = false;
      afterOpeningPunctuation = false;
    }
  };

  // Pass 2: walk the word again, cutting it at every breakable codepoint.
  for (const unsigned char* cursor = wordBegin; cursor < wordEnd;) {
    const unsigned char* const glyphStart = cursor;
    const uint32_t codepoint = utf8NextCodepoint(&cursor);
    if (utf8IsCjkBreakable(codepoint)) {
      // Anything accumulated before this codepoint goes out first.
      emitPendingRun(glyphStart);

      const bool continues = tokenContinues();
      const bool noBreakBefore = inheritedNoBreak() || utf8IsCjkClosingPunctuation(codepoint);
      pushToken(slice(glyphStart, cursor), baseStyle, continues, noBreakBefore, false, false, backgroundBlack);

      firstTokenPending = false;
      afterOpeningPunctuation = utf8IsCjkOpeningPunctuation(codepoint);
      runStart = cursor;
    }
  }

  // Trailing non-breakable bytes after the last breakable codepoint.
  emitPendingRun(wordEnd);
  return true;
}

void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
const bool attachToPrevious, const bool backgroundBlack) {
if (word.empty()) return;
Expand All @@ -154,24 +250,21 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle,
const bool wordStartsRtl = !hasRtlWord && mayContainRtlBytes(word.c_str()) &&
BidiUtils::startsWithRtl(word.c_str(), RTL_PER_WORD_PROBE_DEPTH);

if (addCjkAwareWord(word, baseStyle, attachToPrevious, backgroundBlack)) {
if (wordStartsRtl) {
hasRtlWord = true;
}
return;
}

// GUIDE READING: insert middle dot (U+00B7) between non-continuation words.
if (guideReadingEnabled && !attachToPrevious && !words.empty()) {
words.emplace_back("\xc2\xb7");
wordStyles.push_back(EpdFontFamily::REGULAR);
wordContinues.push_back(false);
wordIsBionicSuffix.push_back(false);
wordIsGuideDot.push_back(true);
wordBackgroundBlack.push_back(false);
pushToken("\xc2\xb7", EpdFontFamily::REGULAR, false, true, false, true, false);
}

// Already-bold text should stay fully bold; bionic splitting would make its suffix regular later.
if (!this->bionicReadingEnabled || (baseStyle & EpdFontFamily::BOLD) != 0) {
words.push_back(std::move(word));
wordStyles.push_back(baseStyle);
wordContinues.push_back(attachToPrevious);
wordIsBionicSuffix.push_back(false);
wordIsGuideDot.push_back(false);
wordBackgroundBlack.push_back(backgroundBlack);
pushToken(word, baseStyle, attachToPrevious, attachToPrevious, false, false, backgroundBlack);
if (wordStartsRtl) {
hasRtlWord = true;
}
Expand All @@ -183,39 +276,14 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle,
// Pre-reserve capacity to prevent mid-word heap reallocations.
size_t maxPossibleNewTokens = word.length();
size_t requiredSize = words.size() + maxPossibleNewTokens;

if (words.capacity() < requiredSize) {
// Emulate standard geometric growth (doubling) to ensure we don't reallocate on every word.
size_t newCapacity = words.capacity() * 2;

// Ensure the doubled capacity is actually enough for this specific word
if (newCapacity < requiredSize) {
newCapacity = requiredSize;
}
// Set a sensible minimum starting size so the first few words don't trigger tiny reallocations
if (newCapacity < 16) {
newCapacity = 16;
}

words.reserve(newCapacity);
wordStyles.reserve(newCapacity);
wordContinues.reserve(newCapacity);
wordIsBionicSuffix.reserve(newCapacity);
wordIsGuideDot.reserve(newCapacity);
wordBackgroundBlack.reserve(newCapacity);
}
reserveWordCapacity(requiredSize);

// Lambda helper to process and push individual sub-segments of the string
// Use std::string_view to avoid heap allocations when slicing
auto processSegment = [&](std::string_view segment, bool isWord, bool attach) {
if (!isWord) {
// Punctuation and Numbers stay regular
words.emplace_back(segment);
wordStyles.push_back(baseStyle);
wordContinues.push_back(attach);
wordIsBionicSuffix.push_back(false);
wordIsGuideDot.push_back(false);
wordBackgroundBlack.push_back(backgroundBlack);
pushToken(segment, baseStyle, attach, attach, false, false, backgroundBlack);
} else {
size_t charCount = 0;
const unsigned char* countPtr = reinterpret_cast<const unsigned char*>(segment.data());
Expand All @@ -233,12 +301,8 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle,

if (targetBoldChars >= charCount) {
// Whole segment is bold - no suffix split needed
words.emplace_back(segment);
wordStyles.push_back(static_cast<EpdFontFamily::Style>(baseStyle | EpdFontFamily::BOLD));
wordContinues.push_back(attach);
wordIsBionicSuffix.push_back(false);
wordIsGuideDot.push_back(false);
wordBackgroundBlack.push_back(backgroundBlack);
pushToken(segment, static_cast<EpdFontFamily::Style>(baseStyle | EpdFontFamily::BOLD), attach, attach, false,
false, backgroundBlack);
} else {
countPtr = reinterpret_cast<const unsigned char*>(segment.data());
for (size_t i = 0; i < targetBoldChars; ++i) {
Expand All @@ -247,20 +311,12 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle,
size_t splitByteOffset = countPtr - reinterpret_cast<const unsigned char*>(segment.data());

// Bold prefix
words.emplace_back(segment.substr(0, splitByteOffset));
wordStyles.push_back(static_cast<EpdFontFamily::Style>(baseStyle | EpdFontFamily::BOLD));
wordContinues.push_back(attach);
wordIsBionicSuffix.push_back(false);
wordIsGuideDot.push_back(false);
wordBackgroundBlack.push_back(backgroundBlack);
pushToken(segment.substr(0, splitByteOffset),
static_cast<EpdFontFamily::Style>(baseStyle | EpdFontFamily::BOLD), attach, attach, false, false,
backgroundBlack);

// Regular suffix - marked so extractLine can merge it back into one TextBlock entry
words.emplace_back(segment.substr(splitByteOffset));
wordStyles.push_back(baseStyle);
wordContinues.push_back(true);
wordIsBionicSuffix.push_back(true);
wordIsGuideDot.push_back(false);
wordBackgroundBlack.push_back(backgroundBlack);
pushToken(segment.substr(splitByteOffset), baseStyle, true, true, true, false, backgroundBlack);
}
}
};
Expand Down Expand Up @@ -381,6 +437,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
words.erase(words.begin(), words.begin() + consumed);
wordStyles.erase(wordStyles.begin(), wordStyles.begin() + consumed);
wordContinues.erase(wordContinues.begin(), wordContinues.begin() + consumed);
wordNoBreakBefore.erase(wordNoBreakBefore.begin(), wordNoBreakBefore.begin() + consumed);
wordIsBionicSuffix.erase(wordIsBionicSuffix.begin(), wordIsBionicSuffix.begin() + consumed);
wordIsGuideDot.erase(wordIsGuideDot.begin(), wordIsGuideDot.begin() + consumed);
wordBackgroundBlack.erase(wordBackgroundBlack.begin(), wordBackgroundBlack.begin() + consumed);
Expand All @@ -405,7 +462,7 @@ std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, c
}

auto nextTokenAttaches = [&](const size_t index, const size_t totalWordCount) {
return index + 1 < totalWordCount && (continuesVec[index + 1] || wordIsGuideDot[index + 1]);
return index + 1 < totalWordCount && (wordNoBreakBefore[index + 1] || wordIsGuideDot[index + 1]);
};

const int firstLineIndent = resolveFirstLineIndent(true, renderer, fontId);
Expand Down Expand Up @@ -524,7 +581,7 @@ std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r
size_t currentIndex = 0;
bool isFirstLine = true;
auto currentTokenAttaches = [&](const size_t index) {
return index < wordWidths.size() && (continuesVec[index] || wordIsGuideDot[index]);
return index < wordWidths.size() && (wordNoBreakBefore[index] || wordIsGuideDot[index]);
};

while (currentIndex < wordWidths.size()) {
Expand Down Expand Up @@ -652,6 +709,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
// The hyphen remainder is neither a bionic suffix nor a guide dot - it starts fresh on the next line.
wordIsBionicSuffix.insert(wordIsBionicSuffix.begin() + wordIndex + 1, false);
wordIsGuideDot.insert(wordIsGuideDot.begin() + wordIndex + 1, false);
wordNoBreakBefore.insert(wordNoBreakBefore.begin() + wordIndex + 1, false);

// Continuation flag handling after splitting a word into prefix + remainder.
//
Expand Down Expand Up @@ -726,6 +784,7 @@ bool ParsedText::splitPathologicalTokenAtIndex(const size_t wordIndex, const int
wordBackgroundBlack.insert(wordBackgroundBlack.begin() + wordIndex + 1, wordBackgroundBlack[wordIndex]);
wordIsBionicSuffix.insert(wordIsBionicSuffix.begin() + wordIndex + 1, false);
wordIsGuideDot.insert(wordIsGuideDot.begin() + wordIndex + 1, false);
wordNoBreakBefore.insert(wordNoBreakBefore.begin() + wordIndex + 1, false);
wordContinues.insert(wordContinues.begin() + wordIndex + 1, false);

wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
Expand Down
7 changes: 7 additions & 0 deletions lib/Epub/Epub/ParsedText.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <functional>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

#include "blocks/BlockStyle.h"
Expand All @@ -16,6 +17,7 @@ class ParsedText {
std::vector<std::string> words;
std::vector<EpdFontFamily::Style> wordStyles;
std::vector<bool> wordContinues; // true = word attaches to previous (no space before it)
std::vector<bool> wordNoBreakBefore; // true = line breaker cannot start a line at this token
std::vector<bool> wordIsBionicSuffix; // true = token is the regular tail of a bionic bold-prefix split
std::vector<bool> wordIsGuideDot; // true = token is a guide dot (U+00B7) inserted between words
std::vector<uint8_t> wordBackgroundBlack;
Expand All @@ -35,6 +37,11 @@ class ParsedText {
std::vector<uint8_t> reorderedBackgroundBlackScratch;
std::vector<uint16_t> visualOrderScratch;

void reserveWordCapacity(size_t requiredSize);
void pushToken(std::string_view token, EpdFontFamily::Style style, bool continues, bool noBreakBefore,
bool bionicSuffix, bool guideDot, bool backgroundBlack);
bool addCjkAwareWord(std::string_view word, EpdFontFamily::Style baseStyle, bool attachToPrevious,
bool backgroundBlack);
int resolveFirstLineIndent(bool isFirstLine, const GfxRenderer& renderer, int fontId) const;
std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth,
std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec);
Expand Down
29 changes: 23 additions & 6 deletions lib/Epub/Epub/Section.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <Logging.h>
#include <MemoryBudget.h>
#include <Serialization.h>
#include <Utf8.h>

#include "Epub/css/CssParser.h"
#include "Page.h"
Expand All @@ -13,7 +14,7 @@

namespace {
constexpr uint32_t SECTION_CACHE_MAGIC = 0x535843FF; // bytes: 0xFF, "CXS"
constexpr uint8_t SECTION_FILE_VERSION = 40;
constexpr uint8_t SECTION_FILE_VERSION = 41;
constexpr uint8_t INITIAL_PAGE_LUT_RESERVE = 32;
constexpr uint32_t HEADER_SIZE = sizeof(SECTION_CACHE_MAGIC) + sizeof(uint8_t) + sizeof(int) + sizeof(float) +
sizeof(bool) + sizeof(bool) + sizeof(uint8_t) + sizeof(uint16_t) + sizeof(uint16_t) +
Expand All @@ -26,6 +27,10 @@ struct PageLutEntry {
uint16_t paragraphIndex;
uint16_t listItemIndex;
};

// Reports whether the book's language tag is classified as CJK by
// utf8LanguageTagIsCjk(). A null `epub` yields false.
bool shouldDisableLatinReadingAssistForCjk(const std::shared_ptr<Epub>& epub) {
  if (!epub) {
    return false;
  }
  return utf8LanguageTagIsCjk(epub->getLanguage());
}
} // namespace

uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
Expand Down Expand Up @@ -89,6 +94,9 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con
const uint16_t viewportWidth, const uint16_t viewportHeight,
const bool hyphenationEnabled, const bool embeddedStyle, const uint8_t imageRendering,
const bool bionicReadingEnabled, const bool guideReadingEnabled) {
const bool cjkOptimized = shouldDisableLatinReadingAssistForCjk(epub);
const bool effectiveBionicReadingEnabled = bionicReadingEnabled && !cjkOptimized;
const bool effectiveGuideReadingEnabled = guideReadingEnabled && !cjkOptimized;
if (!Storage.openFileForRead("SCT", filePath, file)) {
return false;
}
Expand Down Expand Up @@ -155,7 +163,8 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con
paragraphAlignment != fileParagraphAlignment || viewportWidth != fileViewportWidth ||
viewportHeight != fileViewportHeight || hyphenationEnabled != fileHyphenationEnabled ||
embeddedStyle != fileEmbeddedStyle || imageRendering != fileImageRendering ||
bionicReadingEnabled != fileBionicReadingEnabled || guideReadingEnabled != fileGuideReadingEnabled) {
effectiveBionicReadingEnabled != fileBionicReadingEnabled ||
effectiveGuideReadingEnabled != fileGuideReadingEnabled) {
file.close();
LOG_ERR("SCT", "Deserialization failed: Parameters do not match");
clearCache();
Expand Down Expand Up @@ -203,9 +212,16 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
const auto tmpSectionPath = filePath + ".tmp";
pageCount = 0;
if (layoutAbortedForLowMemory) *layoutAbortedForLowMemory = false;
const bool cjkOptimized = shouldDisableLatinReadingAssistForCjk(epub);
const bool effectiveBionicReadingEnabled = bionicReadingEnabled && !cjkOptimized;
const bool effectiveGuideReadingEnabled = guideReadingEnabled && !cjkOptimized;
LOG_DBG("SCT", "Create section start: spine=%d viewport=%ux%u image=%u bionic=%u guide=%u free=%u maxAlloc=%u",
spineIndex, viewportWidth, viewportHeight, imageRendering, bionicReadingEnabled, guideReadingEnabled,
ESP.getFreeHeap(), ESP.getMaxAllocHeap());
spineIndex, viewportWidth, viewportHeight, imageRendering, effectiveBionicReadingEnabled,
effectiveGuideReadingEnabled, ESP.getFreeHeap(), ESP.getMaxAllocHeap());
if (cjkOptimized && (bionicReadingEnabled || guideReadingEnabled)) {
LOG_DBG("SCT", "CJK language detected (%s); disabling bionic/guide reading for section layout",
epub->getLanguage().c_str());
}

// Create cache directory if it doesn't exist
{
Expand Down Expand Up @@ -260,7 +276,7 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
}
if (!writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, forceParagraphIndents, paragraphAlignment,
viewportWidth, viewportHeight, hyphenationEnabled, embeddedStyle, imageRendering,
bionicReadingEnabled, guideReadingEnabled)) {
effectiveBionicReadingEnabled, effectiveGuideReadingEnabled)) {
LOG_ERR("SCT", "Failed to write section header");
file.close();
Storage.remove(tmpSectionPath.c_str());
Expand Down Expand Up @@ -308,7 +324,8 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c

ChapterHtmlSlimParser visitor(
epub, tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, forceParagraphIndents,
paragraphAlignment, viewportWidth, viewportHeight, hyphenationEnabled, bionicReadingEnabled, guideReadingEnabled,
paragraphAlignment, viewportWidth, viewportHeight, hyphenationEnabled, effectiveBionicReadingEnabled,
effectiveGuideReadingEnabled,
[this, &lut](std::unique_ptr<Page> page, const uint16_t paragraphIndex, const uint16_t listItemIndex) {
lut.push_back({this->onPageComplete(std::move(page)), paragraphIndex, listItemIndex});
},
Expand Down
2 changes: 1 addition & 1 deletion lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1831,7 +1831,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
}

// If we're about to run out of space, then cut the word off and start a new one.
// For CJK text (no spaces), this is the primary word-breaking mechanism.
// ParsedText performs CJK-aware tokenization after this flushes a complete UTF-8 chunk.
// We must avoid splitting multi-byte UTF-8 sequences across word boundaries,
// otherwise the trailing bytes become orphaned continuation bytes that the
// decoder can't interpret.
Expand Down
Loading
Loading