From 7910ea38ef0c33a9b943b954a07f6d48c35025d8 Mon Sep 17 00:00:00 2001 From: Julia Nguyen Date: Mon, 15 Jun 2026 09:09:39 -0400 Subject: [PATCH 1/2] ci: update release workflow for catalog publishing --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6fac881dec..c13717a21a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,6 +16,7 @@ jobs: steps: - uses: actions/checkout@v6 with: + token: ${{ secrets.RELEASE_PAT }} submodules: recursive fetch-depth: 0 From 243418ad159507df313a11169c3a07a5d0e7b1d9 Mon Sep 17 00:00:00 2001 From: Julia Nguyen Date: Mon, 15 Jun 2026 15:56:28 -0400 Subject: [PATCH 2/2] feat: support cjk epub wrapping --- CHANGELOG.md | 1 + lib/Epub/Epub/ParsedText.cpp | 177 ++++++++++++------ lib/Epub/Epub/ParsedText.h | 7 + lib/Epub/Epub/Section.cpp | 29 ++- .../Epub/parsers/ChapterHtmlSlimParser.cpp | 2 +- lib/Utf8/Utf8.h | 63 +++++++ src/activities/reader/EpubReaderActivity.cpp | 12 +- test/CMakeLists.txt | 1 + test/utf8_cjk/CMakeLists.txt | 15 ++ test/utf8_cjk/Utf8CjkTest.cpp | 37 ++++ 10 files changed, 277 insertions(+), 67 deletions(-) create mode 100644 test/utf8_cjk/CMakeLists.txt create mode 100644 test/utf8_cjk/Utf8CjkTest.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index ddd7118f3d..5620e2af6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## [Unreleased] ### Added +- EPUB layout now supports CJK book text with SD-card fonts, including character-level wrapping and basic CJK punctuation line-break rules. 
### Changed diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 320112f679..a804889f8d 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -143,6 +143,102 @@ bool isWordCharacter(uint32_t cp) { } // namespace +void ParsedText::reserveWordCapacity(const size_t requiredSize) { + if (words.capacity() >= requiredSize) { + return; + } + + size_t newCapacity = words.capacity() * 2; + if (newCapacity < requiredSize) { + newCapacity = requiredSize; + } + if (newCapacity < 16) { + newCapacity = 16; + } + + words.reserve(newCapacity); + wordStyles.reserve(newCapacity); + wordContinues.reserve(newCapacity); + wordNoBreakBefore.reserve(newCapacity); + wordIsBionicSuffix.reserve(newCapacity); + wordIsGuideDot.reserve(newCapacity); + wordBackgroundBlack.reserve(newCapacity); +} + +void ParsedText::pushToken(std::string_view token, const EpdFontFamily::Style style, const bool continues, + const bool noBreakBefore, const bool bionicSuffix, const bool guideDot, + const bool backgroundBlack) { + if (token.empty()) { + return; + } + words.emplace_back(token); + wordStyles.push_back(style); + wordContinues.push_back(continues); + wordNoBreakBefore.push_back(noBreakBefore); + wordIsBionicSuffix.push_back(bionicSuffix); + wordIsGuideDot.push_back(guideDot); + wordBackgroundBlack.push_back(backgroundBlack); +} + +bool ParsedText::addCjkAwareWord(std::string_view word, const EpdFontFamily::Style baseStyle, + const bool attachToPrevious, const bool backgroundBlack) { + const auto* ptr = reinterpret_cast(word.data()); + const auto* end = ptr + word.size(); + bool containsCjk = false; + while (ptr < end) { + const uint32_t cp = utf8NextCodepoint(&ptr); + if (utf8IsCjkBreakable(cp)) { + containsCjk = true; + break; + } + } + + if (!containsCjk) { + return false; + } + + reserveWordCapacity(words.size() + word.size()); + + ptr = reinterpret_cast(word.data()); + const unsigned char* segmentStart = ptr; + bool isFirstToken = true; + bool 
openingNeedsNextToken = false; + + auto flushNonCjkSegment = [&](const unsigned char* segmentEnd) { + if (segmentEnd <= segmentStart) { + return; + } + const bool continues = isFirstToken ? attachToPrevious : true; + const bool noBreakBefore = (isFirstToken && attachToPrevious) || openingNeedsNextToken; + pushToken(std::string_view(reinterpret_cast(segmentStart), segmentEnd - segmentStart), baseStyle, + continues, noBreakBefore, false, false, backgroundBlack); + isFirstToken = false; + openingNeedsNextToken = false; + }; + + while (ptr < end) { + const unsigned char* cpStart = ptr; + const uint32_t cp = utf8NextCodepoint(&ptr); + if (!utf8IsCjkBreakable(cp)) { + continue; + } + + flushNonCjkSegment(cpStart); + + const bool continues = isFirstToken ? attachToPrevious : true; + const bool noBreakBefore = + (isFirstToken && attachToPrevious) || openingNeedsNextToken || utf8IsCjkClosingPunctuation(cp); + pushToken(std::string_view(reinterpret_cast(cpStart), ptr - cpStart), baseStyle, continues, + noBreakBefore, false, false, backgroundBlack); + isFirstToken = false; + openingNeedsNextToken = utf8IsCjkOpeningPunctuation(cp); + segmentStart = ptr; + } + + flushNonCjkSegment(end); + return true; +} + void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline, const bool attachToPrevious, const bool backgroundBlack) { if (word.empty()) return; @@ -154,24 +250,21 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool wordStartsRtl = !hasRtlWord && mayContainRtlBytes(word.c_str()) && BidiUtils::startsWithRtl(word.c_str(), RTL_PER_WORD_PROBE_DEPTH); + if (addCjkAwareWord(word, baseStyle, attachToPrevious, backgroundBlack)) { + if (wordStartsRtl) { + hasRtlWord = true; + } + return; + } + // GUIDE READING: insert middle dot (U+00B7) between non-continuation words. 
if (guideReadingEnabled && !attachToPrevious && !words.empty()) { - words.emplace_back("\xc2\xb7"); - wordStyles.push_back(EpdFontFamily::REGULAR); - wordContinues.push_back(false); - wordIsBionicSuffix.push_back(false); - wordIsGuideDot.push_back(true); - wordBackgroundBlack.push_back(false); + pushToken("\xc2\xb7", EpdFontFamily::REGULAR, false, true, false, true, false); } // Already-bold text should stay fully bold; bionic splitting would make its suffix regular later. if (!this->bionicReadingEnabled || (baseStyle & EpdFontFamily::BOLD) != 0) { - words.push_back(std::move(word)); - wordStyles.push_back(baseStyle); - wordContinues.push_back(attachToPrevious); - wordIsBionicSuffix.push_back(false); - wordIsGuideDot.push_back(false); - wordBackgroundBlack.push_back(backgroundBlack); + pushToken(word, baseStyle, attachToPrevious, attachToPrevious, false, false, backgroundBlack); if (wordStartsRtl) { hasRtlWord = true; } @@ -183,39 +276,14 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, // Pre-reserve capacity to prevent mid-word heap reallocations. size_t maxPossibleNewTokens = word.length(); size_t requiredSize = words.size() + maxPossibleNewTokens; - - if (words.capacity() < requiredSize) { - // Emulate standard geometric growth (doubling) to ensure we don't reallocate on every word. 
- size_t newCapacity = words.capacity() * 2; - - // Ensure the doubled capacity is actually enough for this specific word - if (newCapacity < requiredSize) { - newCapacity = requiredSize; - } - // Set a sensible minimum starting size so the first few words don't trigger tiny reallocations - if (newCapacity < 16) { - newCapacity = 16; - } - - words.reserve(newCapacity); - wordStyles.reserve(newCapacity); - wordContinues.reserve(newCapacity); - wordIsBionicSuffix.reserve(newCapacity); - wordIsGuideDot.reserve(newCapacity); - wordBackgroundBlack.reserve(newCapacity); - } + reserveWordCapacity(requiredSize); // Lambda helper to process and push individual sub-segments of the string // Use std::string_view to avoid heap allocations when slicing auto processSegment = [&](std::string_view segment, bool isWord, bool attach) { if (!isWord) { // Punctuation and Numbers stay regular - words.emplace_back(segment); - wordStyles.push_back(baseStyle); - wordContinues.push_back(attach); - wordIsBionicSuffix.push_back(false); - wordIsGuideDot.push_back(false); - wordBackgroundBlack.push_back(backgroundBlack); + pushToken(segment, baseStyle, attach, attach, false, false, backgroundBlack); } else { size_t charCount = 0; const unsigned char* countPtr = reinterpret_cast(segment.data()); @@ -233,12 +301,8 @@ void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, if (targetBoldChars >= charCount) { // Whole segment is bold - no suffix split needed - words.emplace_back(segment); - wordStyles.push_back(static_cast(baseStyle | EpdFontFamily::BOLD)); - wordContinues.push_back(attach); - wordIsBionicSuffix.push_back(false); - wordIsGuideDot.push_back(false); - wordBackgroundBlack.push_back(backgroundBlack); + pushToken(segment, static_cast(baseStyle | EpdFontFamily::BOLD), attach, attach, false, + false, backgroundBlack); } else { countPtr = reinterpret_cast(segment.data()); for (size_t i = 0; i < targetBoldChars; ++i) { @@ -247,20 +311,12 @@ void 
ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, size_t splitByteOffset = countPtr - reinterpret_cast(segment.data()); // Bold prefix - words.emplace_back(segment.substr(0, splitByteOffset)); - wordStyles.push_back(static_cast(baseStyle | EpdFontFamily::BOLD)); - wordContinues.push_back(attach); - wordIsBionicSuffix.push_back(false); - wordIsGuideDot.push_back(false); - wordBackgroundBlack.push_back(backgroundBlack); + pushToken(segment.substr(0, splitByteOffset), + static_cast(baseStyle | EpdFontFamily::BOLD), attach, attach, false, false, + backgroundBlack); // Regular suffix - marked so extractLine can merge it back into one TextBlock entry - words.emplace_back(segment.substr(splitByteOffset)); - wordStyles.push_back(baseStyle); - wordContinues.push_back(true); - wordIsBionicSuffix.push_back(true); - wordIsGuideDot.push_back(false); - wordBackgroundBlack.push_back(backgroundBlack); + pushToken(segment.substr(splitByteOffset), baseStyle, true, true, true, false, backgroundBlack); } } }; @@ -381,6 +437,7 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo words.erase(words.begin(), words.begin() + consumed); wordStyles.erase(wordStyles.begin(), wordStyles.begin() + consumed); wordContinues.erase(wordContinues.begin(), wordContinues.begin() + consumed); + wordNoBreakBefore.erase(wordNoBreakBefore.begin(), wordNoBreakBefore.begin() + consumed); wordIsBionicSuffix.erase(wordIsBionicSuffix.begin(), wordIsBionicSuffix.begin() + consumed); wordIsGuideDot.erase(wordIsGuideDot.begin(), wordIsGuideDot.begin() + consumed); wordBackgroundBlack.erase(wordBackgroundBlack.begin(), wordBackgroundBlack.begin() + consumed); @@ -405,7 +462,7 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c } auto nextTokenAttaches = [&](const size_t index, const size_t totalWordCount) { - return index + 1 < totalWordCount && (continuesVec[index + 1] || wordIsGuideDot[index + 1]); + return index + 1 < 
totalWordCount && (wordNoBreakBefore[index + 1] || wordIsGuideDot[index + 1]); }; const int firstLineIndent = resolveFirstLineIndent(true, renderer, fontId); @@ -524,7 +581,7 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r size_t currentIndex = 0; bool isFirstLine = true; auto currentTokenAttaches = [&](const size_t index) { - return index < wordWidths.size() && (continuesVec[index] || wordIsGuideDot[index]); + return index < wordWidths.size() && (wordNoBreakBefore[index] || wordIsGuideDot[index]); }; while (currentIndex < wordWidths.size()) { @@ -652,6 +709,7 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl // The hyphen remainder is neither a bionic suffix nor a guide dot - it starts fresh on the next line. wordIsBionicSuffix.insert(wordIsBionicSuffix.begin() + wordIndex + 1, false); wordIsGuideDot.insert(wordIsGuideDot.begin() + wordIndex + 1, false); + wordNoBreakBefore.insert(wordNoBreakBefore.begin() + wordIndex + 1, false); // Continuation flag handling after splitting a word into prefix + remainder. 
// @@ -726,6 +784,7 @@ bool ParsedText::splitPathologicalTokenAtIndex(const size_t wordIndex, const int wordBackgroundBlack.insert(wordBackgroundBlack.begin() + wordIndex + 1, wordBackgroundBlack[wordIndex]); wordIsBionicSuffix.insert(wordIsBionicSuffix.begin() + wordIndex + 1, false); wordIsGuideDot.insert(wordIsGuideDot.begin() + wordIndex + 1, false); + wordNoBreakBefore.insert(wordNoBreakBefore.begin() + wordIndex + 1, false); wordContinues.insert(wordContinues.begin() + wordIndex + 1, false); wordWidths[wordIndex] = static_cast(chosenWidth); diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index 92ae676a73..7ef72838cd 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "blocks/BlockStyle.h" @@ -16,6 +17,7 @@ class ParsedText { std::vector words; std::vector wordStyles; std::vector wordContinues; // true = word attaches to previous (no space before it) + std::vector wordNoBreakBefore; // true = line breaker cannot start a line at this token std::vector wordIsBionicSuffix; // true = token is the regular tail of a bionic bold-prefix split std::vector wordIsGuideDot; // true = token is a guide dot (U+00B7) inserted between words std::vector wordBackgroundBlack; @@ -35,6 +37,11 @@ class ParsedText { std::vector reorderedBackgroundBlackScratch; std::vector visualOrderScratch; + void reserveWordCapacity(size_t requiredSize); + void pushToken(std::string_view token, EpdFontFamily::Style style, bool continues, bool noBreakBefore, + bool bionicSuffix, bool guideDot, bool backgroundBlack); + bool addCjkAwareWord(std::string_view word, EpdFontFamily::Style baseStyle, bool attachToPrevious, + bool backgroundBlack); int resolveFirstLineIndent(bool isFirstLine, const GfxRenderer& renderer, int fontId) const; std::vector computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, std::vector& wordWidths, std::vector& continuesVec); diff --git 
a/lib/Epub/Epub/Section.cpp b/lib/Epub/Epub/Section.cpp index 68fe10a45c..ca742a5475 100644 --- a/lib/Epub/Epub/Section.cpp +++ b/lib/Epub/Epub/Section.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "Epub/css/CssParser.h" #include "Page.h" @@ -13,7 +14,7 @@ namespace { constexpr uint32_t SECTION_CACHE_MAGIC = 0x535843FF; // bytes: 0xFF, "CXS" -constexpr uint8_t SECTION_FILE_VERSION = 40; +constexpr uint8_t SECTION_FILE_VERSION = 41; constexpr uint8_t INITIAL_PAGE_LUT_RESERVE = 32; constexpr uint32_t HEADER_SIZE = sizeof(SECTION_CACHE_MAGIC) + sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(bool) + sizeof(uint8_t) + sizeof(uint16_t) + sizeof(uint16_t) + @@ -26,6 +27,10 @@ struct PageLutEntry { uint16_t paragraphIndex; uint16_t listItemIndex; }; + +bool shouldDisableLatinReadingAssistForCjk(const std::shared_ptr& epub) { + return epub && utf8LanguageTagIsCjk(epub->getLanguage()); +} } // namespace uint32_t Section::onPageComplete(std::unique_ptr page) { @@ -89,6 +94,9 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con const uint16_t viewportWidth, const uint16_t viewportHeight, const bool hyphenationEnabled, const bool embeddedStyle, const uint8_t imageRendering, const bool bionicReadingEnabled, const bool guideReadingEnabled) { + const bool cjkOptimized = shouldDisableLatinReadingAssistForCjk(epub); + const bool effectiveBionicReadingEnabled = bionicReadingEnabled && !cjkOptimized; + const bool effectiveGuideReadingEnabled = guideReadingEnabled && !cjkOptimized; if (!Storage.openFileForRead("SCT", filePath, file)) { return false; } @@ -155,7 +163,8 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con paragraphAlignment != fileParagraphAlignment || viewportWidth != fileViewportWidth || viewportHeight != fileViewportHeight || hyphenationEnabled != fileHyphenationEnabled || embeddedStyle != fileEmbeddedStyle || imageRendering != fileImageRendering || - 
bionicReadingEnabled != fileBionicReadingEnabled || guideReadingEnabled != fileGuideReadingEnabled) { + effectiveBionicReadingEnabled != fileBionicReadingEnabled || + effectiveGuideReadingEnabled != fileGuideReadingEnabled) { file.close(); LOG_ERR("SCT", "Deserialization failed: Parameters do not match"); clearCache(); @@ -203,9 +212,16 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c const auto tmpSectionPath = filePath + ".tmp"; pageCount = 0; if (layoutAbortedForLowMemory) *layoutAbortedForLowMemory = false; + const bool cjkOptimized = shouldDisableLatinReadingAssistForCjk(epub); + const bool effectiveBionicReadingEnabled = bionicReadingEnabled && !cjkOptimized; + const bool effectiveGuideReadingEnabled = guideReadingEnabled && !cjkOptimized; LOG_DBG("SCT", "Create section start: spine=%d viewport=%ux%u image=%u bionic=%u guide=%u free=%u maxAlloc=%u", - spineIndex, viewportWidth, viewportHeight, imageRendering, bionicReadingEnabled, guideReadingEnabled, - ESP.getFreeHeap(), ESP.getMaxAllocHeap()); + spineIndex, viewportWidth, viewportHeight, imageRendering, effectiveBionicReadingEnabled, + effectiveGuideReadingEnabled, ESP.getFreeHeap(), ESP.getMaxAllocHeap()); + if (cjkOptimized && (bionicReadingEnabled || guideReadingEnabled)) { + LOG_DBG("SCT", "CJK language detected (%s); disabling bionic/guide reading for section layout", + epub->getLanguage().c_str()); + } // Create cache directory if it doesn't exist { @@ -260,7 +276,7 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c } if (!writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, forceParagraphIndents, paragraphAlignment, viewportWidth, viewportHeight, hyphenationEnabled, embeddedStyle, imageRendering, - bionicReadingEnabled, guideReadingEnabled)) { + effectiveBionicReadingEnabled, effectiveGuideReadingEnabled)) { LOG_ERR("SCT", "Failed to write section header"); file.close(); Storage.remove(tmpSectionPath.c_str()); @@ 
-308,7 +324,8 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c ChapterHtmlSlimParser visitor( epub, tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, forceParagraphIndents, - paragraphAlignment, viewportWidth, viewportHeight, hyphenationEnabled, bionicReadingEnabled, guideReadingEnabled, + paragraphAlignment, viewportWidth, viewportHeight, hyphenationEnabled, effectiveBionicReadingEnabled, + effectiveGuideReadingEnabled, [this, &lut](std::unique_ptr page, const uint16_t paragraphIndex, const uint16_t listItemIndex) { lut.push_back({this->onPageComplete(std::move(page)), paragraphIndex, listItemIndex}); }, diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index b335ffe254..2f77348729 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -1831,7 +1831,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char } // If we're about to run out of space, then cut the word off and start a new one. - // For CJK text (no spaces), this is the primary word-breaking mechanism. + // ParsedText performs CJK-aware tokenization after this flushes a complete UTF-8 chunk. // We must avoid splitting multi-byte UTF-8 sequences across word boundaries, // otherwise the trailing bytes become orphaned continuation bytes that the // decoder can't interpret. diff --git a/lib/Utf8/Utf8.h b/lib/Utf8/Utf8.h index e7238f8557..a489a11c43 100644 --- a/lib/Utf8/Utf8.h +++ b/lib/Utf8/Utf8.h @@ -35,6 +35,69 @@ inline bool utf8IsCjkBreakable(const uint32_t cp) { || (cp >= 0x2A700 && cp <= 0x2B73F); // CJK Extension C } +// Punctuation that should not be left hanging at the end of a line. 
+inline bool utf8IsCjkOpeningPunctuation(const uint32_t cp) { + switch (cp) { + case 0x3008: // left angle bracket + case 0x300A: // left double angle bracket + case 0x300C: // left corner bracket + case 0x300E: // left white corner bracket + case 0x3010: // left black lenticular bracket + case 0x3014: // left tortoise shell bracket + case 0x3016: // left white lenticular bracket + case 0x3018: // left white tortoise shell bracket + case 0x301A: // left white square bracket + case 0xFF08: // fullwidth left parenthesis + case 0xFF3B: // fullwidth left square bracket + case 0xFF5B: // fullwidth left curly bracket + return true; + default: + return false; + } +} + +// Punctuation that should not start a line. +inline bool utf8IsCjkClosingPunctuation(const uint32_t cp) { + switch (cp) { + case 0x3001: // ideographic comma + case 0x3002: // ideographic full stop + case 0x3009: // right angle bracket + case 0x300B: // right double angle bracket + case 0x300D: // right corner bracket + case 0x300F: // right white corner bracket + case 0x3011: // right black lenticular bracket + case 0x3015: // right tortoise shell bracket + case 0x3017: // right white lenticular bracket + case 0x3019: // right white tortoise shell bracket + case 0x301B: // right white square bracket + case 0xFF01: // fullwidth exclamation mark + case 0xFF09: // fullwidth right parenthesis + case 0xFF0C: // fullwidth comma + case 0xFF0E: // fullwidth full stop + case 0xFF1A: // fullwidth colon + case 0xFF1B: // fullwidth semicolon + case 0xFF1F: // fullwidth question mark + case 0xFF3D: // fullwidth right square bracket + case 0xFF5D: // fullwidth right curly bracket + return true; + default: + return false; + } +}
+
+// Returns true when the tag's primary language subtag is Japanese, Chinese, or Korean.
+// Only the text before the first '-' or '_' is compared (ASCII case-insensitively), so unrelated ISO 639
+// codes that merely begin with ja/zh/ko, such as "jav", "kok", or "zha", are not misclassified as CJK.
+inline bool utf8LanguageTagIsCjk(const std::string& languageTag) {
+  std::string primary = languageTag.substr(0, languageTag.find_first_of("-_"));
+  for (char& ch : primary) {
+    if (ch >= 'A' && ch <= 'Z') ch = static_cast<char>(ch - 'A' + 'a');
+  }
+  // ISO 639-1 codes first, then ISO 639-2 codes ("zho" and "chi" both denote Chinese).
+  if (primary == "ja" || primary == "zh" || primary == "ko") return true;
+  return primary == "jpn" || primary == "zho" || primary == "chi" || primary == "kor";
+}
+
 // Returns true for Unicode combining diacritical marks that should not advance the cursor. inline bool utf8IsCombiningMark(const uint32_t cp) { return (cp >= 0x0300 && cp <= 0x036F) // Combining Diacritical Marks diff --git a/src/activities/reader/EpubReaderActivity.cpp b/src/activities/reader/EpubReaderActivity.cpp index c29c259608..90d584bece 100644 --- a/src/activities/reader/EpubReaderActivity.cpp +++ b/src/activities/reader/EpubReaderActivity.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -301,6 +302,10 @@ bool releaseReaderSdFontCachesForLowMemory(const GfxRenderer& renderer, const ch return true; } +bool shouldUseCjkLowMemoryRendering(const Epub* epub, const GfxRenderer& renderer, const int fontId) { + return epub && renderer.isSdCardFont(fontId) && utf8LanguageTagIsCjk(epub->getLanguage()); +} + int clampPercent(int percent) { if (percent < 0) { return 0; @@ -2842,11 +2847,16 @@ void EpubReaderActivity::renderContents(std::unique_ptr page, const int fo const bool pageHasImages = page->hasImages(); const bool foregroundBlack = ReaderUtils::readerForegroundBlack(); + const bool cjkLowMemoryRendering = shouldUseCjkLowMemoryRendering(epub.get(), renderer, fontId); const bool needsImageGrayscale = pageHasImages; - const bool needsTextGrayscale = SETTINGS.textAntiAliasing &&
foregroundBlack; + const bool needsTextGrayscale = SETTINGS.textAntiAliasing && foregroundBlack && !cjkLowMemoryRendering; const bool needsAnyGrayscale = needsTextGrayscale || needsImageGrayscale; const int contentBottom = renderer.getScreenHeight() - orientedMarginBottom; + if (cjkLowMemoryRendering && SETTINGS.textAntiAliasing && foregroundBlack) { + LOG_DBG("ERS", "CJK SD-font page render: skipping text anti-alias grayscale to preserve heap"); + } + const auto finalizeBufferComposition = [&]() { drawPublisherPageMarkers(renderer, *page, orientedMarginTop, contentBottom, foregroundBlack); }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 31ce26bfb1..d5a93fe518 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,3 +42,4 @@ add_subdirectory(streaming_json_parser) add_subdirectory(release_json_parser) add_subdirectory(differential_rounding) add_subdirectory(hyphenation_eval) +add_subdirectory(utf8_cjk) diff --git a/test/utf8_cjk/CMakeLists.txt b/test/utf8_cjk/CMakeLists.txt new file mode 100644 index 0000000000..8ab1b760fe --- /dev/null +++ b/test/utf8_cjk/CMakeLists.txt @@ -0,0 +1,15 @@ +add_executable(Utf8CjkTest + Utf8CjkTest.cpp + ${REPO_ROOT}/lib/Utf8/Utf8.cpp +) + +target_include_directories(Utf8CjkTest PRIVATE + ${REPO_ROOT}/lib/Utf8 +) + +target_link_libraries(Utf8CjkTest PRIVATE + crosspoint_test_common + GTest::gtest_main +) + +gtest_discover_tests(Utf8CjkTest) diff --git a/test/utf8_cjk/Utf8CjkTest.cpp b/test/utf8_cjk/Utf8CjkTest.cpp new file mode 100644 index 0000000000..7b5e06b4be --- /dev/null +++ b/test/utf8_cjk/Utf8CjkTest.cpp @@ -0,0 +1,37 @@ +#include +#include + +TEST(Utf8CjkTest, DetectsCjkBreakableRanges) { + EXPECT_TRUE(utf8IsCjkBreakable(0x4E16)); // CJK unified ideograph. + EXPECT_TRUE(utf8IsCjkBreakable(0x3042)); // Hiragana. + EXPECT_TRUE(utf8IsCjkBreakable(0xAC00)); // Hangul syllable. + EXPECT_TRUE(utf8IsCjkBreakable(0x20000)); // CJK Extension B. 
+ + EXPECT_FALSE(utf8IsCjkBreakable('A')); + EXPECT_FALSE(utf8IsCjkBreakable(0x03BB)); // Greek lambda. +} + +TEST(Utf8CjkTest, ClassifiesPunctuationForLineBreakRules) { + EXPECT_TRUE(utf8IsCjkOpeningPunctuation(0x300C)); // left corner bracket. + EXPECT_TRUE(utf8IsCjkOpeningPunctuation(0xFF08)); // fullwidth left parenthesis. + EXPECT_FALSE(utf8IsCjkOpeningPunctuation(0x300D)); + + EXPECT_TRUE(utf8IsCjkClosingPunctuation(0x300D)); // right corner bracket. + EXPECT_TRUE(utf8IsCjkClosingPunctuation(0x3001)); // ideographic comma. + EXPECT_TRUE(utf8IsCjkClosingPunctuation(0xFF1F)); // fullwidth question mark. + EXPECT_FALSE(utf8IsCjkClosingPunctuation(0x300C)); +} + +TEST(Utf8CjkTest, DetectsCjkLanguageTags) { + EXPECT_TRUE(utf8LanguageTagIsCjk("ja")); + EXPECT_TRUE(utf8LanguageTagIsCjk("jpn")); + EXPECT_TRUE(utf8LanguageTagIsCjk("zh-Hans")); + EXPECT_TRUE(utf8LanguageTagIsCjk("zho")); + EXPECT_TRUE(utf8LanguageTagIsCjk("chi")); + EXPECT_TRUE(utf8LanguageTagIsCjk("KO")); + EXPECT_TRUE(utf8LanguageTagIsCjk("kor")); + + EXPECT_FALSE(utf8LanguageTagIsCjk("")); + EXPECT_FALSE(utf8LanguageTagIsCjk("en")); + EXPECT_FALSE(utf8LanguageTagIsCjk("de-DE")); +}