diff --git a/lib/KOReaderSync/ProgressMapper.cpp b/lib/KOReaderSync/ProgressMapper.cpp index 146d5abe5d..2f1ab97f35 100644 --- a/lib/KOReaderSync/ProgressMapper.cpp +++ b/lib/KOReaderSync/ProgressMapper.cpp @@ -30,8 +30,7 @@ int parseIndex(const std::string& xpath, const char* prefix, bool last = false) int parseCharOffset(const std::string& xpath) { const size_t textPos = xpath.rfind("text()"); - if (textPos == std::string::npos) return 0; - const size_t dotPos = xpath.find('.', textPos); + const size_t dotPos = (textPos != std::string::npos) ? xpath.find('.', textPos) : xpath.rfind('.'); if (dotPos == std::string::npos || dotPos + 1 >= xpath.size()) return 0; int val = 0; for (size_t i = dotPos + 1; i < xpath.size(); i++) { @@ -104,7 +103,13 @@ bool isChapterStartXPath(const std::string& xpath) { if (dotPos == std::string::npos || dotPos <= bodyContentStart || dotPos + 1 >= xpath.size()) { return false; } - if (xpath.find('/', bodyContentStart) != std::string::npos) { + size_t terminalEnd = dotPos; + static constexpr char kTextNode[] = "/text()"; + const size_t textNodePos = xpath.rfind(kTextNode, dotPos); + if (textNodePos != std::string::npos && textNodePos >= bodyContentStart) { + terminalEnd = textNodePos; + } + if (xpath.find('/', bodyContentStart) < terminalEnd) { return false; } @@ -122,7 +127,7 @@ struct XPathStep { static constexpr int MAX_XPATH_DEPTH = 16; -// Parse the XPath segment between /body/DocFragment[N]/body/ and text()[N].offset +// Parse the XPath segment between /body/DocFragment[N]/body/ and the terminal position // into an ordered sequence of steps. Returns step count, 0 on failure. // Example input: "/body/DocFragment[1]/body/div[1]/ul/li[4]/text()[1].51" // Fills steps with: {div,1}, {ul,1}, {li,4} @@ -136,13 +141,20 @@ int parseXPathSteps(const std::string& xpath, XPathStep steps[MAX_XPATH_DEPTH]) if (xpath.compare(afterBracket + 1, strlen(kBody), kBody) != 0) return 0; size_t pos = afterBracket + 1 + strlen(kBody); - const size_t textPos = xpath.rfind("/text()"); - if (textPos == std::string::npos || textPos <= pos) return 0; + size_t stepsEnd = xpath.rfind("/text()"); + if (stepsEnd == std::string::npos) { + stepsEnd = xpath.rfind('.'); + if (stepsEnd == std::string::npos || stepsEnd <= pos || stepsEnd + 1 >= xpath.size()) return 0; + for (size_t i = stepsEnd + 1; i < xpath.size(); i++) { + if (xpath[i] < '0' || xpath[i] > '9') return 0; + } + } + if (stepsEnd <= pos) return 0; int count = 0; - while (pos < textPos && count < MAX_XPATH_DEPTH) { + while (pos < stepsEnd && count < MAX_XPATH_DEPTH) { const size_t slash = xpath.find('/', pos); - const size_t segEnd = (slash < textPos) ? slash : textPos; + const size_t segEnd = (slash < stepsEnd) ? slash : stepsEnd; XPathStep& step = steps[count]; const size_t bracket = xpath.find('[', pos); @@ -166,7 +178,7 @@ int parseXPathSteps(const std::string& xpath, XPathStep steps[MAX_XPATH_DEPTH]) } count++; - pos = (slash < textPos) ? slash + 1 : textPos; + pos = (slash < stepsEnd) ? slash + 1 : stepsEnd; } return count; } @@ -223,10 +235,148 @@ class ParagraphStreamer final : public Print { char capturedAnchorId[MAX_ANCHOR_ID] = {}; int capturedAnchorIdLen = 0; bool capturingAnchorTag = false; - enum IdScanState { ID_SCAN, ID_I, ID_D, ID_EQ, ID_IN_VALUE_D, ID_IN_VALUE_S } idState = ID_SCAN; + enum AnchorAttrState { + ATTR_FIND_NAME, + ATTR_READ_NAME, + ATTR_AFTER_NAME, + ATTR_BEFORE_VALUE, + ATTR_CAPTURE_D, + ATTR_CAPTURE_S + } attrState = ATTR_FIND_NAME; + uint8_t attrNameLen = 0; + bool currentAttrIsId = false; bool inAttrQuote = false; // true while inside a quoted attribute value (prevents '/' from being treated as self-close) char attrQuoteChar = 0; + uint8_t nonVisibleDepth = 0; + + bool isNonVisibleTag() const { + return strcasecmp(tagName, "head") == 0 || strcasecmp(tagName, "style") == 0 || + strcasecmp(tagName, "script") == 0 || strcasecmp(tagName, "title") == 0; + } + + static bool isAttrWhitespace(uint8_t c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } + + static bool isAttrNameChar(uint8_t c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || + c == ':' || c == '.'; + } + + void resetAnchorAttrScan() { + attrState = ATTR_FIND_NAME; + attrNameLen = 0; + currentAttrIsId = false; + } + + void finishCapturedAnchorId() { + capturedAnchorId[capturedAnchorIdLen] = '\0'; + capturingAnchorTag = false; + resetAnchorAttrScan(); + } + + void beginAnchorIdScan() { + capturingAnchorTag = true; + resetAnchorAttrScan(); + } + + void endAnchorIdScan() { + if (capturingAnchorTag) { + capturedAnchorIdLen = 0; + } + capturingAnchorTag = false; + resetAnchorAttrScan(); + } + + void appendCapturedAnchorId(uint8_t c) { + if (capturedAnchorIdLen + 1 < MAX_ANCHOR_ID) { + capturedAnchorId[capturedAnchorIdLen++] = c; + } + } + + void scanAnchorAttribute(uint8_t c) { + switch (attrState) { + case ATTR_FIND_NAME: + if (isAttrNameChar(c)) { + attrState = ATTR_READ_NAME; + attrNameLen = 1; + currentAttrIsId = c == 'i'; + } + break; + case ATTR_READ_NAME: + if (isAttrNameChar(c)) { + if (attrNameLen == 1) { + currentAttrIsId = currentAttrIsId && c == 'd'; + } else { + currentAttrIsId = false; + } + attrNameLen++; + } else { + currentAttrIsId = currentAttrIsId && attrNameLen == 2; + if (isAttrWhitespace(c)) { + attrState = ATTR_AFTER_NAME; + } else if (c == '=') { + attrState = ATTR_BEFORE_VALUE; + } else { + resetAnchorAttrScan(); + } + } + break; + case ATTR_AFTER_NAME: + if (isAttrWhitespace(c)) { + break; + } + if (c == '=') { + attrState = ATTR_BEFORE_VALUE; + } else if (isAttrNameChar(c)) { + attrState = ATTR_READ_NAME; + attrNameLen = 1; + currentAttrIsId = c == 'i'; + } else { + resetAnchorAttrScan(); + } + break; + case ATTR_BEFORE_VALUE: + if (isAttrWhitespace(c)) { + break; + } + if (currentAttrIsId && c == '"') { + capturedAnchorIdLen = 0; + attrState = ATTR_CAPTURE_D; + } else if (currentAttrIsId && c == '\'') { + capturedAnchorIdLen = 0; + attrState = ATTR_CAPTURE_S; + } else if (c == '"') { + attrState = ATTR_CAPTURE_D; + } else if (c == '\'') { + attrState = ATTR_CAPTURE_S; + } else { + resetAnchorAttrScan(); + } + break; + case ATTR_CAPTURE_D: + if (c == '"') { + if (currentAttrIsId) { + finishCapturedAnchorId(); + } else { + resetAnchorAttrScan(); + } + } else if (currentAttrIsId) { + appendCapturedAnchorId(c); + } + break; + case ATTR_CAPTURE_S: + if (c == '\'') { + if (currentAttrIsId) { + finishCapturedAnchorId(); + } else { + resetAnchorAttrScan(); + } + } else if (currentAttrIsId) { + appendCapturedAnchorId(c); + } + break; + } + } void onVisibleCodepoint() { totalVisChars++; @@ -286,15 +436,19 @@ class ParagraphStreamer final : public Print { void onOpenTag() { htmlDepth++; + if (nonVisibleDepth > 0 || isNonVisibleTag()) { + nonVisibleDepth++; + return; + } + if (stepCount == 0) { if (strcasecmp(tagName, "p") == 0) onLegacyP(); return; } - // Capture inside the fully-matched element even after target char is found + // Capture a child inside the fully-matched element even after target char is found. if (revPFound && matchedDepth == stepCount && capturedAnchorIdLen == 0 && strcasecmp(tagName, "a") == 0) { - capturingAnchorTag = true; - idState = ID_SCAN; + beginAnchorIdScan(); } if (revDone) return; @@ -315,6 +469,7 @@ class ParagraphStreamer final : public Print { stepEnteredAtDepth[matchedDepth] = htmlDepth; matchedDepth++; if (matchedDepth == stepCount) { + beginAnchorIdScan(); paragraphAtMatch = pCount; liCountAtMatch = liCount; revPFound = true; @@ -332,6 +487,12 @@ class ParagraphStreamer final : public Print { } void onCloseTag() { + if (nonVisibleDepth > 0) { + nonVisibleDepth--; + if (htmlDepth > 0) htmlDepth--; + return; + } + // Legacy mode: each direct child element closing advances the text node index. if (stepCount == 0 && revPFound && !revDone && paragraphHtmlDepth >= 0 && htmlDepth == paragraphHtmlDepth + 1) { currentTextNode++; @@ -419,42 +580,12 @@ class ParagraphStreamer final : public Print { attrQuoteChar = 0; } if (capturingAnchorTag) { - switch (idState) { - case ID_SCAN: - idState = (c == 'i' || c == 'I') ? ID_I : ID_SCAN; - break; - case ID_I: - idState = (c == 'd' || c == 'D') ? ID_D : ID_SCAN; - break; - case ID_D: - idState = (c == '=') ? ID_EQ : ID_SCAN; - break; - case ID_EQ: - if (c == '"') - idState = ID_IN_VALUE_D; - else if (c == '\'') - idState = ID_IN_VALUE_S; - break; - case ID_IN_VALUE_D: - if (c == '"') { - capturedAnchorId[capturedAnchorIdLen] = '\0'; - capturingAnchorTag = false; - } else if (capturedAnchorIdLen + 1 < MAX_ANCHOR_ID) - capturedAnchorId[capturedAnchorIdLen++] = c; - break; - case ID_IN_VALUE_S: - if (c == '\'') { - capturedAnchorId[capturedAnchorIdLen] = '\0'; - capturingAnchorTag = false; - } else if (capturedAnchorIdLen + 1 < MAX_ANCHOR_ID) - capturedAnchorId[capturedAnchorIdLen++] = c; - break; - } + scanAnchorAttribute(c); } // Only treat '/' as self-closing when outside a quoted attribute value. if (c == '/' && !inAttrQuote) { + endAnchorIdScan(); onCloseTag(); - capturingAnchorTag = false; } break; } @@ -512,10 +643,13 @@ class ParagraphStreamer final : public Print { tagNameLen = 0; tagIsClose = false; capturingAnchorTag = false; - idState = ID_SCAN; + resetAnchorAttrScan(); inAttrQuote = false; attrQuoteChar = 0; } else if (c == '>') { + if (tagState == TAG_ATTRS) { + endAnchorIdScan(); + } globalInTag = false; inAttrQuote = false; if (tagState == TAG_IN_NAME && tagNameLen > 0) { @@ -529,6 +663,9 @@ class ParagraphStreamer final : public Print { tagState = TAG_IDLE; } else if (globalInTag) { processByteInTag(c); + } else if (nonVisibleDepth > 0) { + // Ignore head/style/script/title text. KOReader XPaths are body-relative, and CSS text + // should not contribute to intra-spine progress. } else { if (c == '&') { globalInEntity = true; @@ -762,4 +899,4 @@ std::string ProgressMapper::generateXPath(const std::shared_ptr& epub, int const int p = s.paragraphCount(); return (p > 0) ? base + "/p[" + std::to_string(p) + "]" : base; -} \ No newline at end of file +}