Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 184 additions & 47 deletions lib/KOReaderSync/ProgressMapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ int parseIndex(const std::string& xpath, const char* prefix, bool last = false)

int parseCharOffset(const std::string& xpath) {
const size_t textPos = xpath.rfind("text()");
if (textPos == std::string::npos) return 0;
const size_t dotPos = xpath.find('.', textPos);
const size_t dotPos = (textPos != std::string::npos) ? xpath.find('.', textPos) : xpath.rfind('.');
if (dotPos == std::string::npos || dotPos + 1 >= xpath.size()) return 0;
int val = 0;
for (size_t i = dotPos + 1; i < xpath.size(); i++) {
Expand Down Expand Up @@ -104,7 +103,13 @@ bool isChapterStartXPath(const std::string& xpath) {
if (dotPos == std::string::npos || dotPos <= bodyContentStart || dotPos + 1 >= xpath.size()) {
return false;
}
if (xpath.find('/', bodyContentStart) != std::string::npos) {
size_t terminalEnd = dotPos;
static constexpr char kTextNode[] = "/text()";
const size_t textNodePos = xpath.rfind(kTextNode, dotPos);
if (textNodePos != std::string::npos && textNodePos >= bodyContentStart) {
terminalEnd = textNodePos;
}
if (xpath.find('/', bodyContentStart) < terminalEnd) {
return false;
}

Expand All @@ -122,7 +127,7 @@ struct XPathStep {

static constexpr int MAX_XPATH_DEPTH = 16;

// Parse the XPath segment between /body/DocFragment[N]/body/ and text()[N].offset
// Parse the XPath segment between /body/DocFragment[N]/body/ and the terminal position
// into an ordered sequence of steps. Returns step count, 0 on failure.
// Example input: "/body/DocFragment[1]/body/div[1]/ul/li[4]/text()[1].51"
// Fills steps with: {div,1}, {ul,1}, {li,4}
Expand All @@ -136,13 +141,20 @@ int parseXPathSteps(const std::string& xpath, XPathStep steps[MAX_XPATH_DEPTH])
if (xpath.compare(afterBracket + 1, strlen(kBody), kBody) != 0) return 0;
size_t pos = afterBracket + 1 + strlen(kBody);

const size_t textPos = xpath.rfind("/text()");
if (textPos == std::string::npos || textPos <= pos) return 0;
size_t stepsEnd = xpath.rfind("/text()");
if (stepsEnd == std::string::npos) {
stepsEnd = xpath.rfind('.');
if (stepsEnd == std::string::npos || stepsEnd <= pos || stepsEnd + 1 >= xpath.size()) return 0;
for (size_t i = stepsEnd + 1; i < xpath.size(); i++) {
if (xpath[i] < '0' || xpath[i] > '9') return 0;
}
}
if (stepsEnd <= pos) return 0;

int count = 0;
while (pos < textPos && count < MAX_XPATH_DEPTH) {
while (pos < stepsEnd && count < MAX_XPATH_DEPTH) {
const size_t slash = xpath.find('/', pos);
const size_t segEnd = (slash < textPos) ? slash : textPos;
const size_t segEnd = (slash < stepsEnd) ? slash : stepsEnd;

XPathStep& step = steps[count];
const size_t bracket = xpath.find('[', pos);
Expand All @@ -166,7 +178,7 @@ int parseXPathSteps(const std::string& xpath, XPathStep steps[MAX_XPATH_DEPTH])
}

count++;
pos = (slash < textPos) ? slash + 1 : textPos;
pos = (slash < stepsEnd) ? slash + 1 : stepsEnd;
}
return count;
}
Expand Down Expand Up @@ -223,10 +235,148 @@ class ParagraphStreamer final : public Print {
char capturedAnchorId[MAX_ANCHOR_ID] = {};
int capturedAnchorIdLen = 0;
bool capturingAnchorTag = false;
enum IdScanState { ID_SCAN, ID_I, ID_D, ID_EQ, ID_IN_VALUE_D, ID_IN_VALUE_S } idState = ID_SCAN;
// States of the per-byte attribute scanner that extracts the value of an `id` attribute
// from the tag currently being parsed (driven by scanAnchorAttribute).
enum AnchorAttrState {
// Between attributes: waiting for the first attribute-name character.
ATTR_FIND_NAME,
// Inside an attribute name: consuming name characters.
ATTR_READ_NAME,
// Whitespace followed the name: waiting for '=' or the start of another attribute name.
ATTR_AFTER_NAME,
// '=' was seen: waiting for the opening quote of the value.
ATTR_BEFORE_VALUE,
// Inside a double-quoted value; ends at the next '"'.
ATTR_CAPTURE_D,
// Inside a single-quoted value; ends at the next '\''.
ATTR_CAPTURE_S
} attrState = ATTR_FIND_NAME;
// Count of characters consumed so far in the attribute name being read.
uint8_t attrNameLen = 0;
// True while the attribute name read so far can still be (or has been confirmed to be) exactly "id".
bool currentAttrIsId = false;
bool inAttrQuote =
false; // true while inside a quoted attribute value (prevents '/' from being treated as self-close)
char attrQuoteChar = 0;
uint8_t nonVisibleDepth = 0;

// Reports whether the tag name held in tagName is one of head, style, script or title.
// The comparison ignores ASCII case.
bool isNonVisibleTag() const {
  static constexpr const char* kHiddenTags[] = {"head", "style", "script", "title"};
  for (const char* hidden : kHiddenTags) {
    if (strcasecmp(tagName, hidden) == 0) {
      return true;
    }
  }
  return false;
}

// True for the four bytes treated as whitespace between attribute tokens:
// space, horizontal tab, line feed and carriage return.
static bool isAttrWhitespace(uint8_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}

// True when c may appear in an attribute name as recognised by this scanner:
// ASCII letters, ASCII digits, and the punctuation '_', '-', ':' and '.'.
static bool isAttrNameChar(uint8_t c) {
  switch (c) {
    case '_':
    case '-':
    case ':':
    case '.':
      return true;
    default:
      break;
  }
  const bool isDigit = c >= '0' && c <= '9';
  const bool isUpper = c >= 'A' && c <= 'Z';
  const bool isLower = c >= 'a' && c <= 'z';
  return isDigit || isUpper || isLower;
}

// Returns the attribute scanner to its idle state: no name in progress, nothing matched,
// waiting for the next attribute name.
void resetAnchorAttrScan() {
  currentAttrIsId = false;
  attrNameLen = 0;
  attrState = ATTR_FIND_NAME;
}

// Called when the closing quote of an id value is reached: stops scanning this tag,
// NUL-terminates the captured bytes and returns the attribute scanner to idle.
void finishCapturedAnchorId() {
  capturingAnchorTag = false;
  capturedAnchorId[capturedAnchorIdLen] = '\0';
  resetAnchorAttrScan();
}

// Arms id capture: the attribute scanner starts from idle and capturingAnchorTag is raised
// so that subsequent attribute bytes are fed to it.
void beginAnchorIdScan() {
  resetAnchorAttrScan();
  capturingAnchorTag = true;
}

// Stops id capture. If the scan was still active (no id value was completed), any partially
// captured bytes are discarded by zeroing capturedAnchorIdLen; an already finished id is kept.
void endAnchorIdScan() {
  const bool scanWasActive = capturingAnchorTag;
  capturingAnchorTag = false;
  resetAnchorAttrScan();
  if (scanWasActive) {
    capturedAnchorIdLen = 0;
  }
}

// Appends one byte of the id value to capturedAnchorId. Bytes that would leave no room for
// the terminating NUL (buffer size MAX_ANCHOR_ID) are silently dropped.
void appendCapturedAnchorId(uint8_t c) {
  const bool bufferFull = capturedAnchorIdLen + 1 >= MAX_ANCHOR_ID;
  if (bufferFull) {
    return;
  }
  capturedAnchorId[capturedAnchorIdLen] = c;
  capturedAnchorIdLen++;
}

// Feeds one byte from a tag's attribute area into the id-attribute scanner.
//
// The scanner tracks attribute names and quoted values. Only an attribute whose name is exactly
// the two lowercase characters "id" has its quoted value copied into capturedAnchorId; values of
// other attributes are skipped up to their closing quote. Anything unexpected (for example an
// unquoted value) returns the scanner to idle.
void scanAnchorAttribute(uint8_t c) {
  switch (attrState) {
    case ATTR_FIND_NAME: {
      if (!isAttrNameChar(c)) {
        break;
      }
      // First character of a new attribute name.
      attrState = ATTR_READ_NAME;
      attrNameLen = 1;
      currentAttrIsId = (c == 'i');
      break;
    }

    case ATTR_READ_NAME: {
      if (isAttrNameChar(c)) {
        // Only a 'd' in second position keeps the "id" candidate alive; any further
        // name character disqualifies it.
        currentAttrIsId = currentAttrIsId && attrNameLen == 1 && c == 'd';
        attrNameLen++;
        break;
      }
      // Name ended: it is "id" only if it matched so far and is exactly two characters long.
      currentAttrIsId = currentAttrIsId && attrNameLen == 2;
      if (c == '=') {
        attrState = ATTR_BEFORE_VALUE;
      } else if (isAttrWhitespace(c)) {
        attrState = ATTR_AFTER_NAME;
      } else {
        resetAnchorAttrScan();
      }
      break;
    }

    case ATTR_AFTER_NAME: {
      if (isAttrWhitespace(c)) {
        break;
      }
      if (c == '=') {
        attrState = ATTR_BEFORE_VALUE;
        break;
      }
      if (!isAttrNameChar(c)) {
        resetAnchorAttrScan();
        break;
      }
      // The previous attribute had no value; this byte starts the next name.
      attrState = ATTR_READ_NAME;
      attrNameLen = 1;
      currentAttrIsId = (c == 'i');
      break;
    }

    case ATTR_BEFORE_VALUE: {
      if (isAttrWhitespace(c)) {
        break;
      }
      const bool opensDouble = (c == '"');
      const bool opensSingle = (c == '\'');
      if (!opensDouble && !opensSingle) {
        resetAnchorAttrScan();
        break;
      }
      if (currentAttrIsId) {
        // Start the capture buffer afresh for this id value.
        capturedAnchorIdLen = 0;
      }
      attrState = opensDouble ? ATTR_CAPTURE_D : ATTR_CAPTURE_S;
      break;
    }

    case ATTR_CAPTURE_D:
    case ATTR_CAPTURE_S: {
      const uint8_t closingQuote = (attrState == ATTR_CAPTURE_D) ? '"' : '\'';
      if (c != closingQuote) {
        if (currentAttrIsId) {
          appendCapturedAnchorId(c);
        }
        break;
      }
      if (currentAttrIsId) {
        finishCapturedAnchorId();
      } else {
        resetAnchorAttrScan();
      }
      break;
    }
  }
}

void onVisibleCodepoint() {
totalVisChars++;
Expand Down Expand Up @@ -286,15 +436,19 @@ class ParagraphStreamer final : public Print {
void onOpenTag() {
htmlDepth++;

if (nonVisibleDepth > 0 || isNonVisibleTag()) {
nonVisibleDepth++;
return;
}

if (stepCount == 0) {
if (strcasecmp(tagName, "p") == 0) onLegacyP();
return;
}

// Capture <a id> inside the fully-matched element even after target char is found
// Capture a child <a id> inside the fully-matched element even after target char is found.
if (revPFound && matchedDepth == stepCount && capturedAnchorIdLen == 0 && strcasecmp(tagName, "a") == 0) {
capturingAnchorTag = true;
idState = ID_SCAN;
beginAnchorIdScan();
}

if (revDone) return;
Expand All @@ -315,6 +469,7 @@ class ParagraphStreamer final : public Print {
stepEnteredAtDepth[matchedDepth] = htmlDepth;
matchedDepth++;
if (matchedDepth == stepCount) {
beginAnchorIdScan();
paragraphAtMatch = pCount;
liCountAtMatch = liCount;
revPFound = true;
Expand All @@ -332,6 +487,12 @@ class ParagraphStreamer final : public Print {
}

void onCloseTag() {
if (nonVisibleDepth > 0) {
nonVisibleDepth--;
if (htmlDepth > 0) htmlDepth--;
return;
}

// Legacy mode: each direct child element closing advances the text node index.
if (stepCount == 0 && revPFound && !revDone && paragraphHtmlDepth >= 0 && htmlDepth == paragraphHtmlDepth + 1) {
currentTextNode++;
Expand Down Expand Up @@ -419,42 +580,12 @@ class ParagraphStreamer final : public Print {
attrQuoteChar = 0;
}
if (capturingAnchorTag) {
switch (idState) {
case ID_SCAN:
idState = (c == 'i' || c == 'I') ? ID_I : ID_SCAN;
break;
case ID_I:
idState = (c == 'd' || c == 'D') ? ID_D : ID_SCAN;
break;
case ID_D:
idState = (c == '=') ? ID_EQ : ID_SCAN;
break;
case ID_EQ:
if (c == '"')
idState = ID_IN_VALUE_D;
else if (c == '\'')
idState = ID_IN_VALUE_S;
break;
case ID_IN_VALUE_D:
if (c == '"') {
capturedAnchorId[capturedAnchorIdLen] = '\0';
capturingAnchorTag = false;
} else if (capturedAnchorIdLen + 1 < MAX_ANCHOR_ID)
capturedAnchorId[capturedAnchorIdLen++] = c;
break;
case ID_IN_VALUE_S:
if (c == '\'') {
capturedAnchorId[capturedAnchorIdLen] = '\0';
capturingAnchorTag = false;
} else if (capturedAnchorIdLen + 1 < MAX_ANCHOR_ID)
capturedAnchorId[capturedAnchorIdLen++] = c;
break;
}
scanAnchorAttribute(c);
}
// Only treat '/' as self-closing when outside a quoted attribute value.
if (c == '/' && !inAttrQuote) {
endAnchorIdScan();
onCloseTag();
capturingAnchorTag = false;
}
break;
}
Expand Down Expand Up @@ -512,10 +643,13 @@ class ParagraphStreamer final : public Print {
tagNameLen = 0;
tagIsClose = false;
capturingAnchorTag = false;
idState = ID_SCAN;
resetAnchorAttrScan();
inAttrQuote = false;
attrQuoteChar = 0;
} else if (c == '>') {
if (tagState == TAG_ATTRS) {
endAnchorIdScan();
}
globalInTag = false;
inAttrQuote = false;
if (tagState == TAG_IN_NAME && tagNameLen > 0) {
Expand All @@ -529,6 +663,9 @@ class ParagraphStreamer final : public Print {
tagState = TAG_IDLE;
} else if (globalInTag) {
processByteInTag(c);
} else if (nonVisibleDepth > 0) {
// Ignore head/style/script/title text. KOReader XPaths are body-relative, and CSS text
// should not contribute to intra-spine progress.
Comment on lines +666 to +668

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Stop parsing script text as markup

When a script or style block contains a literal < (for example if (a < b)), this ignore branch is never reached because < is handled earlier as a tag opener. The later real </script>/</style> can then be swallowed as attributes of that bogus tag, leaving nonVisibleDepth nonzero and causing all following body text to be skipped, so KOReader progress mapping falls back or resolves near the start for EPUBs with inline scripts/styles. While inside these non-visible raw-text elements, bytes need to be ignored until the actual closing tag rather than fed through the normal tag parser.

Useful? React with 👍 / 👎.

} else {
if (c == '&') {
globalInEntity = true;
Expand Down Expand Up @@ -762,4 +899,4 @@ std::string ProgressMapper::generateXPath(const std::shared_ptr<Epub>& epub, int

const int p = s.paragraphCount();
return (p > 0) ? base + "/p[" + std::to_string(p) + "]" : base;
}
}
Loading