Stage 295: PR #1637 — protect raw pre from glued-bold lift (closes #1451) by @Michaelyklam

This commit is contained in:
test
2026-05-04 18:26:20 +00:00
4 changed files with 23 additions and 10 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

+3 -1
View File
@@ -1665,7 +1665,6 @@ function renderMd(raw){
s=s.replace(/<i>([\s\S]*?)<\/i>/gi,(_,t)=>'*'+t+'*');
s=s.replace(/<code>([^<]*?)<\/code>/gi,(_,t)=>'`'+t+'`');
s=s.replace(/<br\s*\/?>/gi,'\n');
s=s.replace(/\x00R(\d+)\x00/g,(_,i)=>rawPreStash[+i]);
// ── Glued-bold-heading lift (issue #1446) ────────────────────────────────
// LLMs in thinking/reasoning mode frequently emit a "section header" glued
// to the end of the previous paragraph with no whitespace, like:
@@ -1797,6 +1796,9 @@ function renderMd(raw){
s=s.replace(/(<a\b[^>]*>[\s\S]*?<\/a>)/g,m=>{_a_stash.push(m);return `\x00A${_a_stash.length-1}\x00`;});
s=s.replace(/\[([^\]]+)\]\((https?:\/\/[^\)]+)\)/g,(_,label,url)=>`<a href="${url.replace(/"/g,'%22')}" target="_blank" rel="noopener">${esc(label)}</a>`);
s=s.replace(/\x00A(\d+)\x00/g,(_,i)=>_a_stash[+i]);
// Restore raw <pre> only after markdown rewrites so literal preformatted
// content stays placeholder-protected, then let the sanitizer normalize tags.
s=s.replace(/\x00R(\d+)\x00/g,(_,i)=>rawPreStash[+i]);
// Sanitize any remaining HTML tags. The renderer intentionally returns
// HTML and inserts it with innerHTML later, so tag names alone are not enough:
// raw/model-provided HTML like <img onerror=...> or <a href="javascript:...">
+17 -6
View File
@@ -153,20 +153,21 @@ def test_chain_of_glued_headings_all_lifted():
def test_lift_pass_present_in_ui_js_at_correct_position():
"""The lift regex must be present in ui.js, between rawPreStash restore and fence_stash restore.
"""The lift regex must be present in ui.js before protected-code restores.
This pins the position so a future cleanup can't accidentally move the lift
to a place where it would corrupt fenced code blocks (which are stashed as
\\x00P / \\x00F tokens at this point and don't match the lift regex).
to a place where it would corrupt raw <pre> HTML or fenced code blocks
(which are stashed as \\x00R / \\x00P / \\x00F tokens at this point and don't
match the lift regex).
"""
lift_idx = UI_JS.find(r'(/([.!?])\*\*([^*\n]{1,80})\*\*\n\n/g')
assert lift_idx > 0, "Glued-bold-heading lift regex not found in static/ui.js"
raw_pre_restore = UI_JS.find("rawPreStash[+i]")
fence_restore = UI_JS.find("fence_stash[+i]")
assert raw_pre_restore > 0 and fence_restore > 0, "stash restore landmarks missing"
assert raw_pre_restore < lift_idx < fence_restore, (
"Glued-bold lift must sit between rawPreStash restore and fence_stash restore "
"so fenced code is protected. Current ordering broken."
assert lift_idx < raw_pre_restore and lift_idx < fence_restore, (
"Glued-bold lift must run before rawPreStash and fence_stash restore "
"so raw <pre> and fenced code are protected. Current ordering broken."
)
@@ -254,6 +255,16 @@ def test_real_renderer_protects_fenced_code(driver_path):
assert "**inside-code**" in out, out
@pytest.mark.skipif(NODE is None, reason="node not on PATH")
def test_real_renderer_protects_raw_pre_html(driver_path):
    """A raw <pre> block holding the glued-bold trigger must render unchanged.

    Feeds the renderer a literal <pre> element whose text contains the
    ``.**Heading**`` + blank-line pattern and checks that the block comes
    back verbatim: no paragraph break is inserted before the bold marker
    and the marker is not converted to <strong>.
    """
    rendered = _render(driver_path, "<pre>Para text.**Heading**\n\nNext.</pre>\n")

    # The original block must appear in the output exactly as written.
    untouched_block = "<pre>Para text.**Heading**\n\nNext.</pre>"
    assert untouched_block in rendered, rendered

    # Neither the lifted form nor a rendered bold heading may show up.
    lifted_block = "<pre>Para text.\n\n**Heading**\n\nNext.</pre>"
    assert lifted_block not in rendered, rendered
    assert "<strong>Heading</strong>" not in rendered, rendered
@pytest.mark.skipif(NODE is None, reason="node not on PATH")
def test_real_renderer_protects_inline_code(driver_path):
"""Glued pattern inside inline backticks must stay literal."""
+3 -3
View File
@@ -69,9 +69,9 @@ def render_md(raw):
s = re.sub(r"<i>([\s\S]*?)</i>", lambda m: "*" + m.group(1) + "*", s, flags=re.I)
s = re.sub(r"<code>([^<]*?)</code>", lambda m: "`" + m.group(1) + "`", s, flags=re.I)
s = re.sub(r"<br\s*/?>", "\n", s, flags=re.I)
# Glued-bold-heading lift (issue #1446) — must mirror static/ui.js position:
# after raw <pre> restore, before fence_stash restore. Lifts a sentence-glued
# bold "stub heading" out into its own paragraph when followed by a blank line.
# Glued-bold-heading lift (issue #1446) — must mirror static/ui.js behavior:
# protected code/pre placeholders stay hidden while a sentence-glued bold
# "stub heading" is lifted into its own paragraph when followed by a blank line.
s = re.sub(r"([.!?])\*\*([^*\n]{1,80})\*\*\n\n", r"\1\n\n**\2**\n\n", s)
s = re.sub(r"\x00F(\d+)\x00", lambda m: fence_stash[int(m.group(1))], s)