diff --git a/application/tests/cheatsheet_extractor_test.py b/application/tests/cheatsheet_extractor_test.py new file mode 100644 index 000000000..ea159257f --- /dev/null +++ b/application/tests/cheatsheet_extractor_test.py @@ -0,0 +1,169 @@ +import unittest +from application.utils.external_project_parsers.parsers.cheatsheet_extractor import ( + extract_cheatsheet_record, +) +from application.defs.cheatsheet_defs import SUMMARY_MAX_LENGTH + +SOURCE_PATH = "cheatsheets/Secrets_Management_Cheat_Sheet.md" +EXPECTED_SOURCE_ID = "Secrets_Management_Cheat_Sheet" +EXPECTED_HYPERLINK = ( + "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html" +) + +NORMAL_MD = """\ +# Secrets Management Cheat Sheet + +## Introduction +Storage guidance. + +## Architectural Patterns +Use vaults and environment isolation. +""" + +MISSING_H1_MD = """\ +## Introduction +No H1 present. + +## Details +More content. +""" + +EMPTY_MD = "" + +# No ## Introduction heading exists, so summary extraction +# falls back to the first available body content. +BODY_UNDER_H1_MD = """\ +# Single Heading Cheat Sheet + +Body text directly under H1, no subheadings at all. +""" + +# Leading whitespace before H1 and malformed ## headings +# should still be normalized and extracted correctly. +MALFORMED_MD = """\ + # Malformed Title + +##malformed + +## Introduction +Some intro text. + +## Valid Heading +""" + + +class TestNormal(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH) + + # source-derived fields should remain deterministic and + # independent of markdown content across all extraction paths. 
+ def test_source(self): + self.assertEqual(self.record.source, "owasp_cheatsheets") + + def test_source_id(self): + self.assertEqual(self.record.source_id, EXPECTED_SOURCE_ID) + + def test_hyperlink(self): + self.assertEqual(self.record.hyperlink, EXPECTED_HYPERLINK) + + def test_raw_markdown_path(self): + self.assertEqual(self.record.raw_markdown_path, SOURCE_PATH) + + def test_title(self): + self.assertEqual(self.record.title, "Secrets Management Cheat Sheet") + + def test_summary(self): + self.assertEqual(self.record.summary, "Storage guidance.") + + def test_summary_bounded(self): + # Summary truncation is enforced centrally via + # CheatsheetRecord.__post_init__. + self.assertLessEqual(len(self.record.summary), SUMMARY_MAX_LENGTH) + + def test_headings(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Architectural Patterns", self.record.headings) + + def test_fallback_not_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "false") + + +class TestMissingH1(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(MISSING_H1_MD, SOURCE_PATH) + + def test_title_is_fallback(self): + self.assertEqual(self.record.title, "No title found.") + + def test_summary_from_introduction(self): + self.assertIn("no h1", self.record.summary.lower()) + + def test_headings_extracted(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Details", self.record.headings) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestEmptyMarkdown(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(EMPTY_MD, SOURCE_PATH) + + def test_title_is_fallback(self): + self.assertEqual(self.record.title, "No title found.") + + def test_summary_no_summary_found(self): + # Empty markdown should trigger terminal summary fallback. 
+ self.assertEqual(self.record.summary, "No summary found.") + + def test_headings_empty(self): + self.assertEqual(self.record.headings, []) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestBodyUnderH1(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(BODY_UNDER_H1_MD, SOURCE_PATH) + + def test_title(self): + self.assertEqual(self.record.title, "Single Heading Cheat Sheet") + + def test_summary_from_fallback_via_h1(self): + # Summary fallback should extract body content beneath the H1 section. + self.assertIn("body text", self.record.summary.lower()) + + def test_headings_empty(self): + # No valid ## headings should produce an empty headings list. + self.assertEqual(self.record.headings, []) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestMalformedHeadings(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(MALFORMED_MD, SOURCE_PATH) + + def test_malformed_h1_extracted(self): + self.assertEqual(self.record.title, "Malformed Title") + + def test_malformed_h2_in_headings(self): + self.assertIn("malformed", self.record.headings) + + def test_valid_headings_also_extracted(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Valid Heading", self.record.headings) + + def test_summary_from_introduction(self): + self.assertIn("intro", self.record.summary.lower()) + + def test_fallback_not_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "false") + + +if __name__ == "__main__": + unittest.main() diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 384afe932..f6e555207 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ 
b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,16 +1,27 @@ +import logging import os import re from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" -FALLBACK_USED = "false" CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" -_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) -_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE) -_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE) +_TITLE_RE = re.compile( + r"^\s*#(?!#)\s*(?P<title>.+?)$", + re.MULTILINE, +) + +_HEADING_RE = re.compile( + r"^\s*##(?!#)\s*(?P<heading>.+?)$", + re.MULTILINE, +) + +_ANY_HEADING_RE = re.compile( + r"^\s*#{1,6}(?!#)\s*.+?$", + re.MULTILINE, +) def _derive_source_id(source_path: str) -> str: @@ -41,28 +52,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str: def _extract_summary(markdown: str) -> str: - """Extract a summary section from cheatsheet markdown.""" - - all_heading_matches = list(_ANY_HEADING_RE.finditer(markdown)) + """Extract summary from Introduction section in cheatsheet markdown.""" - for match in all_heading_matches: - heading_text = match.group().lstrip("#").strip() - - if heading_text.lower() == "introduction": + for match in _ANY_HEADING_RE.finditer(markdown): + if match.group().strip().lstrip("#").strip().lower() == "introduction": body = _extract_body_after_heading(markdown, match) - if body: return body - break + raise ValueError( + "_extract_summary: no suitable summary section could be extracted from markdown." 
+ ) - for match in all_heading_matches: - body = _extract_body_after_heading(markdown, match) +def _extract_title(markdown: str) -> str: + """Extract H1 title from cheatsheet markdown.""" + + match = _TITLE_RE.search(markdown) + if not match: + raise ValueError("_extract_title: no title found in markdown.") + + return match.group("title").strip() + + +def _fallback_title() -> str: + """Return fallback title for malformed markdown.""" + + return "No title found." + + +def _fallback_summary(markdown: str) -> str: + """Return first non-empty paragraph after any heading, or 'No summary found.'""" + + for match in _ANY_HEADING_RE.finditer(markdown): + body = _extract_body_after_heading(markdown, match) if body: return body - raise ValueError("_extract_summary: no summary could be extracted from markdown.") + return "No summary found." def extract_cheatsheet_record( @@ -71,12 +98,24 @@ def extract_cheatsheet_record( ) -> CheatsheetRecord: """Extract a structured CheatsheetRecord from markdown content.""" - title_match = _TITLE_RE.search(markdown) - title = title_match.group("title").strip() + fallback_used = "false" + + try: + title = _extract_title(markdown) + except ValueError as e: + logging.warning(str(e)) + title = _fallback_title() + fallback_used = "true" + # Headings can be empty. 
headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)] - summary = _extract_summary(markdown) + try: + summary = _extract_summary(markdown) + except ValueError as e: + logging.warning(str(e)) + summary = _fallback_summary(markdown) + fallback_used = "true" source_id = _derive_source_id(source_path) hyperlink = _derive_hyperlink(source_path) @@ -91,6 +130,6 @@ def extract_cheatsheet_record( category_hints=[], metadata={ "parser_version": PARSER_VERSION, - "fallback_used": FALLBACK_USED, + "fallback_used": fallback_used, }, ) diff --git a/docs/rfc-structured-extraction.md b/docs/rfc-structured-extraction.md new file mode 100644 index 000000000..06296f5ec --- /dev/null +++ b/docs/rfc-structured-extraction.md @@ -0,0 +1,256 @@ +# RFC Workstream B — Structured Extraction + +This document explains the implementation and behavior of RFC Workstream B +(Structured Extraction) from Cheatsheet to CRE Mapping RFC. + +The goal of this module is to convert OWASP Cheat Sheet markdown into a +deterministic structured object that downstream RFC workstreams can consume +for categorization, retrieval, reranking, and mapping generation. + +The implementation is primarily located in: + +* `cheatsheet_defs.py` +* `cheatsheet_extractor.py` + +--- + +## Sources for more context + +* RFC: + `docs/rfc/cheatsheets-llm-autonomous-mapping-rfc.md` + +* Checkpoints B1 & B2 implementation PR: + `https://github.com/OWASP/OpenCRE/pull/912` + +* Checkpoints B3 & B4 implementation PR: + `https://github.com/OWASP/OpenCRE/pull/921` + +--- + +## What Workstream B implements + +**The implementation strictly follows the RFC extraction contract and prioritizes deterministic extraction behavior.** + +It defines a typed dataclass named `CheatsheetRecord`. 
+ +This object represents the structured extraction result returned from: + +```python +extract_cheatsheet_record(markdown, source_path) +``` + +The extractor parses OWASP Cheat Sheet markdown and returns normalized +structured information about a cheatsheet. + +`CheatsheetRecord` contains: + +* `source` +* `source_id` +* `title` +* `hyperlink` +* `summary` +* `headings` +* `raw_markdown_path` +* `category_hints` +* `metadata` + +--- + +## Fallback behavior + +The extractor contains fallback functions capable of handling incomplete or +malformed markdown containing: + +* missing titles, +* missing summary sources, +* malformed headings. + +These fallback paths ensure that extraction still returns a valid +`CheatsheetRecord` object instead of failing entirely. + +Fallback behavior is explicitly surfaced through: + +```json +"metadata": { + "fallback_used": "true" +} +``` + +This allows downstream workstreams to identify records that required fallback +logic during extraction and downstream normalization. + +--- + +## Fallback decision tree + +```text +extract_cheatsheet_record(markdown, source_path) + +│ +├── _extract_title(markdown) +│ ├── H1 title exists +│ │ → extract and normalize title +│ │ +│ └── H1 title missing +│ → _fallback_title() +│ → "No title found." +│ → metadata["fallback_used"] = "true" +│ +└── _extract_summary(markdown) + ├── "Introduction" heading exists with body content + │ → body beneath "Introduction" extracted as summary + │ → summary normalized and truncated up to a specific length. + │ + └── Introduction section missing or invalid + → _fallback_summary(markdown) + │ + ├── first heading with body content exists + │ → its body returned as summary + │ + └── no usable heading/body content exists + → "No summary found." + → metadata["fallback_used"] = "true" +``` + +--- + +## Extraction examples + +The following examples demonstrate deterministic extractor behavior across +different markdown shapes. 
+ +Notes: + +* Currently, `category_hints` is intentionally returned as an initial empty + list during v1. + +* `raw_markdown_path`, `hyperlink`, and `source_id` are derived from + `source_path` (Module A) and are independent of markdown content. + +--- + +## 1. Normal cheat sheet + +### Example Input + +```markdown +# Secrets Management Cheat Sheet + +## Introduction +Storage guidance. + +## Architectural Patterns +Use vaults and environment isolation. +``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Secrets_Management_Cheat_Sheet", + "title": "Secrets Management Cheat Sheet", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html", + "summary": "Storage guidance.", + "headings": ["Introduction", "Architectural Patterns"], + "raw_markdown_path": "cheatsheets/Secrets_Management_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "false" + } +} +``` + +### Notes + +* No fallback logic was required. + +--- + +## 2. Missing H1 (fallback title) + +### Input + +```markdown +## Introduction +No H1 present. + +## Details +More content. +``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Example_Cheat_Sheet", + "title": "No title found.", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Example_Cheat_Sheet.html", + "summary": "No H1 present.", + "headings": ["Introduction", "Details"], + "raw_markdown_path": "cheatsheets/Example_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "true" + } +} +``` + +### Notes + +* No H1 title exists, so the title defaults to `"No title found."` + +--- + +## 3. Missing Introduction section (summary fallback) + +### Input + +```markdown +# Single Heading Cheat Sheet + +## Authentication + +### Storage +Secrets should be encrypted. 
+``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Single_Heading_Cheat_Sheet", + "title": "Single Heading Cheat Sheet", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Single_Heading_Cheat_Sheet.html", + "summary": "Secrets should be encrypted.", + "headings": ["Authentication"], + "raw_markdown_path": "cheatsheets/Single_Heading_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "true" + } +} +``` + +### Notes + +* No `Introduction` heading exists, so summary fallback logic is used. +* The fallback scans all headings and returns the first non-empty body it + finds — in this case the content beneath `### Storage`. +* Only `##`-level headings appear in `headings` — `### Storage` is excluded. + +--- + +## Additional behavior notes + +The extractor correctly handles markdown files with malformed titles/headings such as: + +* Titles with leading whitespace +* No space after the marker (e.g. `##Authentication`) + +---