From 61d29b898f1170597b2cf39b297858ebb902a3c4 Mon Sep 17 00:00:00 2001 From: Jan Max Meyer Date: Tue, 18 Sep 2018 10:33:02 +0200 Subject: [PATCH 1/2] Simple HTML parser --- htmltree/htmltree.py | 263 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 261 insertions(+), 2 deletions(-) diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py index f5cd95a..7da03b4 100644 --- a/htmltree/htmltree.py +++ b/htmltree/htmltree.py @@ -1053,6 +1053,251 @@ def Textarea(*content, **attrs): """ return KWElement('textarea', *content, **attrs) +####################################################################### +## HTML parser +####################################################################### + +# Global variables +__tagsDict = None + +def __buildTagsDict(): + """ + Internal function for building a mapping of all available tags. + """ + global __tagsDict + + __tagsDict = {} + mod = __import__(__name__) + + for clname in dir(mod): + cl = getattr(mod, clname) + if clname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" or not callable(cl) or cl is HtmlElement or cl is KWElement: + continue + + try: + inst = cl() + if issubclass(inst.__class__, HtmlElement): + __tagsDict[cl.__name__.lower()] = cl + except: + pass + + #print(__tagsDict) + +def parseHtml(html): + """ + Parses the provided HTML code according to the objects defined in the htmltree-library. + """ + + def scanWhite(l): + """ + Scan and return whitespace. + """ + + ret = "" + while l and l[0] in " \t\r\n\v": + ret += l.pop(0) + + return ret + + def scanWord(l): + """ + Scan and return a word. 
+ """ + + ret = "" + while l and l[0] not in " \t\r\n\v" + "<>=\"'": + ret += l.pop(0) + + return ret + + def buildElem(elem): + try: + elem = elem[0](*elem[3], **elem[2]) + except: + print(elem[2]) + elem = elem[0](**elem[2]) + + return elem + + global __tagsDict + + # Obtain tag descriptions + if __tagsDict is None: + __buildTagsDict() + + # Prepare stack and input + stack = [] + ret = [] + html = [ch for ch in html] + + # Parse + while html: + tag = None + text = "" + + # ugly... + singles = [] + while stack and stack[-1][1] in ["br", "input", "img", "meta"]: + singles.append(buildElem(stack.pop())) + + if stack: + stack[-1][3].extend(singles) + else: + ret.extend(singles) + + while html: + #print("html", html) + #print(stack) + + ch = html.pop(0) + + # Comment + if html and ch == "<" and "".join(html[:3]) == "!--": + html = html[3:] + while html and "".join(html[:3]) != "-->": + html.pop(0) + + html = html[3:] + + # Opening tag + elif html and ch == "<" and html[0] != "/": + # Append plain text (if not only whitespace) + if (text and ((len(text) == 1 and text in ["\t "]) + or not all([ch in " \t\r\n\v" for ch in text]))): + if stack: + stack[-1][3].append(text) + else: + ret.append(text) + + tag = scanWord(html) + if tag.lower() in __tagsDict: + break + + text += ch + tag + + # Closing tag + elif html and stack and stack[-1][1] and ch == "<" and html[0] == "/": + # Append plain text (if not only whitespace) + if (text and ((len(text) == 1 and text in ["\t "]) + or not all([ch in " \t\r\n\v" for ch in text]))): + if stack: + stack[-1][3].append(text) + else: + ret.append(text) + + junk = ch + junk += html.pop(0) + + tag = scanWord(html) + junk += tag + + # print("/", tag.lower(), stack[-1][1].lower()) + if stack[-1][1].lower() == tag.lower(): + junk += scanWhite(html) + if html and html[0] == ">": + html.pop(0) + + elem = buildElem(stack.pop()) + + if not stack: + ret.append(elem) + else: + stack[-1][3].append(elem) + + tag = None + break + + text += junk + tag 
= None + + else: + text += ch + + # Create tag + if tag and tag.lower() in __tagsDict: + #print(tag) + stack.append((__tagsDict[tag.lower()], tag.lower(), {}, [])) + + # print("tag", tag) + + while html: + scanWhite(html) + if not html: + break + + # End of tag > + if html[0] == ">": + html.pop(0) + break + + # Closing tag at end /> + elif html[0] == "/": + html.pop(0) + scanWhite(html) + + if html[0] == ">": + elem = buildElem(stack.pop()) + + if not stack: + ret.append(elem) + else: + stack[-1][3].append(elem) + + html.pop(0) + break + + att = scanWord(html).lower() + val = att + + if not att: + html.pop(0) + continue + + # Attribute + scanWhite(html) + if html[0] == "=": + html.pop(0) + scanWhite(html) + + if html[0] in "\"'": + ch = html.pop(0) + + val = "" + while html and html[0] != ch: + val += html.pop(0) + + html.pop(0) + + if att == "style": + if "style" not in stack[-1][2]: + stack[-1][2]["style"] = {} + + for dfn in val.split(";"): + if not ":" in dfn: + continue + + att, val = dfn.split(":", 1) + + # print(tag, "style", att.strip(), val.strip()) + stack[-1][2]["style"][att.strip()] = val.strip() + else: + stack[-1][2][att] = val + + continue + + # Unclosed tags? + while stack: + ret.append(buildElem(stack.pop())) + + if ret: + if len(ret) > 1: + return ret + + return ret[0] + + return None + + ####################################################################### ## Interactive Elememts (Experimental. Omitted for now.) ####################################################################### @@ -1071,6 +1316,20 @@ def Textarea(*content, **attrs): # __pragma__ ('skip') if __name__ == '__main__': - import doctest - doctest.testmod() + #import doctest + #doctest.testmod() + #print(parseHtml(open("test.html", "r").read()).render()) + print(parseHtml(""" + + + + + +

+ Hello, htmltree! +

+ + + """).render()) + # __pragma__ ('noskip') From 96c16edf9f18e3610387f1dfbee6bbf8d10ab1a4 Mon Sep 17 00:00:00 2001 From: Jan Max Meyer Date: Thu, 20 Sep 2018 13:49:30 +0200 Subject: [PATCH 2/2] Improved HTML parser to run both with CPython and with Transcrypt --- htmltree/htmltree.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py index 7da03b4..c608be1 100644 --- a/htmltree/htmltree.py +++ b/htmltree/htmltree.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- + +# __pragma__('kwargs') +# __pragma__('xglobs') +# __pragma__('tconv') +# __pragma__('opov') + """ Description: Provides a general html tree class, HtmlElement and wrapper functions for most standard non-obsolete HTML tags. @@ -22,7 +28,7 @@ Copyright 2017 Ellis & Grant, Inc. License: MIT License """ -# __pragma__('kwargs') + def KWElement(tag, *content, **attrs): """ @@ -1065,23 +1071,24 @@ def __buildTagsDict(): Internal function for building a mapping of all available tags. """ global __tagsDict - __tagsDict = {} - mod = __import__(__name__) - for clname in dir(mod): - cl = getattr(mod, clname) - if clname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" or not callable(cl) or cl is HtmlElement or cl is KWElement: + for cname in globals().keys(): + if cname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": continue + cl = globals()[cname] + try: inst = cl() - if issubclass(inst.__class__, HtmlElement): - __tagsDict[cl.__name__.lower()] = cl + + if isinstance(inst, HtmlElement): + __tagsDict[cname.lower()] = cl + except: pass - #print(__tagsDict) + # print(__tagsDict) def parseHtml(html): """ @@ -1246,7 +1253,8 @@ def buildElem(elem): html.pop(0) break - att = scanWord(html).lower() + att = scanWord(html) + att = att.lower() #fixme: This is split into two lines due to a Transcrypt bug val = att if not att: @@ -1306,7 +1314,6 @@ def buildElem(elem): ## Web Components (Experimental. Omitted for now.) 
####################################################################### -# __pragma__('nokwargs') ## The 'skip' pragma tells the Transcrypt Python to JS transpiler to ## ignore a section of code. It's needed here because the 'run as script' @@ -1314,11 +1321,8 @@ def buildElem(elem): ## Putting the pragmas in comments means they'll be ignored and cause no ## problems in a real python interpreter. -# __pragma__ ('skip') + if __name__ == '__main__': - #import doctest - #doctest.testmod() - #print(parseHtml(open("test.html", "r").read()).render()) print(parseHtml(""" @@ -1331,5 +1335,3 @@ def buildElem(elem): """).render()) - -# __pragma__ ('noskip')