diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py index f5cd95a..c608be1 100644 --- a/htmltree/htmltree.py +++ b/htmltree/htmltree.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- + +# __pragma__('kwargs') +# __pragma__('xglobs') +# __pragma__('tconv') +# __pragma__('opov') + """ Description: Provides a general html tree class, HtmlElement and wrapper functions for most standard non-obsolete HTML tags. @@ -22,7 +28,7 @@ Copyright 2017 Ellis & Grant, Inc. License: MIT License """ -# __pragma__('kwargs') + def KWElement(tag, *content, **attrs): """ @@ -1053,6 +1059,253 @@ def Textarea(*content, **attrs): """ return KWElement('textarea', *content, **attrs) +####################################################################### +## HTML parser +####################################################################### + +# Global variables +__tagsDict = None + +def __buildTagsDict(): + """ + Internal function for building a mapping of all available tags. + """ + global __tagsDict + __tagsDict = {} + + for cname in globals().keys(): + if cname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + continue + + cl = globals()[cname] + + try: + inst = cl() + + if isinstance(inst, HtmlElement): + __tagsDict[cname.lower()] = cl + + except: + pass + + # print(__tagsDict) + +def parseHtml(html): + """ + Parses the provided HTML code according to the objects defined in the htmltree-library. + """ + + def scanWhite(l): + """ + Scan and return whitespace. + """ + + ret = "" + while l and l[0] in " \t\r\n\v": + ret += l.pop(0) + + return ret + + def scanWord(l): + """ + Scan and return a word. + """ + + ret = "" + while l and l[0] not in " \t\r\n\v" + "<>=\"'": + ret += l.pop(0) + + return ret + + def buildElem(elem): + try: + elem = elem[0](*elem[3], **elem[2]) + except: + print(elem[2]) + elem = elem[0](**elem[2]) + + return elem + + global __tagsDict + + # Obtain tag descriptions + if __tagsDict is None: + __buildTagsDict() + + # Prepare stack and input + stack = [] + ret = [] + html = [ch for ch in html] + + # Parse + while html: + tag = None + text = "" + + # ugly... + singles = [] + while stack and stack[-1][1] in ["br", "input", "img", "meta"]: + singles.append(buildElem(stack.pop())) + + if stack: + stack[-1][3].extend(singles) + else: + ret.extend(singles) + + while html: + #print("html", html) + #print(stack) + + ch = html.pop(0) + + # Comment + if html and ch == "<" and "".join(html[:3]) == "!--": + html = html[3:] + while html and "".join(html[:3]) != "-->": + html.pop(0) + + html = html[3:] + + # Opening tag + elif html and ch == "<" and html[0] != "/": + # Append plain text (if not only whitespace) + if (text and ((len(text) == 1 and text in ["\t "]) + or not all([ch in " \t\r\n\v" for ch in text]))): + if stack: + stack[-1][3].append(text) + else: + ret.append(text) + + tag = scanWord(html) + if tag.lower() in __tagsDict: + break + + text += ch + tag + + # Closing tag + elif html and stack and stack[-1][1] and ch == "<" and html[0] == "/": + # Append plain text (if not only whitespace) + if (text and ((len(text) == 1 and text in ["\t "]) + or not all([ch in " \t\r\n\v" for ch in text]))): + if stack: + stack[-1][3].append(text) + else: + ret.append(text) + + junk = ch + junk += html.pop(0) + + tag = scanWord(html) + junk += tag + + # print("/", tag.lower(), stack[-1][1].lower()) + if stack[-1][1].lower() == tag.lower(): + junk += scanWhite(html) + if html and html[0] == ">": + html.pop(0) + + elem = buildElem(stack.pop()) + + if not stack: + ret.append(elem) + else: + stack[-1][3].append(elem) + + tag = None + break + + text += junk + tag = None + + else: + text += ch + + # Create tag + if tag and tag.lower() in __tagsDict: + #print(tag) + stack.append((__tagsDict[tag.lower()], tag.lower(), {}, [])) + + # print("tag", tag) + + while html: + scanWhite(html) + if not html: + break + + # End of tag > + if html[0] == ">": + html.pop(0) + break + + # Closing tag at end /> + elif html[0] == "/": + html.pop(0) + scanWhite(html) + + if html[0] == ">": + elem = buildElem(stack.pop()) + + if not stack: + ret.append(elem) + else: + stack[-1][3].append(elem) + + html.pop(0) + break + + att = scanWord(html) + att = att.lower() #fixme: This is split into two lines due to a Transcrypt bug + val = att + + if not att: + html.pop(0) + continue + + # Attribute + scanWhite(html) + if html[0] == "=": + html.pop(0) + scanWhite(html) + + if html[0] in "\"'": + ch = html.pop(0) + + val = "" + while html and html[0] != ch: + val += html.pop(0) + + html.pop(0) + + if att == "style": + if "style" not in stack[-1][2]: + stack[-1][2]["style"] = {} + + for dfn in val.split(";"): + if not ":" in dfn: + continue + + att, val = dfn.split(":", 1) + + # print(tag, "style", att.strip(), val.strip()) + stack[-1][2]["style"][att.strip()] = val.strip() + else: + stack[-1][2][att] = val + + continue + + # Unclosed tags? + while stack: + ret.append(buildElem(stack.pop())) + + if ret: + if len(ret) > 1: + return ret + + return ret[0] + + return None + + ####################################################################### ## Interactive Elememts (Experimental. Omitted for now.) ####################################################################### @@ -1061,7 +1314,6 @@ def Textarea(*content, **attrs): ## Web Components (Experimental. Omitted for now.) ####################################################################### -# __pragma__('nokwargs') ## The 'skip' pragma tells the Transcrypt Python to JS transpiler to ## ignore a section of code. It's needed here because the 'run as script' @@ -1069,8 +1321,17 @@ def Textarea(*content, **attrs): ## Putting the pragmas in comments means they'll be ignored and cause no ## problems in a real python interpreter. -# __pragma__ ('skip') + if __name__ == '__main__': - import doctest - doctest.testmod() -# __pragma__ ('noskip') + print(parseHtml(""" + +
+ + + +