From 61d29b898f1170597b2cf39b297858ebb902a3c4 Mon Sep 17 00:00:00 2001
From: Jan Max Meyer <jmm@phorward.de>
Date: Tue, 18 Sep 2018 10:33:02 +0200
Subject: [PATCH 1/2] Simple HTML parser

---
 htmltree/htmltree.py | 263 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 261 insertions(+), 2 deletions(-)

diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py
index f5cd95a..7da03b4 100644
--- a/htmltree/htmltree.py
+++ b/htmltree/htmltree.py
@@ -1053,6 +1053,251 @@ def Textarea(*content, **attrs):
     """
     return KWElement('textarea', *content, **attrs)
 
+#######################################################################
+## HTML parser
+#######################################################################
+
+# Global variables
+__tagsDict = None
+
+def __buildTagsDict():
+    """
+    Internal function for building a mapping of all available tags.
+    """
+    global __tagsDict
+
+    __tagsDict = {}
+    mod = __import__(__name__)
+
+    for clname in dir(mod):
+        cl = getattr(mod, clname)
+        if clname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" or not callable(cl) or cl is HtmlElement or cl is KWElement:
+            continue
+
+        try:
+            inst = cl()
+            if issubclass(inst.__class__, HtmlElement):
+                __tagsDict[cl.__name__.lower()] = cl
+        except:
+            pass
+
+    #print(__tagsDict)
+
+def parseHtml(html):
+    """
+	Parses the provided HTML code according to the objects defined in the htmltree-library.
+	"""
+
+    def scanWhite(l):
+        """
+		Scan and return whitespace.
+		"""
+
+        ret = ""
+        while l and l[0] in " \t\r\n\v":
+            ret += l.pop(0)
+
+        return ret
+
+    def scanWord(l):
+        """
+		Scan and return a word.
+		"""
+
+        ret = ""
+        while l and l[0] not in " \t\r\n\v" + "<>=\"'":
+            ret += l.pop(0)
+
+        return ret
+
+    def buildElem(elem):
+        try:
+            elem = elem[0](*elem[3], **elem[2])
+        except:
+            print(elem[2])
+            elem = elem[0](**elem[2])
+
+        return elem
+
+    global __tagsDict
+
+    # Obtain tag descriptions
+    if __tagsDict is None:
+        __buildTagsDict()
+
+    # Prepare stack and input
+    stack = []
+    ret = []
+    html = [ch for ch in html]
+
+    # Parse
+    while html:
+        tag = None
+        text = ""
+
+        # ugly...
+        singles = []
+        while stack and stack[-1][1] in ["br", "input", "img", "meta"]:
+            singles.append(buildElem(stack.pop()))
+
+        if stack:
+            stack[-1][3].extend(singles)
+        else:
+            ret.extend(singles)
+
+        while html:
+            #print("html", html)
+            #print(stack)
+
+            ch = html.pop(0)
+
+            # Comment
+            if html and ch == "<" and "".join(html[:3]) == "!--":
+                html = html[3:]
+                while html and "".join(html[:3]) != "-->":
+                    html.pop(0)
+
+                html = html[3:]
+
+            # Opening tag
+            elif html and ch == "<" and html[0] != "/":
+                # Append plain text (if not only whitespace)
+                if (text and ((len(text) == 1 and text in ["\t "])
+                              or not all([ch in " \t\r\n\v" for ch in text]))):
+                    if stack:
+                        stack[-1][3].append(text)
+                    else:
+                        ret.append(text)
+
+                tag = scanWord(html)
+                if tag.lower() in __tagsDict:
+                    break
+
+                text += ch + tag
+
+            # Closing tag
+            elif html and stack and stack[-1][1] and ch == "<" and html[0] == "/":
+                # Append plain text (if not only whitespace)
+                if (text and ((len(text) == 1 and text in ["\t "])
+                              or not all([ch in " \t\r\n\v" for ch in text]))):
+                    if stack:
+                        stack[-1][3].append(text)
+                    else:
+                        ret.append(text)
+
+                junk = ch
+                junk += html.pop(0)
+
+                tag = scanWord(html)
+                junk += tag
+
+                # print("/", tag.lower(), stack[-1][1].lower())
+                if stack[-1][1].lower() == tag.lower():
+                    junk += scanWhite(html)
+                    if html and html[0] == ">":
+                        html.pop(0)
+
+                        elem = buildElem(stack.pop())
+
+                        if not stack:
+                            ret.append(elem)
+                        else:
+                            stack[-1][3].append(elem)
+
+                        tag = None
+                        break
+
+                text += junk
+                tag = None
+
+            else:
+                text += ch
+
+        # Create tag
+        if tag and tag.lower() in __tagsDict:
+            #print(tag)
+            stack.append((__tagsDict[tag.lower()], tag.lower(), {}, []))
+
+            # print("tag", tag)
+
+            while html:
+                scanWhite(html)
+                if not html:
+                    break
+
+                # End of tag >
+                if html[0] == ">":
+                    html.pop(0)
+                    break
+
+                # Closing tag at end />
+                elif html[0] == "/":
+                    html.pop(0)
+                    scanWhite(html)
+
+                    if html[0] == ">":
+                        elem = buildElem(stack.pop())
+
+                        if not stack:
+                            ret.append(elem)
+                        else:
+                            stack[-1][3].append(elem)
+
+                        html.pop(0)
+                        break
+
+                att = scanWord(html).lower()
+                val = att
+
+                if not att:
+                    html.pop(0)
+                    continue
+
+                # Attribute
+                scanWhite(html)
+                if html[0] == "=":
+                    html.pop(0)
+                    scanWhite(html)
+
+                    if html[0] in "\"'":
+                        ch = html.pop(0)
+
+                        val = ""
+                        while html and html[0] != ch:
+                            val += html.pop(0)
+
+                        html.pop(0)
+
+                if att == "style":
+                    if "style" not in stack[-1][2]:
+                        stack[-1][2]["style"] = {}
+
+                    for dfn in val.split(";"):
+                        if not ":" in dfn:
+                            continue
+
+                        att, val = dfn.split(":", 1)
+
+                        # print(tag, "style", att.strip(), val.strip())
+                        stack[-1][2]["style"][att.strip()] = val.strip()
+                else:
+                    stack[-1][2][att] = val
+
+                continue
+
+    # Unclosed tags?
+    while stack:
+        ret.append(buildElem(stack.pop()))
+
+    if ret:
+        if len(ret) > 1:
+            return ret
+
+        return ret[0]
+
+    return None
+
+
 #######################################################################
 ## Interactive Elememts (Experimental. Omitted for now.)
 #######################################################################
@@ -1071,6 +1316,20 @@ def Textarea(*content, **attrs):
 
 # __pragma__ ('skip')
 if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
+    #import doctest
+    #doctest.testmod()
+    #print(parseHtml(open("test.html", "r").read()).render())
+    print(parseHtml("""
+    <html>
+      <head>
+        <meta content="Your Name Here" name="author">
+      </head>
+      <body style="background-color:black;">
+        <h1 class="banner" style="color:green;">
+          Hello, htmltree!
+        </h1>
+      </body>
+    </html>
+    """).render())
+
 # __pragma__ ('noskip')

From 96c16edf9f18e3610387f1dfbee6bbf8d10ab1a4 Mon Sep 17 00:00:00 2001
From: Jan Max Meyer <jmm@phorward.de>
Date: Thu, 20 Sep 2018 13:49:30 +0200
Subject: [PATCH 2/2] Improved HTML parser to run buth with CPython and with
 Transcrypt

---
 htmltree/htmltree.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/htmltree/htmltree.py b/htmltree/htmltree.py
index 7da03b4..c608be1 100644
--- a/htmltree/htmltree.py
+++ b/htmltree/htmltree.py
@@ -1,4 +1,10 @@
 # -*- coding: utf-8 -*-
+
+# __pragma__('kwargs')
+# __pragma__('xglobs')
+# __pragma__('tconv')
+# __pragma__('opov')
+
 """
 Description: Provides a general html tree class, HtmlElement and wrapper
 functions for most standard non-obsolete HTML tags.
@@ -22,7 +28,7 @@
 Copyright 2017 Ellis & Grant, Inc.
 License: MIT License
 """
-# __pragma__('kwargs')
+
 
 def KWElement(tag, *content, **attrs):
     """
@@ -1065,23 +1071,24 @@ def __buildTagsDict():
     Internal function for building a mapping of all available tags.
     """
     global __tagsDict
-
     __tagsDict = {}
-    mod = __import__(__name__)
 
-    for clname in dir(mod):
-        cl = getattr(mod, clname)
-        if clname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" or not callable(cl) or cl is HtmlElement or cl is KWElement:
+    for cname in globals().keys():
+        if cname[0] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
             continue
 
+        cl = globals()[cname]
+
         try:
             inst = cl()
-            if issubclass(inst.__class__, HtmlElement):
-                __tagsDict[cl.__name__.lower()] = cl
+
+            if isinstance(inst, HtmlElement):
+                __tagsDict[cname.lower()] = cl
+
         except:
             pass
 
-    #print(__tagsDict)
+    # print(__tagsDict)
 
 def parseHtml(html):
     """
@@ -1246,7 +1253,8 @@ def buildElem(elem):
                         html.pop(0)
                         break
 
-                att = scanWord(html).lower()
+                att = scanWord(html)
+                att = att.lower() #fixme: This is split into two lines due to a Transcrypt bug
                 val = att
 
                 if not att:
@@ -1306,7 +1314,6 @@ def buildElem(elem):
 ## Web Components (Experimental. Omitted for now.)
 #######################################################################
 
-# __pragma__('nokwargs')
 
 ## The 'skip' pragma tells the Transcrypt Python to JS transpiler to
 ## ignore a section of code. It's needed here because the 'run as script'
@@ -1314,11 +1321,8 @@ def buildElem(elem):
 ## Putting the pragmas in comments means they'll be ignored and cause no
 ## problems in a real python interpreter.
 
-# __pragma__ ('skip')
+
 if __name__ == '__main__':
-    #import doctest
-    #doctest.testmod()
-    #print(parseHtml(open("test.html", "r").read()).render())
     print(parseHtml("""
     <html>
       <head>
@@ -1331,5 +1335,3 @@ def buildElem(elem):
       </body>
     </html>
     """).render())
-
-# __pragma__ ('noskip')