Links in question view are now parsed and links are re-written where possible to point to the stackdump instance. They are also styled differently to highlight this.

Images are also replaced with a placeholder.
2025-12-07 08:23:25 +00:00 · 2012-02-12 21:16:07 +11:00
parent 5bfcfd2f1a
commit 1da980424c
40 changed files with 12745 additions and 4 deletions
--- a/python/packages/html5lib/treewalkers/soup.py
+++ b/python/packages/html5lib/treewalkers/soup.py
@@ -0,0 +1,60 @@
+import re
+import gettext
+_ = gettext.gettext
+
+from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
+from html5lib.constants import namespaces
+import _base
+
+class TreeWalker(_base.NonRecursiveTreeWalker):
+    doctype_regexp = re.compile(
+        r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
+    def getNodeDetails(self, node):
+        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
+            return (_base.DOCUMENT,)
+
+        elif isinstance(node, Declaration): # DocumentType
+            string = unicode(node.string)
+            #Slice needed to remove markup added during unicode conversion,
+            #but only in some versions of BeautifulSoup/Python
+            if string.startswith('<!') and string.endswith('>'):
+                string = string[2:-1]
+            m = self.doctype_regexp.match(string)
+            #This regexp approach seems wrong and fragile
+            #but beautiful soup stores the doctype as a single thing and we want the seperate bits
+            #It should work as long as the tree is created by html5lib itself but may be wrong if it's
+            #been modified at all
+            #We could just feed to it a html5lib tokenizer, I guess...
+            assert m is not None, "DOCTYPE did not match expected format"
+
+            name = m.group('name')
+            publicId = m.group('publicId')
+            if publicId is not None:
+                systemId = m.group('systemId1')
+            else:
+                systemId = m.group('systemId2')
+            return _base.DOCTYPE, name, publicId or "", systemId or ""
+
+        elif isinstance(node, Comment):
+            string = unicode(node.string)
+            if string.startswith('<!--') and string.endswith('-->'):
+                string = string[4:-3]
+            return _base.COMMENT, string
+
+        elif isinstance(node, unicode): # TextNode
+            return _base.TEXT, node
+
+        elif isinstance(node, Tag): # Element
+            return (_base.ELEMENT, namespaces["html"], node.name,
+                    dict(node.attrs).items(), node.contents)
+        else:
+            return _base.UNKNOWN, node.__class__.__name__
+
+    def getFirstChild(self, node):
+        return node.contents[0]
+
+    def getNextSibling(self, node):
+        return node.nextSibling
+
+    def getParentNode(self, node):
+        return node.parent