1
0
mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 23:17:37 +00:00

Links in question view are now parsed and links are re-written where possible to point to the stackdump instance. They are also styled differently to highlight this.

Images are also replaced with a placeholder.
This commit is contained in:
Samuel Lai 2012-02-12 21:16:07 +11:00
parent 5bfcfd2f1a
commit 1da980424c
40 changed files with 12745 additions and 4 deletions

View File

@ -220,6 +220,10 @@ pre code {
margin-bottom: 7px;
}
.post-body .external-link {
color: #999999;
}
.post-metadata {
margin-top: 10px;
padding-top: 10px;

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.0 KiB

View File

@ -0,0 +1,17 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage:
import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
"""
__version__ = "0.95-dev"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
class Filter(object):
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

View File

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token

View File

@ -0,0 +1,62 @@
import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == u"head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == u"meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
for (namespace,name),value in token["data"].iteritems():
if namespace != None:
continue
elif name.lower() == u'charset':
token["data"][(namespace,name)] = self.encoding
meta_found = True
break
elif name == u'http-equiv' and value.lower() == u'content-type':
has_http_equiv_content_type = True
else:
if has_http_equiv_content_type and (None, u"content") in token["data"]:
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
meta_found = True
elif token["name"].lower() == u"head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": u"head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": u"meta",
"data": {(None, u"charset"): self.encoding}}
yield {"type": "EndTag", "name": u"head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == u"head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": u"meta",
"data": {(None, u"charset"): self.encoding}}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token

View File

@ -0,0 +1,88 @@
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class LintError(Exception): pass
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, unicode):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty attribute name"))
if not isinstance(value, unicode):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag":
name = token["name"]
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag"))
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, unicode):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
raise LintError(_(u"%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_(u"Unknown token type: %s") % type)
yield token

View File

@ -0,0 +1,202 @@
import _base
class Filter(_base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname in 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
type = next and next["type"] or None
if tagname in ('html', 'head', 'body'):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] == tagname
else:
return type == "EndTag" or type is None
elif tagname in ('dt', 'dd'):
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == "StartTag":
return next["name"] in ('dt', 'dd')
elif tagname == 'dd':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return next["name"] != 'colgroup'
else:
return True
elif tagname in ('thead', 'tbody'):
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] in ['tbody', 'tfoot']
elif tagname == 'tbody':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'tfoot':
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] == 'tbody'
else:
return type == "EndTag" or type is None
elif tagname in ('td', 'th'):
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('td', 'th')
else:
return type == "EndTag" or type is None
return False

View File

@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
def __iter__(self):
for token in _base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token: yield token

View File

@ -0,0 +1,41 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import re
import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
class Filter(_base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1
elif type == "EndTag" and preserve:
preserve -= 1
elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test on token["data"] above to not introduce spaces where there were not
token["data"] = u" "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
yield token
def collapse_spaces(text):
return SPACES_REGEX.sub(' ', text)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,177 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
#Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1]
j += 1
i += j
return rv
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1]+1, charList[i+1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(unichr(item[0])))
else:
rv.append(escapeRegexp(unichr(item[0])) + "-" +
escapeRegexp(unichr(item[1])))
return "[%s]"%"".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, "\\" + char)
if char in string:
print string
return string
#output from the above
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
#Need a datalosswarning here
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
return None
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
data = data.replace("--", "- -")
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
data = data.replace("\x0C", " ")
#Other non-xml characters
return data
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return unichr(int(charcode[1:], 16))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,258 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs' ]
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in tokenTypes.keys():
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
if token.has_key("data"):
attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
#remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else:
if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"]=token["data"][:-1] + "/>"
if token["type"] in tokenTypes.keys():
token["type"] = "Characters"
else:
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
def sanitize_css(self, style):
# disallow urls
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
#Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token

View File

@ -0,0 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)

View File

@ -0,0 +1,312 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements, entities, xmlEntities
from html5lib import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
from html5lib.constants import entities
encode_entity_map = {}
is_ucs4 = len(u"\U0010FFFF") == 1
for k, v in entities.items():
#skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
try:
v = ord(v)
except:
print v
raise
if not v in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False
quote_char = u'"'
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
inject_meta_charset = True
strip_whitespace = False
sanitize = False
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
Keyword options (default given first unless specified) include:
inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the
document.
quote_attr_values=True|False
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
in which case single quotes are used instead.
escape_lt_in_attrs=False|True
Whether to escape < in attribute values.
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
are unaffected by this setting.
strip_whitespace=False|True
Whether to remove semantically meaningless whitespace. (This
compresses all whitespace to a single space except within pre.)
minimize_boolean_attributes=True|False
Shortens boolean attributes to give just the attribute value,
for example <input disabled="disabled"> becomes <input disabled>.
use_trailing_solidus=False|True
Includes a close-tag slash at the end of the start tag of void
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
space_before_trailing_solidus=True|False
Places a space immediately before the closing slash in a tag
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
sanitize=False|True
Strip all unsafe or unknown constructs from output.
See `html5lib user documentation`_
omit_optional_tags=True|False
Omit start/end tags that are optional.
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
if kwargs.has_key('quote_char'):
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
assert(isinstance(string, unicode))
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, unicode))
if self.encoding:
return string.encode(self.encoding, "strict")
else:
return string
def serialize(self, treewalker, encoding=None):
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from html5lib.filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
from html5lib.filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = u"<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA"))
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict(u"<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
attributes = []
for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):
#TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(u' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) \
and k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict(u"=")
if self.quote_attr_values or not v:
quote_attr = True
else:
quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + u">\"'=", False)
v = v.replace(u"&", u"&amp;")
if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
if u"'" in v and u'"' not in v:
quote_char = u'"'
elif u'"' in v and u"'" not in v:
quote_char = u"'"
if quote_char == u"'":
v = v.replace(u"'", u"&#39;")
else:
v = v.replace(u'"', u"&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(u" /")
else:
yield self.encodeStrict(u"/")
yield self.encode(u">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
yield self.encodeStrict(u"</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError(_("Comment contains --"))
yield self.encodeStrict(u"<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = u"&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
if encoding:
return "".join(list(self.serialize(treewalker, encoding)))
else:
return u"".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
self.errors.append(data)
if self.strict:
raise SerializeError
def SerializeError(Exception):
"""Error in serialized tree"""
pass

View File

@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
quote_attr_values = True
minimize_boolean_attributes = False
use_trailing_solidus = True
escape_lt_in_attrs = True
omit_optional_tags = False
escape_rcdata = True

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,96 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to do
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
treeBuilderCache = {}
import sys
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree" and "beautifulsoup"
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implemenation).
"etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType == "dom":
import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup":
import soup
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
# Come up with a sane default
if implementation == None:
try:
import xml.etree.cElementTree as ET
except ImportError:
try:
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
implementation = ET
import etree
# NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
return treeBuilderCache.get(treeType)

View File

@ -0,0 +1,377 @@
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and
comments
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""
self.name = name
self.parent = None
self.value = None
self.attributes = {}
self.childNodes = []
self._flags = []
def __unicode__(self):
attributesStr = " ".join(["%s=\"%s\""%(name, value)
for name, value in
self.attributes.iteritems()])
if attributesStr:
return "<%s %s>"%(self.name,attributesStr)
else:
return "<%s>"%(self.name)
def __repr__(self):
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
"""
raise NotImplementedError
def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
"""
#XXX - should this method be made more general?
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
def cloneNode(self):
"""Return a shallow copy of the current node i.e. a node with the same
name and attributes but with no parent or child nodes
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
class ActiveFormattingElements(list):
def append(self, node):
equalCount = 0
if node != Marker:
for element in self[::-1]:
if element == Marker:
break
if self.nodesEqual(element, node):
equalCount += 1
if equalCount == 3:
self.remove(element)
break
list.append(self, node)
def nodesEqual(self, node1, node2):
if not node1.nameTuple == node2.nameTuple:
return False
if not node1.attributes == node2.attributes:
return False
return True
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
#Document class
documentClass = None
#The class to use for creating a node
elementClass = None
#The class to use for creating comments
commentClass = None
#The class to use for creating doctypes
doctypeClass = None
#Fragment class
fragmentClass = None
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
self.reset()
def reset(self):
self.openElements = []
self.activeFormattingElements = ActiveFormattingElements()
#XXX - rename these to headElement, formElement
self.headPointer = None
self.formPointer = None
self.insertFromTable = False
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
#If we pass a node in we match that. if we pass a string
#match any node with that name
exactNode = hasattr(target, "nameTuple")
listElementsMap = {
None:(scopingElements, False),
"button":(scopingElements | set([(namespaces["html"], "button")]), False),
"list":(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")]), False),
"table":(set([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select":(set([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
node == target and exactNode):
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
assert False # We should never reach this point
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
if not self.activeFormattingElements:
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
#This will be reset to 0 below
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
# Step 11
if element == self.activeFormattingElements[-1]:
break
def clearActiveFormattingElements(self):
entry = self.activeFormattingElements.pop()
while self.activeFormattingElements and entry != Marker:
entry = self.activeFormattingElements.pop()
def elementInActiveFormattingElements(self, name):
"""Check if an element exists between the end of the active
formatting elements and the last marker. If it does, return it, else
return false"""
for item in self.activeFormattingElements[::-1]:
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
if item == Marker:
break
elif item.name == name:
return item
return False
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
return self._insertFromTable
def _setInsertFromTable(self, value):
"""Switch the function used to insert an element from the
normal one to the misnested table one and back again"""
self._insertFromTable = value
if value:
self.insertElement = self.insertElementTable
else:
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token):
name = token["name"]
assert type(name) == unicode, "Element %s not unicode"%name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
else:
parent.insertBefore(element, insertBefore)
self.openElements.append(element)
return element
def insertText(self, data, parent=None):
"""Insert text data."""
if parent is None:
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
lastTable=None
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == "table":
lastTable = elm
break
if lastTable:
# XXX - we should really check that this parent is actually a
# node here
if lastTable.parent:
fosterParent = lastTable.parent
insertBefore = lastTable
else:
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
fosterParent = self.openElements[0]
return fosterParent, insertBefore
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"Return the final tree"
return self.document
def getFragment(self):
"Return the final fragment"
#assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
raise NotImplementedError

View File

@ -0,0 +1,291 @@
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
try:
from types import ModuleType
except:
from new import module as ModuleType
import re
import weakref
import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces
moduleCache = {}
def getDomModule(DomImplementation):
name = "_" + DomImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(object):
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(self.element.attributes.items())
def items(self):
return [(item[0], item[1]) for item in
self.element.attributes.items()]
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=data
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
attributes = []
for i in range(len(element.attributes)):
attr = element.attributes.item(i)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.nodeName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value

View File

@ -0,0 +1,344 @@
try:
from types import ModuleType
except:
from new import module as ModuleType
import re
import types
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation, fullTree=False):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name):
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or len(self._element))
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = list(self._element).index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.iteritems():
element.attributes[name] = value
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
self._element = ElementTree.Comment(data)
self.parent = None
self._childNodes = []
self._flags = []
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get(u"publicId", "")
def _setPublicId(self, value):
if value is not None:
self._element.set(u"publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get(u"systemId", "")
def _setSystemId(self, value):
if value is not None:
self._element.set(u"systemId", value)
systemId = property(_getSystemId, _setSystemId)
class Document(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_ROOT>")
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag == ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
assert type(element.tag) in types.StringTypes, "Expected unicode, got %s"%type(element.tag)
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element):
if type(element) == type(ElementTree.ElementTree):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element:
serializeElement(child)
elif type(element.tag) == type(ElementTree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._element
else:
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html"%self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return locals()

View File

@ -0,0 +1,336 @@
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
#Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent+2)
next_element = next_element.getnext()
elif isinstance(element, basestring):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = filter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
attr_string = filter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, namespaceHTMLElements, fullTree = False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
self._element = element
dict.__init__(self, value)
for key, value in self.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = filter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return filter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = filter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = filter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = builder.Comment
#self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
def insertRoot(self, token):
"""Create the document root"""
#Because of the way libxml2 works, it doesn't seem to be possible to
#alter information like the doctype after the tree has been parsed.
#Therefore we need to use the built-in parser to create our iniial
#tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
try:
root = etree.fromstring(docStr)
except etree.XMLSyntaxError:
print docStr
raise
#Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
#Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
root.tag = etree_tag
#Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
#Reset to the default insert comment function
self.insertComment = super(TreeBuilder, self).insertComment

View File

@ -0,0 +1,256 @@
import _base
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
type = -1
def __init__(self, name):
self.name = name
self.parent = None
self.value = None
self.childNodes = []
self._flags = []
def __iter__(self):
for node in self.childNodes:
yield node
for item in node:
yield item
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
tree += child.printTree(indent + 2)
return tree
def appendChild(self, node):
assert isinstance(node, Node)
if (isinstance(node, TextNode) and self.childNodes and
isinstance(self.childNodes[-1], TextNode)):
self.childNodes[-1].value += node.value
else:
self.childNodes.append(node)
node.parent = self
def insertText(self, data, insertBefore=None):
assert isinstance(data, unicode), "data %s is of type %s expected unicode"%(repr(data), type(data))
if insertBefore is None:
self.appendChild(TextNode(data))
else:
self.insertBefore(TextNode(data), insertBefore)
def insertBefore(self, node, refNode):
index = self.childNodes.index(refNode)
if (isinstance(node, TextNode) and index > 0 and
isinstance(self.childNodes[index - 1], TextNode)):
self.childNodes[index - 1].value += node.value
else:
self.childNodes.insert(index, node)
node.parent = self
def removeChild(self, node):
try:
self.childNodes.remove(node)
except:
# XXX
raise
node.parent = None
def cloneNode(self):
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node):
type = 1
def __init__(self):
Node.__init__(self, None)
def __str__(self):
return "#document"
def __unicode__(self):
return str(self)
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
result += child.toxml()
return result.encode(encoding)
def hilite(self, encoding="utf-8"):
result = "<pre>"
for child in self.childNodes:
result += child.hilite()
return result.encode(encoding) + "</pre>"
def printTree(self):
tree = unicode(self)
for child in self.childNodes:
tree += child.printTree(2)
return tree
def cloneNode(self):
return Document()
class DocumentFragment(Document):
type = 2
def __str__(self):
return "#document-fragment"
def __unicode__(self):
return str(self)
def cloneNode(self):
return DocumentFragment()
class DocumentType(Node):
type = 3
def __init__(self, name, publicId, systemId):
Node.__init__(self, name)
self.publicId = publicId
self.systemId = systemId
def __unicode__(self):
if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
def cloneNode(self):
return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node):
type = 4
def __init__(self, value):
Node.__init__(self, None)
self.value = value
def __unicode__(self):
return u"\"%s\"" % self.value
def toxml(self):
return escape(self.value)
hilite = toxml
def cloneNode(self):
return TextNode(self.value)
class Element(Node):
type = 5
def __init__(self, name, namespace=None):
Node.__init__(self, name)
self.namespace = namespace
self.attributes = {}
def __unicode__(self):
if self.namespace == None:
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self):
result = '<' + self.name
if self.attributes:
for name,value in self.attributes.iteritems():
result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
if self.childNodes:
result += '>'
for child in self.childNodes:
result += child.toxml()
result += u'</%s>' % self.name
else:
result += u'/>'
return result
def hilite(self):
result = '&lt;<code class="markup element-name">%s</code>' % self.name
if self.attributes:
for name, value in self.attributes.iteritems():
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
if self.childNodes:
result += ">"
for child in self.childNodes:
result += child.hilite()
elif self.name in voidElements:
return result + ">"
return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in sorted(self.attributes.iteritems()):
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
def cloneNode(self):
newNode = Element(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
return newNode
class CommentNode(Node):
type = 6
def __init__(self, data):
Node.__init__(self, None)
self.data = data
def __unicode__(self):
return "<!-- %s -->" % self.data
def toxml(self):
return "<!--%s-->" % self.data
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
def cloneNode(self):
return CommentNode(self.data)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = CommentNode
fragmentClass = DocumentFragment
def testSerializer(self, node):
return node.printTree()

View File

@ -0,0 +1,236 @@
import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return self.attrs.items().__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return self.attrs.items()
def keys(self):
return self.attrs.keys()
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in self.attrs.keys()
def __eq__(self, other):
if len(self.keys()) != len(other.keys()):
return False
for item in self.keys():
if item not in other:
return False
if self[item] != other[item]:
return False
return True
class Element(_base.Node):
def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
newStr = NavigableString(self.element.contents[-1]+node.element)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else:
self.element.insert(len(self.element.contents), node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
self.element[name] = value
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(NavigableString(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
newStr = NavigableString(self.element.contents[index-1]+node.element)
oldNode = self.element.contents[index-1]
del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract()
node.parent = None
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
_base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration("DOCTYPE %s"%name))
def elementClass(self, name, namespace):
if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.soup
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
import re
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]":
rv.append("#document-fragment")
else:
rv.append("#document")
elif isinstance(element, Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
elif isinstance(element, unicode):
rv.append("|%s\"%s\"" %(' '*indent, element))
else:
rv.append("|%s<%s>"%(' '*indent, element.name))
if element.attrs:
for name, value in sorted(element.attrs):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
if hasattr(element, "contents"):
for child in element.contents:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)

View File

@ -0,0 +1,52 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to do
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree" and "beautifulsoup"
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream
"etree" - A generic walker for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"lxml" - Optimized walker for lxml.etree
"beautifulsoup" - Beautiful soup (if installed)
"genshi" - a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module
implementing the tree type e.g. xml.etree.ElementTree or
cElementTree."""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom", "simpletree"):
mod = __import__(treeType, globals())
treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi":
import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
elif treeType == "beautifulsoup":
import soup
treeWalkerCache[treeType] = soup.TreeWalker
elif treeType == "lxml":
import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
elif treeType == "etree":
import etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)

View File

@ -0,0 +1,176 @@
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
return {"type": "SerializeError", "data": msg}
def normalizeAttrs(self, attrs):
newattrs = {}
if attrs:
#TODO: treewalkers should always have attrs
for (namespace,name),value in attrs.iteritems():
namespace = unicode(namespace) if namespace else None
name = unicode(name)
value = unicode(value)
newattrs[(namespace,name)] = value
return newattrs
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, namespace, name, attrs):
return {"type": "StartTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, namespace, name):
return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": {}}
def text(self, data):
data = unicode(data)
middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
return {"type": "Comment", "data": unicode(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
"publicId": publicId,
"systemId": systemId,
"correct": correct}
def entity(self, name):
return {"type": "Entity", "name": unicode(name)}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
endTag = None
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)

View File

@ -0,0 +1,41 @@
from xml.dom import Node
import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return _base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
for attr in node.attributes.keys():
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI,attr.localName)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (_base.DOCUMENT,)
else:
return _base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parentNode

View File

@ -0,0 +1,141 @@
import gettext
_ = gettext.gettext
try:
from types import ModuleType
except:
from new import module as ModuleType
import copy
import re
import _base
from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
else:
node = elt
if not(hasattr(node, "tag")):
node = node.getroot()
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTree.Comment:
return _base.COMMENT, node.text
else:
assert type(node.tag) in (str, unicode), type(node.tag)
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = {}
for name, value in node.attrib.items():
match = tag_regexp.match(name)
if match:
attrs[(match.group(1),match.group(2))] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals()

View File

@ -0,0 +1,70 @@
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
depth = 0
ignore_until = None
previous = None
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
if ignore_until <= depth:
ignore_until = None
if ignore_until is None:
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = depth
if previous[0] == END:
depth -= 1
previous = event
if previous is not None:
if ignore_until is None or ignore_until <= depth:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
def tokens(self, event, next):
kind, data, pos = event
if kind == START:
tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, list(attrib))
elif kind == END:
name = data.localname
namespace = data.namespace
if name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
elif kind == TEXT:
for token in self.text(data):
yield token
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI):
pass
else:
yield self.unknown(kind)

View File

@ -0,0 +1,186 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name,
et.docinfo.public_id,
et.docinfo.system_url))
root = et.getroot()
node = root
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = self.obj.text
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail
else:
self.tail = None
self.isstring = isinstance(obj, basestring)
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __nonzero__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return unicode(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key)
elif isinstance(node, Root):
return (_base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
elif node.tag == etree.Entity:
return _base.ENTITY, node.text[1:-1] # strip &;
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = {}
for name, value in node.attrib.items():
match = tag_regexp.match(name)
if match:
attrs[(match.group(1),match.group(2))] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return node.tail and (node, "tail") or node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
return node
# else: fallback to "normal" processing
return node.getparent()

View File

@ -0,0 +1,60 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
ignore_until = None
previous = None
for event in self.tree:
if previous is not None and \
(ignore_until is None or previous[1] is ignore_until):
if previous[1] is ignore_until:
ignore_until = None
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = previous[1]
previous = event
if ignore_until is None or previous[1] is ignore_until:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
def tokens(self, event, next):
type, node = event
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
attrs = {}
for attr in node.attributes.keys():
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI,attr.localName)] = attr.value
if name in voidElements:
for token in self.emptyTag(namespace,
name,
attrs,
not next or next[1] is not node):
yield token
else:
yield self.startTag(namespace, name, attrs)
elif type == END_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements:
yield self.endTag(namespace, name)
elif type == COMMENT:
yield self.comment(node.nodeValue)
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
for token in self.text(node.nodeValue):
yield token
else:
yield self.unknown(type)

View File

@ -0,0 +1,78 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given that simpletree has no performant way of getting a node's
next sibling, this implementation returns "nodes" as tuples with the
following content:
1. The parent Node (Element, Document or DocumentFragment)
2. The child index of the current node in its parent's children list
3. A list used as a stack of all ancestors. It is a pair tuple whose
first item is a parent Node and second item is a child index.
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Node
parent, idx, parents = node
node = parent.childNodes[idx]
# testing node.type allows us not to import treebuilders.simpletree
if node.type in (1, 2): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif node.type == 3: # DocumentType
return _base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.type == 4: # TextNode
return _base.TEXT, node.value
elif node.type == 5: # Element
attrs = {}
for name, value in node.attributes.items():
if isinstance(name, tuple):
attrs[(name[2],name[1])] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, node.namespace, node.name,
attrs, node.hasContent())
elif node.type == 6: # CommentNode
return _base.COMMENT, node.data
else:
return _node.UNKNOWN, node.type
def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Node
parent, idx, parents = node
parents.append((parent, idx))
node = parent.childNodes[idx]
else:
parents = []
assert node.hasContent(), "Node has no children"
return (node, 0, parents)
def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
parent, idx, parents = node
idx += 1
if len(parent.childNodes) > idx:
return (parent, idx, parents)
else:
return None
def getParentNode(self, node):
assert isinstance(node, tuple)
parent, idx, parents = node
if parents:
parent, idx = parents.pop()
return parent, idx, parents
else:
# HACK: We could return ``parent`` but None will stop the algorithm the same way
return None

View File

@ -0,0 +1,60 @@
import re
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType
string = unicode(node.string)
#Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the seperate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment):
string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode
return _base.TEXT, node
elif isinstance(node, Tag): # Element
return (_base.ELEMENT, namespaces["html"], node.name,
dict(node.attrs).items(), node.contents)
else:
return _base.UNKNOWN, node.__class__.__name__
def getFirstChild(self, node):
return node.contents[0]
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parent

View File

@ -0,0 +1,175 @@
try:
frozenset
except NameError:
#Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
class MethodDispatcher(dict):
"""Dict with 2 special properties:
On initiation, keys that are lists, sets or tuples are converted to
multiple keys so accessing any one of the items in the original
list-like object returns the matching value
md = MethodDispatcher({("foo", "bar"):"baz"})
md["foo"] == "baz"
A default value which can be set through the default attribute.
"""
def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name,value in items:
if type(name) in (list, tuple, frozenset, set):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
self.default = None
def __getitem__(self, key):
return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
def __init__(self, iterable=(), maxsize=-1):
if not hasattr(self, 'data'):
self.left = self.right = 0
self.data = {}
self.maxsize = maxsize
self.extend(iterable)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self):
self.data.clear()
self.left = self.right = 0
def extend(self, iterable):
for elem in iterable:
self.append(elem)
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1):
if self:
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def __getitem__(self, i):
if i < 0:
i += len(self)
try:
return self.data[i + self.left]
except KeyError:
raise IndexError
def __setitem__(self, i, value):
if i < 0:
i += len(self)
try:
self.data[i + self.left] = value
except KeyError:
raise IndexError
def __delitem__(self, i):
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
#Some utility functions to dal with weirdness around UCS2 vs UCS4
#python builds
def encodingType():
if len() == 2:
return "UCS2"
else:
return "UCS4"
def isSurrogatePair(data):
return (len(data) == 2 and
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
def surrogatePairToCodepoint(data):
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
(ord(data[1]) - 0xDC00))
return char_val

View File

@ -5,6 +5,8 @@ import functools
import re
import math
import random
import urllib2
from xml.etree import ElementTree
try:
# For Python < 2.6 or people using a newer version of simplejson
@ -19,13 +21,14 @@ from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObjectNotFound
from sqlobject.dberrors import OperationalError
from pysolr import Solr
import iso8601
import html5lib
from stackdump.models import Site, Badge, Comment, User
# STATIC VARIABLES
BOTTLE_ROOT = os.path.abspath(os.path.dirname(sys.argv[0]))
MEDIA_ROOT = os.path.abspath(BOTTLE_ROOT + '/../../media')
SE_QUESTION_ID_RE = re.compile(r'/questions/(?P<id>\d+)/')
# THREAD LOCAL VARIABLES
thread_locals = threading.local()
@ -307,6 +310,7 @@ def view_question(site_key, question_id):
retrieve_sites(results)
result = results.docs[0]
rewrite_result(result)
sort_answers(result)
context['result'] = result
@ -627,6 +631,81 @@ def sort_answers(result):
answers.sort(comparison_function, reverse=True)
def _rewrite_html(html, app_url_root, sites_by_urls):
# wrap the given HTML fragment in an element so it looks like a document.
html = '<html>%s</html>' % html
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder('etree'))
html = parser.parse(html)
# rewrite img URLs
for t in html.iter('{http://www.w3.org/1999/xhtml}img'):
if t.get('src', None):
t.set('title', 'Original URL: %s' % t.get('src'))
t.set('src', '%smedia/images/img_placeholder.png' % app_url_root)
# rewrite link URLs
for t in html.iter('{http://www.w3.org/1999/xhtml}a'):
internal_link = False
url = t.get('href', None)
if url:
host = urllib2.Request(url).get_host()
site = sites_by_urls.get(host, None)
if site:
# rewrite this URL for stackdump
question_id = SE_QUESTION_ID_RE.search(url)
if question_id:
question_id = question_id.groupdict()['id']
url = '%s%s/%s' % (app_url_root, site.key, question_id)
t.set('href', url)
t.set('class', t.get('class', '') + ' internal-link')
internal_link = True
if not internal_link:
t.set('class', t.get('class', '') + ' external-link')
# get a string back
# this is used instead of ElementTree.tostring because that returns HTML
# with namespaces to conform to XML.
walker = html5lib.treewalkers.getTreeWalker('etree', implementation=ElementTree)
stream = walker(html)
serializer = html5lib.serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
quote_attr_values=True,
minimize_boolean_attributes=False)
output_generator = serializer.serialize(stream)
return ''.join(output_generator)
def rewrite_result(result):
'''\
Rewrites the HTML in this result (question, answers and comments) so
links to other StackExchange sites that exist in Stackdump are rewritten,
links elsewhere are decorated with a CSS class, and all images are replaced
with a placeholder.
The JSON must have been decoded first.
'''
app_url_root = settings.get('APP_URL_ROOT', '/')
# get a list of all the site base URLs
sites = list(Site.select())
sites_by_urls = dict([ (s.base_url, s) for s in sites ])
# rewrite question
question = result.get('question')
if question:
question['body'] = _rewrite_html(question.get('body'), app_url_root, sites_by_urls)
# TODO comments
# rewrite answers
answers = result.get('answers')
if answers:
for a in answers:
a['body'] = _rewrite_html(a.get('body'), app_url_root, sites_by_urls)
# TODO: comments
# END VIEW HELPERS
# INITIALISATION

View File

@ -10,6 +10,7 @@ import time
import xml.sax
from datetime import datetime
import re
import urllib2
from optparse import OptionParser
from xml.etree import ElementTree
@ -529,6 +530,7 @@ parser.add_option('-n', '--site-name', help='Name of the site.')
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
(cmd_options, cmd_args) = parser.parse_args()
@ -591,7 +593,8 @@ if not (site_name and dump_date):
# look for the site in the sites RSS file
site_desc = cmd_options.site_desc
site_key = cmd_options.site_key
if not (site_desc and site_key):
site_base_url = cmd_options.base_url
if not (site_desc and site_key and site_base_url):
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
if os.path.exists(sites_file_path):
with open(sites_file_path) as f:
@ -612,9 +615,15 @@ if not (site_desc and site_key):
site_key = site_key[:-len('.stackexchange')]
site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
site_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text.strip()
print 'Name: %s\nKey: %s\nDesc: %s\nDump Date: %s\n' % (site_name, site_key, site_desc, dump_date)
# scrub the URL scheme off the base_url
if site_base_url:
site_base_url = urllib2.Request(site_base_url).get_host()
print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (site_name, site_key, site_desc, dump_date, site_base_url)
# the base URL is optional.
if not (site_name and site_key and site_desc and dump_date):
print 'Could not get all the details for the site.'
print 'Use command-line parameters to specify the missing details (listed as None).'
@ -660,7 +669,8 @@ sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# create a new Site
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
import_date=datetime.now(), base_url=site_base_url)
# BADGES
# Processing of badges has been disabled because they don't offer any useful

View File

@ -11,6 +11,7 @@ class Site(SQLObject):
key = UnicodeCol()
dump_date = UnicodeCol()
import_date = DateTimeCol()
base_url = UnicodeCol()
siteKey_index = DatabaseIndex(key, unique=True)