stackdump/python/packages/markdown/preprocessors.py

"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated. 
"""

import re
import util
import odict


def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of preprocessors.

    The "html_block" entry is registered first and only when
    `md_instance.safeMode` is not 'escape'; the "reference" entry is
    always registered. Extra keyword arguments are accepted and ignored.

    """
    registry = odict.OrderedDict()
    escaping = md_instance.safeMode == 'escape'
    if not escaping:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry


class Preprocessor(util.Processor):
    """
    Base class for all preprocessors.

    Preprocessors are run after the text is broken into lines. A
    preprocessor exposes a single "run" method which receives the document
    as a list of lines, changes it as needed and hands back either that
    same list or a brand new one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Hook to be overridden by every subclass.

        `lines` is the document as a list of strings split by newlines; the
        override returns the (possibly modified) list of lines. This base
        implementation does nothing and returns None.

        """
        return None


class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    `run` splits the document on blank lines and walks the resulting blocks.
    A block that starts with "<" and whose tag is block level (according to
    `util.isBlockLevel`), or whose second character is one of "!", "?", "@"
    or "%", is handed to `self.markdown.htmlStash.store` and whatever
    `store` returns is put in the output in its place. An element that is
    not closed within its own block is accumulated over the following
    blocks until its closing tag is found (or the input ends).
    """

    # Templates tried in order by `_get_right_tag` to locate the closing
    # tag for a given left tag: "</tag>" first, then "tag>".
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode regex matching a single attribute in one of three forms
    # (quoted value, unquoted value, no value).
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR 
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    # Matches an opening tag at the start of a block: "<", the tag name, any
    # number of attributes, then an optional "/" and an optional ">".
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, an element carrying a "markdown" attribute only has its
    # opening and closing tags stashed; the text between them stays in the
    # output. Not changed anywhere in this class.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of `block`.

        Returns a 3-tuple `(tag, length, attrs)`: the tag name, the length
        of the text matched as the opening tag, and a dict mapping attribute
        names to their values ("" for attributes without a value).

        If `left_tag_re` does not match, the tag is the lower-cased text
        between the first character and the first ">", the length is
        `len(tag) + 2` and the attribute dict is empty.
        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    # Exactly one of attr / attr1 / attr2 is set, depending
                    # on which alternative of attrs_pattern matched.
                    if ma.group('attr'):
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = ma.group('value1')
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the `rtag` that balances an already opened `ltag`.

        Searches `block` from `start_index`. Every further `ltag` met before
        the next `rtag` is treated as a nested element and its own `rtag` is
        skipped by recursing. Returns the index just past the balancing
        `rtag`, or -1 when no such `rtag` exists.
        """
        while 1:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index) 
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding 
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for `left_tag` inside `block`.

        Each template in `right_tag_patterns` is tried in turn, searching
        from `left_index`. On the first hit whose end index is greater than
        2 the result is `(tag, end_index)`, where `tag` is the closing tag
        with leading "<" and trailing ">" characters stripped.

        If nothing is found the result is a guess: the lower-cased slice
        `block.rstrip()[-left_index:-1]` paired with `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)
    
    def _equal_tags(self, left_tag, right_tag):
        """Return True if `right_tag` is accepted as closing `left_tag`.

        A left tag starting with "?", "@" or "%" is always accepted; "--"
        (the comment marker used by `run`) matches "--"; otherwise
        `right_tag` must be "/" followed by `left_tag`.
        """
        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--"):
            return True
        # Same test as the "/" + left_tag comparison above, spelled
        # differently.
        elif left_tag == right_tag[1:] \
            and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        """Return True for the tags "hr" and "hr/"."""
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """Replace raw HTML blocks in `lines` with stash placeholders.

        `lines` is the document as a list of lines. The lines are joined,
        split into blocks on blank lines and processed one block at a time;
        the resulting blocks are joined with blank lines again and returned
        as a new list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        # From here on `text` is a queue of blocks (blank-line separated).
        text = text.split("\n\n")
        # Blocks collected so far for an element whose closing tag has not
        # been seen yet.
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False # true while `items` is being accumulated

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            # A second leading newline, if any, is dropped as well.
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs  = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag, 
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append
                    
                    # Text follows the closing tag inside this block: push
                    # the remainder back on the queue and keep only the
                    # element itself.
                    if data_index < len(block) \
                        and (util.isBlockLevel(left_tag)
                        or left_tag == '--'): 
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    # Neither block level nor a "<!", "<?", "<@", "<%"
                    # construct: leave the block in the output untouched.
                    if not (util.isBlockLevel(left_tag) \
                        or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    # "hr" blocks are stripped but not stashed.
                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                        # The element opens and closes within this block.
                        if self.markdown_in_raw and 'markdown' in attrs.keys():
                            # Stash the opening tag (minus its "markdown"
                            # attribute) and the closing tag separately;
                            # the closing tag is taken to be the last
                            # len(right_tag) + 2 characters of the block.
                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
                                           '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else: 
                        # if is block level tag and is not complete

                        # NOTE: `and` binds tighter than `or`, so this is
                        # evaluated as
                        #   isBlockLevel(left_tag)
                        #   or (left_tag == "--" and not ...endswith(">"))
                        if util.isBlockLevel(left_tag) or left_tag == "--" \
                            and not block.rstrip().endswith(">"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))

                        continue

                # Block does not start with "<" (or is a lone character).
                new_blocks.append(block)

            else:
                # Inside an open element: collect the block and look for
                # the closing tag from the start of the block.
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    
                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs.keys():
                        # Same tag-only stashing as for a single-block
                        # element, applied to the first and last
                        # collected blocks.
                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
                                       '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        # Input ended while an element was still open: flush what was
        # collected.
        if items:
            if self.markdown_in_raw and 'markdown' in attrs.keys():
                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
                               '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                # Unlike the in-loop case, the tail is only stashed when it
                # is not blank.
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")


class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use.

    A reference definition is a line matching `RE`: up to three leading
    spaces, an id in square brackets, a colon, a link and an optional title.
    The title may be wrapped in double quotes, single quotes or parentheses,
    and may sit alone on the line following the definition.
    """

    # Title in one of the three accepted wrappers; groups 2-4 hold the text.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # Group 1: id, group 2: link, groups 5-7: title text (one per wrapper).
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of `lines` and record them.

        Each definition is stored in `self.markdown.references` under its
        stripped, lower-cased id as a `(link, title)` tuple; `title` is
        None when no title was given. Surrounding "<" / ">" characters are
        removed from the link.

        `lines` is consumed (emptied) while it is processed. Returns a new
        list holding the lines that are not reference definitions.

        """
        new_text = []
        while lines:
            line = lines.pop(0)
            m = self.RE.match(line)
            if m:
                ref_id = m.group(1).strip().lower()
                link = m.group(2).lstrip('<').rstrip('>')
                t = m.group(5) or m.group(6) or m.group(7)
                # Only peek at the next line when there is one: a
                # definition on the very last line used to raise IndexError.
                if not t and lines:
                    # Check next line for title
                    tm = self.TITLE_RE.match(lines[0])
                    if tm:
                        lines.pop(0)
                        t = tm.group(2) or tm.group(3) or tm.group(4)
                self.markdown.references[ref_id] = (link, t)
            else:
                new_text.append(line)

        return new_text
Added PowerShell equivalents to launch and manage Stackdump on Windows. 2013-11-28 10:53:45 +00:00			`"""`
			`PRE-PROCESSORS`
			`=============================================================================`

			`Preprocessors work on source text before we start doing anything too`
			`complicated.`
			`"""`

			`import re`
			`import util`
			`import odict`


			`def build_preprocessors(md_instance, **kwargs):`
			`""" Build the default set of preprocessors used by Markdown. """`
			`preprocessors = odict.OrderedDict()`
			`if md_instance.safeMode != 'escape':`
			`preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)`
			`preprocessors["reference"] = ReferencePreprocessor(md_instance)`
			`return preprocessors`


			`class Preprocessor(util.Processor):`
			`"""`
			`Preprocessors are run after the text is broken into lines.`

			`Each preprocessor implements a "run" method that takes a pointer to a`
			`list of lines of the document, modifies it as necessary and returns`
			`either the same pointer or a pointer to a new list.`

			`Preprocessors must extend markdown.Preprocessor.`

			`"""`
			`def run(self, lines):`
			`"""`
			Each subclass of Preprocessor should override the `run` method, which
			`takes the document as a list of strings split by newlines and returns`
			`the (possibly modified) list of lines.`

			`"""`
			`pass`


			`class HtmlBlockPreprocessor(Preprocessor):`
			`"""Remove html blocks from the text and store them for later retrieval."""`

			`right_tag_patterns = ["</%s>", "%s>"]`
			`attrs_pattern = r"""`
			`\s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"`
			`\| # OR`
			`\s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value`
			`\| # OR`
			`\s+(?P<attr2>[^>"'/= ]+) # attr`
			`"""`
			`left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern`
			`attrs_re = re.compile(attrs_pattern, re.VERBOSE)`
			`left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)`
			`markdown_in_raw = False`

			`def _get_left_tag(self, block):`
			`m = self.left_tag_re.match(block)`
			`if m:`
			`tag = m.group('tag')`
			`raw_attrs = m.group('attrs')`
			`attrs = {}`
			`if raw_attrs:`
			`for ma in self.attrs_re.finditer(raw_attrs):`
			`if ma.group('attr'):`
			`if ma.group('value'):`
			`attrs[ma.group('attr').strip()] = ma.group('value')`
			`else:`
			`attrs[ma.group('attr').strip()] = ""`
			`elif ma.group('attr1'):`
			`if ma.group('value1'):`
			`attrs[ma.group('attr1').strip()] = ma.group('value1')`
			`else:`
			`attrs[ma.group('attr1').strip()] = ""`
			`elif ma.group('attr2'):`
			`attrs[ma.group('attr2').strip()] = ""`
			`return tag, len(m.group(0)), attrs`
			`else:`
			`tag = block[1:].split(">", 1)[0].lower()`
			`return tag, len(tag)+2, {}`

			`def _recursive_tagfind(self, ltag, rtag, start_index, block):`
			`while 1:`
			`i = block.find(rtag, start_index)`
			`if i == -1:`
			`return -1`
			`j = block.find(ltag, start_index)`
			`# if no ltag, or rtag found before another ltag, return index`
			`if (j > i or j == -1):`
			`return i + len(rtag)`
			`# another ltag found before rtag, use end of ltag as starting`
			`# point and search again`
			`j = block.find('>', j)`
			`start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)`
			`if start_index == -1:`
			`# HTML potentially malformed- ltag has no corresponding`
			`# rtag`
			`return -1`

			`def _get_right_tag(self, left_tag, left_index, block):`
			`for p in self.right_tag_patterns:`
			`tag = p % left_tag`
			`i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)`
			`if i > 2:`
			`return tag.lstrip("<").rstrip(">"), i`
			`return block.rstrip()[-left_index:-1].lower(), len(block)`

			`def _equal_tags(self, left_tag, right_tag):`
			`if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.`
			`return True`
			`if ("/" + left_tag) == right_tag:`
			`return True`
			`if (right_tag == "--" and left_tag == "--"):`
			`return True`
			`elif left_tag == right_tag[1:] \`
			`and right_tag[0] == "/":`
			`return True`
			`else:`
			`return False`

			`def _is_oneliner(self, tag):`
			`return (tag in ['hr', 'hr/'])`

			`def run(self, lines):`
			`text = "\n".join(lines)`
			`new_blocks = []`
			`text = text.split("\n\n")`
			`items = []`
			`left_tag = ''`
			`right_tag = ''`
			`in_tag = False # flag`

			`while text:`
			`block = text[0]`
			`if block.startswith("\n"):`
			`block = block[1:]`
			`text = text[1:]`

			`if block.startswith("\n"):`
			`block = block[1:]`

			`if not in_tag:`
			`if block.startswith("<") and len(block.strip()) > 1:`

			`if block[1] == "!":`
			`# is a comment block`
			`left_tag, left_index, attrs = "--", 2, {}`
			`else:`
			`left_tag, left_index, attrs = self._get_left_tag(block)`
			`right_tag, data_index = self._get_right_tag(left_tag,`
			`left_index,`
			`block)`
			`# keep checking conditions below and maybe just append`

			`if data_index < len(block) \`
			`and (util.isBlockLevel(left_tag)`
			`or left_tag == '--'):`
			`text.insert(0, block[data_index:])`
			`block = block[:data_index]`

			`if not (util.isBlockLevel(left_tag) \`
			`or block[1] in ["!", "?", "@", "%"]):`
			`new_blocks.append(block)`
			`continue`

			`if self._is_oneliner(left_tag):`
			`new_blocks.append(block.strip())`
			`continue`

			`if block.rstrip().endswith(">") \`
			`and self._equal_tags(left_tag, right_tag):`
			`if self.markdown_in_raw and 'markdown' in attrs.keys():`
			`start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',`
			`'', block[:left_index])`
			`end = block[-len(right_tag)-2:]`
			`block = block[left_index:-len(right_tag)-2]`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(start))`
			`new_blocks.append(block)`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(end))`
			`else:`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(block.strip()))`
			`continue`
			`else:`
			`# if is block level tag and is not complete`

			`if util.isBlockLevel(left_tag) or left_tag == "--" \`
			`and not block.rstrip().endswith(">"):`
			`items.append(block.strip())`
			`in_tag = True`
			`else:`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(block.strip()))`

			`continue`

			`new_blocks.append(block)`

			`else:`
			`items.append(block)`

			`right_tag, data_index = self._get_right_tag(left_tag, 0, block)`

			`if self._equal_tags(left_tag, right_tag):`
			`# if find closing tag`

			`if data_index < len(block):`
			`# we have more text after right_tag`
			`items[-1] = block[:data_index]`
			`text.insert(0, block[data_index:])`

			`in_tag = False`
			`if self.markdown_in_raw and 'markdown' in attrs.keys():`
			`start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',`
			`'', items[0][:left_index])`
			`items[0] = items[0][left_index:]`
			`end = items[-1][-len(right_tag)-2:]`
			`items[-1] = items[-1][:-len(right_tag)-2]`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(start))`
			`new_blocks.extend(items)`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(end))`
			`else:`
			`new_blocks.append(`
			`self.markdown.htmlStash.store('\n\n'.join(items)))`
			`items = []`

			`if items:`
			`if self.markdown_in_raw and 'markdown' in attrs.keys():`
			`start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',`
			`'', items[0][:left_index])`
			`items[0] = items[0][left_index:]`
			`end = items[-1][-len(right_tag)-2:]`
			`items[-1] = items[-1][:-len(right_tag)-2]`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(start))`
			`new_blocks.extend(items)`
			`if end.strip():`
			`new_blocks.append(`
			`self.markdown.htmlStash.store(end))`
			`else:`
			`new_blocks.append(`
			`self.markdown.htmlStash.store('\n\n'.join(items)))`
			`#new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))`
			`new_blocks.append('\n')`

			`new_text = "\n\n".join(new_blocks)`
			`return new_text.split("\n")`


			`class ReferencePreprocessor(Preprocessor):`
			`""" Remove reference definitions from text and store for later use. """`

			`TITLE = r'[ ]*(\"(.*)\"\|\'(.*)\'\|\((.*)\))[ ]*'`
			`RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)`
			`TITLE_RE = re.compile(r'^%s$' % TITLE)`

			`def run (self, lines):`
			`new_text = [];`
			`while lines:`
			`line = lines.pop(0)`
			`m = self.RE.match(line)`
			`if m:`
			`id = m.group(1).strip().lower()`
			`link = m.group(2).lstrip('<').rstrip('>')`
			`t = m.group(5) or m.group(6) or m.group(7)`
			`if not t:`
			`# Check next line for title`
			`tm = self.TITLE_RE.match(lines[0])`
			`if tm:`
			`lines.pop(0)`
			`t = tm.group(2) or tm.group(3) or tm.group(4)`
			`self.markdown.references[id] = (link, t)`
			`else:`
			`new_text.append(line)`

			`return new_text #+ "\n"`