Added markdown parsing for comments so links in comments now appear properly.

Also rewrote part of the HTML rewriting code so it doesn't introduce an additional wrapping element in the output which was added due to a html5lib requirements on input.
2025-12-18 05:43:26 +00:00 · 2012-12-15 21:43:06 +11:00
parent 5ac8492f38
commit 993bee4fc1
33 changed files with 5217 additions and 10 deletions
--- a/python/packages/markdown/preprocessors.py
+++ b/python/packages/markdown/preprocessors.py
@@ -0,0 +1,283 @@
+"""
+PRE-PROCESSORS
+=============================================================================
+
+Preprocessors work on source text before we start doing anything too
+complicated. 
+"""
+
+import re
+import util
+import odict
+
+
+def build_preprocessors(md_instance, **kwargs):
+    """ Build the default set of preprocessors used by Markdown. """
+    preprocessors = odict.OrderedDict()
+    if md_instance.safeMode != 'escape':
+        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
+    preprocessors["reference"] = ReferencePreprocessor(md_instance)
+    return preprocessors
+
+
+class Preprocessor(util.Processor):
+    """
+    Preprocessors are run after the text is broken into lines.
+
+    Each preprocessor implements a "run" method that takes a pointer to a
+    list of lines of the document, modifies it as necessary and returns
+    either the same pointer or a pointer to a new list.
+
+    Preprocessors must extend markdown.Preprocessor.
+
+    """
+    def run(self, lines):
+        """
+        Each subclass of Preprocessor should override the `run` method, which
+        takes the document as a list of strings split by newlines and returns
+        the (possibly modified) list of lines.
+
+        """
+        pass
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+    """Remove html blocks from the text and store them for later retrieval."""
+
+    right_tag_patterns = ["</%s>", "%s>"]
+    attrs_pattern = r"""
+        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
+        |                                                         # OR 
+        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
+        |                                                         # OR
+        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
+        """
+    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
+    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
+    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
+    markdown_in_raw = False
+
+    def _get_left_tag(self, block):
+        m = self.left_tag_re.match(block)
+        if m:
+            tag = m.group('tag')
+            raw_attrs = m.group('attrs')
+            attrs = {}
+            if raw_attrs:
+                for ma in self.attrs_re.finditer(raw_attrs):
+                    if ma.group('attr'):
+                        if ma.group('value'):
+                            attrs[ma.group('attr').strip()] = ma.group('value')
+                        else:
+                            attrs[ma.group('attr').strip()] = ""
+                    elif ma.group('attr1'):
+                        if ma.group('value1'):
+                            attrs[ma.group('attr1').strip()] = ma.group('value1')
+                        else:
+                            attrs[ma.group('attr1').strip()] = ""
+                    elif ma.group('attr2'):
+                        attrs[ma.group('attr2').strip()] = ""
+            return tag, len(m.group(0)), attrs
+        else:
+            tag = block[1:].split(">", 1)[0].lower()
+            return tag, len(tag)+2, {}
+
+    def _recursive_tagfind(self, ltag, rtag, start_index, block):
+        while 1:
+            i = block.find(rtag, start_index)
+            if i == -1:
+                return -1
+            j = block.find(ltag, start_index) 
+            # if no ltag, or rtag found before another ltag, return index
+            if (j > i or j == -1):
+                return i + len(rtag)
+            # another ltag found before rtag, use end of ltag as starting
+            # point and search again
+            j = block.find('>', j)
+            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
+            if start_index == -1:
+                # HTML potentially malformed- ltag has no corresponding 
+                # rtag
+                return -1
+
+    def _get_right_tag(self, left_tag, left_index, block):
+        for p in self.right_tag_patterns:
+            tag = p % left_tag
+            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
+            if i > 2:
+                return tag.lstrip("<").rstrip(">"), i
+        return block.rstrip()[-left_index:-1].lower(), len(block)
+    
+    def _equal_tags(self, left_tag, right_tag):
+        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
+            return True
+        if ("/" + left_tag) == right_tag:
+            return True
+        if (right_tag == "--" and left_tag == "--"):
+            return True
+        elif left_tag == right_tag[1:] \
+            and right_tag[0] == "/":
+            return True
+        else:
+            return False
+
+    def _is_oneliner(self, tag):
+        return (tag in ['hr', 'hr/'])
+
+    def run(self, lines):
+        text = "\n".join(lines)
+        new_blocks = []
+        text = text.split("\n\n")
+        items = []
+        left_tag = ''
+        right_tag = ''
+        in_tag = False # flag
+
+        while text:
+            block = text[0]
+            if block.startswith("\n"):
+                block = block[1:]
+            text = text[1:]
+
+            if block.startswith("\n"):
+                block = block[1:]
+
+            if not in_tag:
+                if block.startswith("<") and len(block.strip()) > 1:
+
+                    if block[1] == "!":
+                        # is a comment block
+                        left_tag, left_index, attrs  = "--", 2, {}
+                    else:
+                        left_tag, left_index, attrs = self._get_left_tag(block)
+                    right_tag, data_index = self._get_right_tag(left_tag, 
+                                                                left_index,
+                                                                block)
+                    # keep checking conditions below and maybe just append
+                    
+                    if data_index < len(block) \
+                        and (util.isBlockLevel(left_tag)
+                        or left_tag == '--'): 
+                        text.insert(0, block[data_index:])
+                        block = block[:data_index]
+
+                    if not (util.isBlockLevel(left_tag) \
+                        or block[1] in ["!", "?", "@", "%"]):
+                        new_blocks.append(block)
+                        continue
+
+                    if self._is_oneliner(left_tag):
+                        new_blocks.append(block.strip())
+                        continue
+
+                    if block.rstrip().endswith(">") \
+                        and self._equal_tags(left_tag, right_tag):
+                        if self.markdown_in_raw and 'markdown' in attrs.keys():
+                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
+                                           '', block[:left_index])
+                            end = block[-len(right_tag)-2:]
+                            block = block[left_index:-len(right_tag)-2]
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(start))
+                            new_blocks.append(block)
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(end))
+                        else:
+                            new_blocks.append(
+                                self.markdown.htmlStash.store(block.strip()))
+                        continue
+                    else: 
+                        # if is block level tag and is not complete
+
+                        if util.isBlockLevel(left_tag) or left_tag == "--" \
+                            and not block.rstrip().endswith(">"):
+                            items.append(block.strip())
+                            in_tag = True
+                        else:
+                            new_blocks.append(
+                            self.markdown.htmlStash.store(block.strip()))
+
+                        continue
+
+                new_blocks.append(block)
+
+            else:
+                items.append(block)
+
+                right_tag, data_index = self._get_right_tag(left_tag, 0, block)
+
+                if self._equal_tags(left_tag, right_tag):
+                    # if find closing tag
+                    
+                    if data_index < len(block):
+                        # we have more text after right_tag
+                        items[-1] = block[:data_index]
+                        text.insert(0, block[data_index:])
+
+                    in_tag = False
+                    if self.markdown_in_raw and 'markdown' in attrs.keys():
+                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
+                                       '', items[0][:left_index])
+                        items[0] = items[0][left_index:]
+                        end = items[-1][-len(right_tag)-2:]
+                        items[-1] = items[-1][:-len(right_tag)-2]
+                        new_blocks.append(
+                            self.markdown.htmlStash.store(start))
+                        new_blocks.extend(items)
+                        new_blocks.append(
+                            self.markdown.htmlStash.store(end))
+                    else:
+                        new_blocks.append(
+                            self.markdown.htmlStash.store('\n\n'.join(items)))
+                    items = []
+
+        if items:
+            if self.markdown_in_raw and 'markdown' in attrs.keys():
+                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
+                               '', items[0][:left_index])
+                items[0] = items[0][left_index:]
+                end = items[-1][-len(right_tag)-2:]
+                items[-1] = items[-1][:-len(right_tag)-2]
+                new_blocks.append(
+                    self.markdown.htmlStash.store(start))
+                new_blocks.extend(items)
+                if end.strip():
+                    new_blocks.append(
+                        self.markdown.htmlStash.store(end))
+            else:
+                new_blocks.append(
+                    self.markdown.htmlStash.store('\n\n'.join(items)))
+            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
+            new_blocks.append('\n')
+
+        new_text = "\n\n".join(new_blocks)
+        return new_text.split("\n")
+
+
+class ReferencePreprocessor(Preprocessor):
+    """ Remove reference definitions from text and store for later use. """
+
+    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
+    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
+    TITLE_RE = re.compile(r'^%s$' % TITLE)
+
+    def run (self, lines):
+        new_text = [];
+        while lines:
+            line = lines.pop(0)
+            m = self.RE.match(line)
+            if m:
+                id = m.group(1).strip().lower()
+                link = m.group(2).lstrip('<').rstrip('>')
+                t = m.group(5) or m.group(6) or m.group(7)
+                if not t:
+                    # Check next line for title
+                    tm = self.TITLE_RE.match(lines[0])
+                    if tm:
+                        lines.pop(0)
+                        t = tm.group(2) or tm.group(3) or tm.group(4)
+                self.markdown.references[id] = (link, t)
+            else:
+                new_text.append(line)
+
+        return new_text #+ "\n"