From 7cb4f1d3d7504b6e45a2f3179f40c8b38cbcbf55 Mon Sep 17 00:00:00 2001 From: Gregory Soutade Date: Fri, 20 Mar 2020 16:55:45 +0100 Subject: [PATCH] Update Markdown parser from 2.1.1 to 2.3.2 --- dynastie/generators/markdown2.py | 545 +++++++++++++++++++++---------- 1 file changed, 365 insertions(+), 180 deletions(-) diff --git a/dynastie/generators/markdown2.py b/dynastie/generators/markdown2.py index fe3bcaa..d33a946 100755 --- a/dynastie/generators/markdown2.py +++ b/dynastie/generators/markdown2.py @@ -53,8 +53,9 @@ see for details): * header-ids: Adds "id" attributes to headers. The id value is a slug of the header text. * html-classes: Takes a dict mapping html tag names (lowercase) to a - string to use for a "class" tag attribute. Currently only supports - "pre" and "code" tags. Add an issue if you require this for other tags. + string to use for a "class" tag attribute. Currently only supports "img", + "table", "pre" and "code" tags. Add an issue if you require this for other + tags. * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to have markdown processing be done on its contents. Similar to but with @@ -70,9 +71,14 @@ see for details): * smarty-pants: Replaces ' and " with curly quotation marks or curly apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes, and ellipses. +* spoiler: A special kind of blockquote commonly hidden behind a + click on SO. Syntax per . * toc: The returned HTML string gets a new "toc_html" attribute which is a Table of Contents for the document. (experimental) * xml: Passes one-liner processing instructions and namespaced XML tags. +* tables: Tables using the same format as GFM + and + PHP-Markdown Extra . * wiki-tables: Google Code Wiki-style tables. See . """ @@ -82,13 +88,11 @@ see for details): # not yet sure if there implications with this. Compare 'pydoc sre' # and 'perldoc perlre'. -__version_info__ = (2, 1, 1) +__version_info__ = (2, 3, 2) __version__ = '.'.join(map(str, __version_info__)) __author__ = "Trent Mick" -import os import sys -from pprint import pprint import re import logging try: @@ -100,15 +104,9 @@ from random import random, randint import codecs -#---- Python version compat +# ---- Python version compat -try: - from urllib.parse import quote # python3 -except ImportError: - from urllib import quote # python2 - -if sys.version_info[:2] < (2,4): - from sets import Set as set +if sys.version_info[:2] < (2, 4): def reversed(sequence): for i in sequence[::-1]: yield i @@ -127,8 +125,7 @@ elif sys.version_info[0] >= 3: base_string_type = str - -#---- globals +# ---- globals DEBUG = False log = logging.getLogger("markdown") @@ -145,15 +142,12 @@ g_escape_table = dict([(ch, _hash_text(ch)) for ch in '\\`*_{}[]()>#+-.!']) - -#---- exceptions - +# ---- exceptions class MarkdownError(Exception): pass - -#---- public api +# ---- public api def markdown_path(path, encoding="utf-8", html4tags=False, tab_width=DEFAULT_TAB_WIDTH, @@ -167,6 +161,7 @@ def markdown_path(path, encoding="utf-8", link_patterns=link_patterns, use_file_vars=use_file_vars).convert(text) + def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, safe_mode=None, extras=None, link_patterns=None, use_file_vars=False): @@ -175,6 +170,7 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, link_patterns=link_patterns, use_file_vars=use_file_vars).convert(text) + class Markdown(object): # The dict of "extras" to enable in processing -- a mapping of # extra name to argument for the extra. Most extras do not have an @@ -222,7 +218,7 @@ class Markdown(object): extras = dict([(e, None) for e in extras]) self.extras.update(extras) assert isinstance(self.extras, dict) - if "toc" in self.extras and not "header-ids" in self.extras: + if "toc" in self.extras and "header-ids" not in self.extras: self.extras["header-ids"] = None # "toc" implies "header-ids" self._instance_extras = self.extras.copy() @@ -246,7 +242,7 @@ class Markdown(object): self.footnotes = {} self.footnote_ids = [] if "header-ids" in self.extras: - self._count_from_header_id = {} # no `defaultdict` in Python 2.4 + self._count_from_header_id = {} # no `defaultdict` in Python 2.4 if "metadata" in self.extras: self.metadata = {} @@ -254,6 +250,11 @@ class Markdown(object): # should only be used in tags with an "href" attribute. _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE) + # Opens the linked document in a new window or tab + # should only used in tags with an "target" attribute. + # same with _a_nofollow + _a_blank = _a_nofollow + def convert(self, text): """Convert the given text.""" # Main function. The order in which other subs are called here is @@ -268,7 +269,7 @@ class Markdown(object): self.reset() if not isinstance(text, unicode): - #TODO: perhaps shouldn't presume UTF-8 for string input? + # TODO: perhaps shouldn't presume UTF-8 for string input? text = unicode(text, 'utf-8') if self.use_file_vars: @@ -288,7 +289,8 @@ class Markdown(object): self.extras[ename] = earg # Standardize line endings: - text = re.sub("\r\n|\r", "\n", text) + text = text.replace("\r\n", "\n") + text = text.replace("\r", "\n") # Make sure $text ends with a couple of newlines: text += "\n\n" @@ -308,13 +310,16 @@ class Markdown(object): text = self.preprocess(text) + if "fenced-code-blocks" in self.extras and not self.safe_mode: + text = self._do_fenced_code_blocks(text) + if self.safe_mode: text = self._hash_html_spans(text) # Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True) - if "fenced-code-blocks" in self.extras: + if "fenced-code-blocks" in self.extras and self.safe_mode: text = self._do_fenced_code_blocks(text) # Strip link definitions, store in hashes. @@ -340,6 +345,9 @@ class Markdown(object): if "nofollow" in self.extras: text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text) + if "target-blank-links" in self.extras: + text = self._a_blank.sub(r'<\1 target="_blank"\2', text) + text += "\n" rv = UnicodeWithAttrs(text) @@ -363,18 +371,29 @@ class Markdown(object): """ return text - # Is metadata if the content starts with '---'-fenced `key: value` + # Is metadata if the content starts with optional '---'-fenced `key: value` # pairs. E.g. (indented for presentation): # --- # foo: bar # another-var: blah blah # --- - _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""") + # # header + # or: + # foo: bar + # another-var: blah blah + # + # # header + + _metadata_pat = re.compile(r""" + ^ + (?:---[\ \t]*\n)? # optional "---" + ((?:[ \t]*[^ \t:]+[\ \t]*:[^\n]*\n)+) # "key: value" pairs + (?:---[ \t]*)? # optional "---" + \n""", + re.VERBOSE + ) def _extract_metadata(self, text): - # fast test - if not text.startswith("---"): - return text match = self._metadata_pat.match(text) if not match: return text @@ -387,7 +406,6 @@ class Markdown(object): return tail - _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE) # This regular expression is intended to match blocks like this: # PREFIX Local Variables: SUFFIX @@ -412,7 +430,7 @@ class Markdown(object): http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables """ emacs_vars = {} - SIZE = pow(2, 13) # 8kB + SIZE = pow(2, 13) # 8kB # Search near the start for a '-*-'-style one-liner of variables. head = text[:SIZE] @@ -448,7 +466,7 @@ class Markdown(object): prefix = match.group("prefix") suffix = match.group("suffix") lines = match.group("content").splitlines(0) - #print "prefix=%r, suffix=%r, content=%r, lines: %s"\ + # print "prefix=%r, suffix=%r, content=%r, lines: %s"\ # % (prefix, suffix, match.group("content"), lines) # Validate the Local Variables block: proper prefix and suffix @@ -469,9 +487,9 @@ class Markdown(object): # Parse out one emacs var per line. continued_for = None - for line in lines[:-1]: # no var on the last line ("PREFIX End:") - if prefix: line = line[len(prefix):] # strip prefix - if suffix: line = line[:-len(suffix)] # strip suffix + for line in lines[:-1]: # no var on the last line ("PREFIX End:") + if prefix: line = line[len(prefix):] # strip prefix + if suffix: line = line[:-len(suffix)] # strip suffix line = line.strip() if continued_for: variable = continued_for @@ -505,14 +523,19 @@ class Markdown(object): return emacs_vars - # Cribbed from a post by Bart Lateur: - # - _detab_re = re.compile(r'(.*?)\t', re.M) - def _detab_sub(self, match): - g1 = match.group(1) - return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width)) + def _detab_line(self, line): + r"""Recusively convert tabs to spaces in a single line. + + Called from _detab().""" + if '\t' not in line: + return line + chunk1, chunk2 = line.split('\t', 1) + chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width)) + output = chunk1 + chunk2 + return self._detab_line(output) + def _detab(self, text): - r"""Remove (leading?) tabs from a file. + r"""Iterate text line by line and convert tabs to spaces. >>> m = Markdown() >>> m._detab("\tfoo") @@ -528,7 +551,10 @@ class Markdown(object): """ if '\t' not in text: return text - return self._detab_re.subn(self._detab_sub, text)[0] + output = [] + for line in text.splitlines(): + output.append(self._detab_line(line)) + return '\n'.join(output) # I broke out the html5 tags here and add them to _block_tags_a and # _block_tags_b. This way html5 tags are easy to keep track of. @@ -776,12 +802,7 @@ class Markdown(object): re.X | re.M) return footnote_def_re.sub(self._extract_footnote_def_sub, text) - - _hr_data = [ - ('*', re.compile(r"^[ ]{0,3}\*(.*?)$", re.M)), - ('-', re.compile(r"^[ ]{0,3}\-(.*?)$", re.M)), - ('_', re.compile(r"^[ ]{0,3}\_(.*?)$", re.M)), - ] + _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M) def _run_block_gamut(self, text): # These are all the transformations that form block-level @@ -798,13 +819,7 @@ class Markdown(object): # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the # hr chars to one or two. We'll reproduce that limit here. hr = "\n' % self._html_class_str_from_tag('table'), '', ''] + cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')] + for col_idx, col in enumerate(cols): + hlines.append(' %s' % ( + align_from_col_idx.get(col_idx, ''), + self._run_span_gamut(col) + )) + hlines.append('') + hlines.append('') + + # tbody + hlines.append('') + for line in body.strip('\n').split('\n'): + hlines.append('') + cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')] + for col_idx, col in enumerate(cols): + hlines.append(' %s' % ( + align_from_col_idx.get(col_idx, ''), + self._run_span_gamut(col) + )) + hlines.append('') + hlines.append('') + hlines.append('') + + return '\n'.join(hlines) + '\n' + + def _do_tables(self, text): + """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from + https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538 + """ + less_than_tab = self.tab_width - 1 + table_re = re.compile(r''' + (?:(?<=\n\n)|\A\n?) # leading blank line + + ^[ ]{0,%d} # allowed whitespace + (.*[|].*) \n # $1: header row (at least one pipe) + + ^[ ]{0,%d} # allowed whitespace + ( # $2: underline row + # underline row with leading bar + (?: \|\ *:?-+:?\ * )+ \|? \n + | + # or, underline row without leading bar + (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n + ) + + ( # $3: data rows + (?: + ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces + .*\|.* \n + )+ + ) + ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X) + return table_re.sub(self._table_sub, text) + def _wiki_table_sub(self, match): ttext = match.group(0).strip() - #print 'wiki table: %r' % match.group(0) + # print 'wiki table: %r' % match.group(0) rows = [] for line in ttext.splitlines(0): line = line.strip()[2:-2].strip() row = [c.strip() for c in re.split(r'(?', ''] + # pprint(rows) + hlines = ['' % self._html_class_str_from_tag('table'), ''] for row in rows: hrow = [''] for cell in row: @@ -907,13 +997,19 @@ class Markdown(object): text = self._encode_amps_and_angles(text) + if "strike" in self.extras: + text = self._do_strike(text) + text = self._do_italics_and_bold(text) if "smarty-pants" in self.extras: text = self._do_smart_punctuation(text) # Do hard breaks: - text = re.sub(r" {2,}\n", " # \1 - <.*?> - | - .*? - ) - [ \t]* - ( # \2 - (['"]) # quote char = \3 + _inline_link_title = re.compile(r''' + ( # \1 + [ \t]+ + (['"]) # quote char = \2 (?P.*?) - \3 # matching quote + \2 )? # title is optional - \) + \)$ ''', re.X | re.S) _tail_of_reference_link_re = re.compile(r''' # Match tail of: [text][id] @@ -1029,6 +1117,52 @@ class Markdown(object): \] ''', re.X | re.S) + _whitespace = re.compile(r'\s*') + + _strip_anglebrackets = re.compile(r'<(.*)>.*') + + def _find_non_whitespace(self, text, start): + """Returns the index of the first non-whitespace character in text + after (and including) start + """ + match = self._whitespace.match(text, start) + return match.end() + + def _find_balanced(self, text, start, open_c, close_c): + """Returns the index where the open_c and close_c characters balance + out - the same number of open_c and close_c are encountered - or the + end of string if it's reached before the balance point is found. + """ + i = start + l = len(text) + count = 1 + while count > 0 and i < l: + if text[i] == open_c: + count += 1 + elif text[i] == close_c: + count -= 1 + i += 1 + return i + + def _extract_url_and_title(self, text, start): + """Extracts the url and (optional) title from the tail of a link""" + # text[start] equals the opening parenthesis + idx = self._find_non_whitespace(text, start+1) + if idx == len(text): + return None, None, None + end_idx = idx + has_anglebrackets = text[idx] == "<" + if has_anglebrackets: + end_idx = self._find_balanced(text, end_idx+1, "<", ">") + end_idx = self._find_balanced(text, end_idx, "(", ")") + match = self._inline_link_title.search(text, idx, end_idx) + if not match: + return None, None, None + url, title = text[idx:match.start()], match.group("title") + if has_anglebrackets: + url = self._strip_anglebrackets.sub(r'\1', url) + return url, title, end_idx + def _do_links(self, text): """Turn Markdown link shortcuts into XHTML <a> and <img> tags. @@ -1046,7 +1180,7 @@ class Markdown(object): anchor_allowed_pos = 0 curr_pos = 0 - while True: # Handle the next link. + while True: # Handle the next link. # The next '[' is the start of: # - an inline anchor: [text](url "title") # - a reference anchor: [text][id] @@ -1110,9 +1244,9 @@ class Markdown(object): return text # Inline anchor or img? - if text[p] == '(': # attempt at perf improvement - match = self._tail_of_inline_link_re.match(text, p) - if match: + if text[p] == '(': # attempt at perf improvement + url, title, url_end_idx = self._extract_url_and_title(text, p) + if url is not None: # Handle an inline anchor or img. is_img = start_idx > 0 and text[start_idx-1] == "!" if is_img: @@ -1120,12 +1254,9 @@ class Markdown(object): is_inline_img = start_idx > 0 and text[start_idx-1] == "#" if is_inline_img: - start_idx -= 1 - is_img = 1 + start_idx -= 1 + is_img = 1 - url, title = match.group("url"), match.group("title") - if url and url[0] == '<': - url = url[1:-1] # '<url>' -> 'url' # We've got to encode these to avoid conflicting # with italics/bold. url = url.replace('*', self._escape_table['*']) \ @@ -1138,20 +1269,17 @@ class Markdown(object): else: title_str = '' if is_img: + img_class_str = self._html_class_str_from_tag("img") if is_inline_img: - result = '<img class="inlineimage" src="%s" alt="%s"%s%s' \ - % (url.replace('"', '"'), - _xml_escape_attr(link_text), - title_str, self.empty_element_suffix) - else: - result = '<img src="%s" alt="%s"%s%s' \ - % (url.replace('"', '"'), - _xml_escape_attr(link_text), - title_str, self.empty_element_suffix) + img_class_str = ' class="inlineimage"' + result = '<img src="%s" alt="%s"%s%s%s' \ + % (url.replace('"', '"'), + _xml_escape_attr(link_text), + title_str, img_class_str, self.empty_element_suffix) if "smarty-pants" in self.extras: result = result.replace('"', self._escape_table['"']) curr_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] + text = text[:start_idx] + result + text[url_end_idx:] elif start_idx >= anchor_allowed_pos: result_head = '<a href="%s"%s>' % (url, title_str) result = '%s%s</a>' % (result_head, link_text) @@ -1161,7 +1289,7 @@ class Markdown(object): # anchor_allowed_pos on. curr_pos = start_idx + len(result_head) anchor_allowed_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] + text = text[:start_idx] + result + text[url_end_idx:] else: # Anchor not allowed here. curr_pos = start_idx + 1 @@ -1186,7 +1314,6 @@ class Markdown(object): .replace('_', self._escape_table['_']) title = self.titles.get(link_id) if title: - before = title title = _xml_escape_attr(title) \ .replace('*', self._escape_table['*']) \ .replace('_', self._escape_table['_']) @@ -1194,10 +1321,11 @@ class Markdown(object): else: title_str = '' if is_img: - result = '<img src="%s" alt="%s"%s%s' \ + img_class_str = self._html_class_str_from_tag("img") + result = '<img src="%s" alt="%s"%s%s%s' \ % (url.replace('"', '"'), link_text.replace('"', '"'), - title_str, self.empty_element_suffix) + title_str, img_class_str, self.empty_element_suffix) if "smarty-pants" in self.extras: result = result.replace('"', self._escape_table['"']) curr_pos = start_idx + len(result) @@ -1258,44 +1386,42 @@ class Markdown(object): self._toc = [] self._toc.append((level, id, self._unescape_special_chars(name))) - _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M) - def _setext_h_sub(self, match): - n = {"=": 1, "-": 2}[match.group(2)[0]] - demote_headers = self.extras.get("demote-headers") - if demote_headers: - n = min(n + demote_headers, 6) - header_id_attr = "" - if "header-ids" in self.extras: - header_id = self.header_id_from_text(match.group(1), - self.extras["header-ids"], n) - if header_id: - header_id_attr = ' id="%s"' % header_id - html = self._run_span_gamut(match.group(1)) - if "toc" in self.extras and header_id: - self._toc_add_entry(n, header_id, html) - return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n) - - _atx_h_re = re.compile(r''' - ^(\#{1,6}) # \1 = string of #'s - [ \t]+ + _h_re_base = r''' + (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+) + | + (^(\#{1,6}) # \1 = string of #'s + [ \t]%s (.+?) # \2 = Header text [ \t]* (?<!\\) # ensure not an escaped trailing '#' \#* # optional closing #'s (not counted) \n+ - ''', re.X | re.M) - def _atx_h_sub(self, match): - n = len(match.group(1)) + ) + ''' + + _h_re = re.compile(_h_re_base % '*', re.X | re.M) + _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M) + + def _h_sub(self, match): + if match.group(1) is not None: + # Setext header + n = {"=": 1, "-": 2}[match.group(3)[0]] + header_group = match.group(2) + else: + # atx header + n = len(match.group(5)) + header_group = match.group(6) + demote_headers = self.extras.get("demote-headers") if demote_headers: n = min(n + demote_headers, 6) header_id_attr = "" if "header-ids" in self.extras: - header_id = self.header_id_from_text(match.group(2), + header_id = self.header_id_from_text(header_group, self.extras["header-ids"], n) if header_id: header_id_attr = ' id="%s"' % header_id - html = self._run_span_gamut(match.group(2)) + html = self._run_span_gamut(header_group) if "toc" in self.extras and header_id: self._toc_add_entry(n, header_id, html) return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n) @@ -1307,7 +1433,6 @@ class Markdown(object): # # Header 2 # -------- - text = self._setext_h_re.sub(self._setext_h_sub, text) # atx-style headers: # # Header 1 @@ -1315,12 +1440,12 @@ class Markdown(object): # ## Header 2 with closing hashes ## # ... # ###### Header 6 - text = self._atx_h_re.sub(self._atx_h_sub, text) - return text + if 'tag-friendly' in self.extras: + return self._h_re_tag_friendly.sub(self._h_sub, text) + return self._h_re.sub(self._h_sub, text) - - _marker_ul_chars = '*+-' + _marker_ul_chars = '*+-' _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars _marker_ul = '(?:[%s])' % _marker_ul_chars _marker_ol = r'(?:\d+\.)' @@ -1380,8 +1505,9 @@ class Markdown(object): hits.sort() match = hits[0][1] start, end = match.span() - text = text[:start] + self._list_sub(match) + text[end:] - pos = end + middle = self._list_sub(match) + text = text[:start] + middle + text[end:] + pos = start + len(middle) # start pos for next attempted match return text @@ -1390,16 +1516,30 @@ class Markdown(object): (^[ \t]*) # leading whitespace = \2 (?P<marker>%s) [ \t]+ # list marker = \3 ((?:.+?) # list item text = \4 - (\n{1,2})) # eols = \5 + (\n{1,2})) # eols = \5 (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+)) ''' % (_marker_any, _marker_any), re.M | re.X | re.S) + _task_list_item_re = re.compile(r''' + (\[[\ x]\])[ \t]+ # tasklist marker = \1 + (.*) # list item text = \2 + ''', re.M | re.X | re.S) + + _task_list_warpper_str = r'<p><input type="checkbox" class="task-list-item-checkbox" %sdisabled>%s</p>' + + def _task_list_item_sub(self, match): + marker = match.group(1) + item_text = match.group(2) + if marker == '[x]': + return self._task_list_warpper_str % ('checked ', item_text) + elif marker == '[ ]': + return self._task_list_warpper_str % ('', item_text) + _last_li_endswith_two_eols = False def _list_item_sub(self, match): item = match.group(4) leading_line = match.group(1) - leading_space = match.group(2) if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: item = self._run_block_gamut(self._outdent(item)) else: @@ -1409,6 +1549,10 @@ class Markdown(object): item = item[:-1] item = self._run_span_gamut(item) self._last_li_endswith_two_eols = (len(match.group(5)) == 2) + + if "task_list" in self.extras: + item = self._task_list_item_re.sub(self._task_list_item_sub, item) + return "<li>%s</li>\n" % item def _process_list_items(self, list_str): @@ -1497,8 +1641,20 @@ class Markdown(object): formatter_opts = self.extras['code-color'] or {} if lexer_name: + def unhash_code(codeblock): + for key, sanitized in list(self.html_spans.items()): + codeblock = codeblock.replace(key, sanitized) + replacements = [ + ("&", "&"), + ("<", "<"), + (">", ">") + ] + for old, new in replacements: + codeblock = codeblock.replace(old, new) + return codeblock lexer = self._get_pygments_lexer(lexer_name) if lexer: + codeblock = unhash_code( codeblock ) colored = self._color_with_pygments(codeblock, lexer, **formatter_opts) return "\n\n%s\n\n" % colored @@ -1535,19 +1691,22 @@ class Markdown(object): )+ ) ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc + # Lookahead to make sure this block isn't already in a code block. + # Needed when syntax highlighting is being used. + (?![^<]*\</code\>) ''' % (self.tab_width, self.tab_width), re.M | re.X) return code_block_re.sub(self._code_block_sub, text) _fenced_code_block_re = re.compile(r''' - (?:\n\n|\A\n?) + (?:\n+|\A\n?) ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang (.*?) # $2 = code block content ^```[ \t]*\n # closing fence ''', re.M | re.X | re.S) def _fenced_code_block_sub(self, match): - return self._code_block_sub(match, is_fenced_code_block=True); + return self._code_block_sub(match, is_fenced_code_block=True) def _do_fenced_code_blocks(self, text): """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra).""" @@ -1620,12 +1779,17 @@ class Markdown(object): self._escape_table[text] = hashed return hashed + _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S) + def _do_strike(self, text): + text = self._strike_re.sub(r"<strike>\1</strike>", text) + return text + _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) - _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) - _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) _code_friendly_line_re = re.compile(r"\~\~(?=\S)(.+?)(?<=\S)\~\~", re.S) _code_friendly_underline_re = re.compile(r"\~(?=\S)(.+?)(?<=\S)\~", re.S) + _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) + _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) def _do_italics_and_bold(self, text): # <strong> must go first: if "code-friendly" in self.extras: @@ -1670,12 +1834,12 @@ class Markdown(object): <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a discussion of some diversion from the original SmartyPants. """ - if "'" in text: # guard for perf + if "'" in text: # guard for perf text = self._do_smart_contractions(text) text = self._opening_single_quote_re.sub("‘", text) text = self._closing_single_quote_re.sub("’", text) - if '"' in text: # guard for perf + if '"' in text: # guard for perf text = self._opening_double_quote_re.sub("“", text) text = self._closing_double_quote_re.sub("”", text) @@ -1686,38 +1850,53 @@ class Markdown(object): text = text.replace(". . .", "…") return text - _block_quote_re = re.compile(r''' + _block_quote_base = r''' ( # Wrap whole match in \1 ( - ^[ \t]*>[ \t]? # '>' at the start of a line + ^[ \t]*>%s[ \t]? # '>' at the start of a line .+\n # rest of the first line (.+\n)* # subsequent consecutive lines \n* # blanks )+ ) - ''', re.M | re.X) - _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M); - + ''' + _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X) + _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X) + _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M) + _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M) + _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M) _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) def _dedent_two_spaces_sub(self, match): return re.sub(r'(?m)^ ', '', match.group(1)) def _block_quote_sub(self, match): bq = match.group(1) - bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting - bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines + is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq) + # trim one level of quoting + if is_spoiler: + bq = self._bq_one_level_re_spoiler.sub('', bq) + else: + bq = self._bq_one_level_re.sub('', bq) + # trim whitespace-only lines + bq = self._ws_only_line_re.sub('', bq) bq = self._run_block_gamut(bq) # recurse bq = re.sub('(?m)^', ' ', bq) # These leading spaces screw with <pre> content, so we need to fix that: bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) - return "<blockquote>\n%s\n</blockquote>\n\n" % bq + if is_spoiler: + return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq + else: + return '<blockquote>\n%s\n</blockquote>\n\n' % bq def _do_block_quotes(self, text): if '>' not in text: return text - return self._block_quote_re.sub(self._block_quote_sub, text) + if 'spoiler' in self.extras: + return self._block_quote_re_spoiler.sub(self._block_quote_sub, text) + else: + return self._block_quote_re.sub(self._block_quote_sub, text) def _form_paragraphs(self, text): # Strip leading and trailing lines: @@ -1774,7 +1953,7 @@ class Markdown(object): '↩</a>' % (id, i+1)) if footer[-1].endswith("</p>"): footer[-1] = footer[-1][:-len("</p>")] \ - + ' ' + backlink + "</p>" + + ' ' + backlink + "</p>" else: footer.append("\n<p>%s</p>" % backlink) footer.append('</li>') @@ -1910,7 +2089,7 @@ class MarkdownWithExtras(Markdown): extras = ["footnotes", "code-color"] -#---- internal support functions +# ---- internal support functions class UnicodeWithAttrs(unicode): """A subclass of unicode used for the return value of conversion to @@ -1979,6 +2158,7 @@ def _curry(*args, **kwargs): return function(*args + rest, **combined) return result + # Recipe: regex_from_encoded_pattern (1.0) def _regex_from_encoded_pattern(s): """'foo' -> re.compile(re.escape('foo')) @@ -2005,9 +2185,10 @@ def _regex_from_encoded_pattern(s): "(must be one of '%s')" % (char, s, ''.join(list(flag_from_char.keys())))) return re.compile(s[1:idx], flags) - else: # not an encoded regex + else: # not an encoded regex return re.compile(re.escape(s)) + # Recipe: dedent (0.1.2) def _dedentlines(lines, tabsize=8, skip_first_line=False): """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines @@ -2025,7 +2206,6 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): if DEBUG: print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ % (tabsize, skip_first_line)) - indents = [] margin = None for i, line in enumerate(lines): if i == 0 and skip_first_line: continue @@ -2036,11 +2216,11 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): elif ch == '\t': indent += tabsize - (indent % tabsize) elif ch in '\r\n': - continue # skip all-whitespace lines + continue # skip all-whitespace lines else: break else: - continue # skip all-whitespace lines + continue # skip all-whitespace lines if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) if margin is None: margin = indent @@ -2079,6 +2259,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False): lines[i] = lines[i][removed:] return lines + def _dedent(text, tabsize=8, skip_first_line=False): """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text @@ -2096,28 +2277,30 @@ def _dedent(text, tabsize=8, skip_first_line=False): class _memoized(object): - """Decorator that caches a function's return value each time it is called. - If called later with the same arguments, the cached value is returned, and - not re-evaluated. + """Decorator that caches a function's return value each time it is called. + If called later with the same arguments, the cached value is returned, and + not re-evaluated. - http://wiki.python.org/moin/PythonDecoratorLibrary - """ - def __init__(self, func): - self.func = func - self.cache = {} - def __call__(self, *args): - try: - return self.cache[args] - except KeyError: - self.cache[args] = value = self.func(*args) - return value - except TypeError: - # uncachable -- for instance, passing a list as an argument. - # Better to not cache than to blow up entirely. - return self.func(*args) - def __repr__(self): - """Return the function's docstring.""" - return self.func.__doc__ + http://wiki.python.org/moin/PythonDecoratorLibrary + """ + def __init__(self, func): + self.func = func + self.cache = {} + + def __call__(self, *args): + try: + return self.cache[args] + except KeyError: + self.cache[args] = value = self.func(*args) + return value + except TypeError: + # uncachable -- for instance, passing a list as an argument. + # Better to not cache than to blow up entirely. + return self.func(*args) + + def __repr__(self): + """Return the function's docstring.""" + return self.func.__doc__ def _xml_oneliner_re_from_tab_width(tab_width): @@ -2141,8 +2324,9 @@ def _xml_oneliner_re_from_tab_width(tab_width): """ % (tab_width - 1), re.X) _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) + def _hr_tag_re_from_tab_width(tab_width): - return re.compile(r""" + return re.compile(r""" (?: (?<=\n\n) # Starting after a blank line | # or @@ -2191,18 +2375,19 @@ def _xml_encode_email_char_at_random(ch): return '&#%s;' % ord(ch) - -#---- mainline +# ---- mainline class _NoReflowFormatter(optparse.IndentedHelpFormatter): """An optparse formatter that does NOT reflow the description.""" def format_description(self, description): return description or "" + def _test(): import doctest doctest.testmod() + def main(argv=None): if argv is None: argv = sys.argv @@ -2319,7 +2504,7 @@ def main(argv=None): sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) if extras and "toc" in extras: log.debug("toc_html: " + - html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) + str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))) if opts.compare: test_dir = join(dirname(dirname(abspath(__file__))), "test") if exists(join(test_dir, "test_markdown2.py")): @@ -2334,4 +2519,4 @@ def main(argv=None): if __name__ == "__main__": - sys.exit( main(sys.argv) ) + sys.exit(main(sys.argv))