From 64af5997aaefa98599eb5ad73f08438083ab0767 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sat, 4 Oct 2025 10:51:18 +0100 Subject: [PATCH 1/5] Fix #641 --- lib/markdown2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 488b24cc..01dce483 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -3320,7 +3320,7 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]): self.middle_word_em_re = re.compile( r''' (? Date: Sat, 4 Oct 2025 11:55:32 +0100 Subject: [PATCH 2/5] Fix #642 --- lib/markdown2.py | 27 +++++++++++++++++++++++++-- test/tm-cases/ems_across_spans.html | 1 + test/tm-cases/ems_across_spans.text | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 test/tm-cases/ems_across_spans.html create mode 100644 test/tm-cases/ems_across_spans.text diff --git a/lib/markdown2.py b/lib/markdown2.py index 01dce483..dde8f704 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1993,9 +1993,32 @@ def _encode_code(self, text: str) -> str: @mark_stage(Stage.ITALIC_AND_BOLD) def _do_italics_and_bold(self, text: str) -> str: + def sub(match: re.Match): + ''' + regex sub function that checks that the match isn't matching across spans. + The span shouldn't be across a closing or opening HTML tag, although spans within + the span is acceptable. + ''' + contents: str = match.group(2) + # look for all possible span HTML tags + for tag in re.findall(rf'abcdef_`, which is across 2 spans + close_index = contents.find(f'{contents}' + # must go first: - text = self._strong_re.sub(r"\2", text) - text = self._em_re.sub(r"\2", text) + text = self._strong_re.sub(sub, text) + text = self._em_re.sub(sub, text) return text _block_quote_base = r''' diff --git a/test/tm-cases/ems_across_spans.html b/test/tm-cases/ems_across_spans.html new file mode 100644 index 00000000..daef521f --- /dev/null +++ b/test/tm-cases/ems_across_spans.html @@ -0,0 +1 @@ +

<p><strong>_confusing</strong> ident is <strong>_confusing</strong></p>

diff --git a/test/tm-cases/ems_across_spans.text b/test/tm-cases/ems_across_spans.text new file mode 100644 index 00000000..40cd465c --- /dev/null +++ b/test/tm-cases/ems_across_spans.text @@ -0,0 +1 @@ +**_confusing** ident is **_confusing** \ No newline at end of file From 9a48294af4c0c5c794ea77907f700c3f67085fe8 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 5 Oct 2025 16:46:36 +0100 Subject: [PATCH 3/5] Fix #643 --- lib/markdown2.py | 19 +++++++++++++++---- .../Strong and em together.html | 8 ++++---- test/tm-cases/consecutive_strong_and_em.html | 1 + test/tm-cases/consecutive_strong_and_em.text | 1 + test/tm-cases/middle_word_em_issue641.html | 3 +++ test/tm-cases/middle_word_em_issue641.opts | 1 + test/tm-cases/middle_word_em_issue641.text | 3 +++ .../middle_word_em_with_extra_ems.html | 2 +- 8 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 test/tm-cases/consecutive_strong_and_em.html create mode 100644 test/tm-cases/consecutive_strong_and_em.text create mode 100644 test/tm-cases/middle_word_em_issue641.html create mode 100644 test/tm-cases/middle_word_em_issue641.opts create mode 100644 test/tm-cases/middle_word_em_issue641.text diff --git a/lib/markdown2.py b/lib/markdown2.py index dde8f704..bd958c5d 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1988,7 +1988,16 @@ def _encode_code(self, text: str) -> str: self._code_table[text] = hashed return hashed - _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S) + _strong_re = re.compile(r''' + [*_]* # ignore any leading em chars because we want to wrap `` as tightly around the text as possible + # eg: `***abc***` -> `*abc*` instead of `*abc*` + # Makes subsequent processing easier + (\*\*|__)(?=\S) # strong syntax - must be followed by a non whitespace char + (.+?) 
# the strong text itself + (?<=\S)\1 # closing syntax - must be preceeded by non whitespace char + ''', + re.S | re.X + ) _em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S) @mark_stage(Stage.ITALIC_AND_BOLD) @@ -2000,21 +2009,23 @@ def sub(match: re.Match): the span is acceptable. ''' contents: str = match.group(2) + # the strong re also checks for leading em chars, so the match may cover some additional text + prefix = match.string[match.start(): match.regs[1][0]] # look for all possible span HTML tags for tag in re.findall(rf'abcdef_`, which is across 2 spans close_index = contents.find(f'{contents}' + return f'{prefix}<{syntax}>{contents}' # must go first: text = self._strong_re.sub(sub, text) diff --git a/test/markdowntest-cases/Strong and em together.html b/test/markdowntest-cases/Strong and em together.html index 71ec78c7..bab1b98f 100644 --- a/test/markdowntest-cases/Strong and em together.html +++ b/test/markdowntest-cases/Strong and em together.html @@ -1,7 +1,7 @@ -

<p><strong><em>This is strong and em.</em></strong></p>

+

<p><em><strong>This is strong and em.</strong></em></p>

-

<p>So is <strong><em>this</em></strong> word.</p>

+

<p>So is <em><strong>this</strong></em> word.</p>

-

<p><strong><em>This is strong and em.</em></strong></p>

+

<p><em><strong>This is strong and em.</strong></em></p>

-

<p>So is <strong><em>this</em></strong> word.</p>

+

<p>So is <em><strong>this</strong></em> word.</p>

diff --git a/test/tm-cases/consecutive_strong_and_em.html b/test/tm-cases/consecutive_strong_and_em.html new file mode 100644 index 00000000..6478dd07 --- /dev/null +++ b/test/tm-cases/consecutive_strong_and_em.html @@ -0,0 +1 @@ +

<p><strong>strong</strong><em>em</em><strong>strong</strong></p>

diff --git a/test/tm-cases/consecutive_strong_and_em.text b/test/tm-cases/consecutive_strong_and_em.text new file mode 100644 index 00000000..663723f9 --- /dev/null +++ b/test/tm-cases/consecutive_strong_and_em.text @@ -0,0 +1 @@ +**strong***em***strong** diff --git a/test/tm-cases/middle_word_em_issue641.html b/test/tm-cases/middle_word_em_issue641.html new file mode 100644 index 00000000..39886631 --- /dev/null +++ b/test/tm-cases/middle_word_em_issue641.html @@ -0,0 +1,3 @@ +

<p><strong>Strong</strong> (<em>em</em>)</p>

+ +

<p>note:<em>this is good</em>, but <em>this is not</em></p>

diff --git a/test/tm-cases/middle_word_em_issue641.opts b/test/tm-cases/middle_word_em_issue641.opts new file mode 100644 index 00000000..f1455c41 --- /dev/null +++ b/test/tm-cases/middle_word_em_issue641.opts @@ -0,0 +1 @@ +{'extras': {'middle-word-em': False}} \ No newline at end of file diff --git a/test/tm-cases/middle_word_em_issue641.text b/test/tm-cases/middle_word_em_issue641.text new file mode 100644 index 00000000..b14e5d28 --- /dev/null +++ b/test/tm-cases/middle_word_em_issue641.text @@ -0,0 +1,3 @@ +**Strong** (*em*) + +note:*this is good*, but *this is not* \ No newline at end of file diff --git a/test/tm-cases/middle_word_em_with_extra_ems.html b/test/tm-cases/middle_word_em_with_extra_ems.html index a86b1932..a8974039 100644 --- a/test/tm-cases/middle_word_em_with_extra_ems.html +++ b/test/tm-cases/middle_word_em_with_extra_ems.html @@ -2,7 +2,7 @@

one_two_three

-

one_two_three

+

one_two_three

one_two_three

From 3173942d11688c0682570cd7907a095b5259114f Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 5 Oct 2025 17:54:22 +0100 Subject: [PATCH 4/5] Fix ReDoS regression --- lib/markdown2.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index bd958c5d..8b99ec3d 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1989,12 +1989,12 @@ def _encode_code(self, text: str) -> str: return hashed _strong_re = re.compile(r''' - [*_]* # ignore any leading em chars because we want to wrap `` as tightly around the text as possible - # eg: `***abc***` -> `*abc*` instead of `*abc*` - # Makes subsequent processing easier - (\*\*|__)(?=\S) # strong syntax - must be followed by a non whitespace char - (.+?) # the strong text itself - (?<=\S)\1 # closing syntax - must be preceeded by non whitespace char + (?:_{1,}|\*{1,})? # ignore any leading em chars because we want to wrap `` as tightly around the text as possible + # eg: `***abc***` -> `*abc*` instead of `*abc*` + # Makes subsequent processing easier + (\*\*|__)(?=\S) # strong syntax - must be followed by a non whitespace char + (.+?) # the strong text itself + (?<=\S)\1 # closing syntax - must be preceeded by non whitespace char ''', re.S | re.X ) From 40bd17f43735609858779c57e7f12a29991f3a2c Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 5 Oct 2025 17:56:55 +0100 Subject: [PATCH 5/5] update changelog --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 11db62d5..60c03486 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ - [pull #639] Fix middle-word-em interfering with strongs (#637) - [pull #640] Fix code friendly extra stopping other syntax being processed (#638) +- [pull #644] Fix a number of em/strong issues (#641, #642, #643) ## python-markdown2 2.5.4