
Commit 1e4f0ba

[markdown] Extend Markdown rules to support common CJK formatting grammar (#16)
Chinese and Japanese content usually does _not_ include spaces between formatted and unformatted segments of a single phrase, such as `**{value}**件の投稿`. But this is technically not valid `strong` formatting according to the CommonMark spec, since the right flank of the ending delimiter is a non-space Unicode character. See more information in the CommonMark discussion at https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5 and in commonmark/cmark#208.

Because this library is explicitly intended to support many languages, including most Asian languages, we are adding an extension to the Markdown rules to accommodate these situations. The added tests assert that the special cases for East Asian languages function in a logically-similar way to Western languages. The tests for this change are fairly small, as I'm not fluent in any CJK language and have relied purely on suggestions and forum posts to enumerate these cases.

Most importantly, `**{value}**件の投稿` is now treated as a **bold** `value` followed by plain text, rather than having the formatting ignored entirely.
1 parent: 7c87eb3
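To make the motivating case concrete, here is a minimal sketch (not part of the commit) that classifies the character after the closing delimiter the way the new lexer code below does. It assumes a binary crate with the `cjk` dependency added in this change; the `main` function and the sample character are illustrative only.

```rust
fn main() {
    // '件' is the character immediately following the closing `**` in
    // `**{value}**件の投稿`.
    let next = '件';

    // Under stock CommonMark the run is not right-flanking here: the next
    // character is neither whitespace nor punctuation, so `**` cannot close.
    assert!(!next.is_whitespace());
    assert!(!next.is_ascii_punctuation());

    // The extension treats a following non-ASCII CJK codepoint as a valid
    // right flank, using the same check the lexer performs.
    assert!(!next.is_ascii() && cjk::is_cjk_codepoint(next));
}
```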

6 files changed (+104 −15 lines)


Cargo.lock

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default.

crates/intl_markdown/Cargo.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,6 +13,7 @@ doctest = false

 [dependencies]
 bitflags = "2"
+cjk = "0.2.5"
 intl_markdown_macros = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
```

crates/intl_markdown/src/lexer.rs

Lines changed: 19 additions & 3 deletions
```diff
@@ -23,6 +23,7 @@ pub(super) struct LexerState {
     pub last_was_whitespace: bool,
     pub last_was_punctuation: bool,
     pub last_was_newline: bool,
+    pub last_was_cjk_punctuation: bool,
     /// True if the last token was entirely an escaped token, which has an
     /// effect on whether the next token is considered punctuation or not when
     /// computing delimiters.
@@ -45,6 +46,7 @@ impl LexerState {
             last_was_newline: true,
             last_was_whitespace: true,
             last_was_punctuation: false,
+            last_was_cjk_punctuation: false,
             last_token_was_escape: false,
             is_after_newline: true,
         }
@@ -56,6 +58,7 @@ impl LexerState {
         self.last_was_whitespace = true;
         self.last_was_newline = true;
         self.last_was_punctuation = false;
+        self.last_was_cjk_punctuation = false;
         self.last_token_was_escape = false;
         self.is_after_newline = true;
     }
@@ -215,9 +218,9 @@ impl<'source> Lexer<'source> {
             c if self.state.last_was_newline
                 && c.is_ascii_whitespace()
                 && self.state.indent_depth > 0 =>
-            {
-                self.consume_leading_whitespace()
-            }
+                {
+                    self.consume_leading_whitespace()
+                }
             b'\0' => self.consume_byte(SyntaxKind::EOF),
             _ => self.consume_verbatim_line(),
         }
@@ -585,6 +588,9 @@ impl<'source> Lexer<'source> {
                 GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Symbol
             )
         });
+        // https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
+        // https://github.com/commonmark/cmark/pull/208
+        let next_is_cjk = next.map_or(false, |c| !c.is_ascii() && cjk::is_cjk_codepoint(c));
         let next_is_escaped = matches!(next, Some('\\'));

         let mut flags = TokenFlags::default();
@@ -594,12 +600,18 @@ impl<'source> Lexer<'source> {
         if self.state.last_was_punctuation && !self.state.last_token_was_escape {
             flags.insert(TokenFlags::HAS_PRECEDING_PUNCTUATION);
         }
+        if self.state.last_was_cjk_punctuation {
+            flags.insert(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
+        }
         if next_is_whitespace {
             flags.insert(TokenFlags::HAS_FOLLOWING_WHITESPACE);
         }
         if next_is_punctuation && !next_is_escaped {
             flags.insert(TokenFlags::HAS_FOLLOWING_PUNCTUATION);
         }
+        if next_is_cjk {
+            flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
+        }

         self.advance();
@@ -1043,6 +1055,10 @@ impl<'source> Lexer<'source> {
         } else {
             last_char.is_ascii_punctuation()
         };
+        // The [cjk] crate includes all ASCII characters as CJK punctuation for some reason,
+        // which we specifically do not want to match here, so the check also ensures the
+        // character is not plain ASCII.
+        self.state.last_was_cjk_punctuation = !last_char.is_ascii() && cjk::is_cjk_punctuation_codepoint(last_char);

         self.state.last_was_newline = last_char == '\n';
         self.state.last_was_whitespace = last_char.is_whitespace();
```
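The ASCII guard in the last hunk is worth a second look. Here is a small sketch of what it excludes, assuming the `cjk` crate from the Cargo.toml change above (the `is_cjk_punct` helper is hypothetical, mirroring the guarded check in the lexer):

```rust
// Hypothetical helper mirroring the guarded check in the lexer.
fn is_cjk_punct(c: char) -> bool {
    !c.is_ascii() && cjk::is_cjk_punctuation_codepoint(c)
}

fn main() {
    assert!(is_cjk_punct('。')); // U+3002 IDEOGRAPHIC FULL STOP
    assert!(!is_cjk_punct('.')); // plain ASCII period, excluded by the guard
}
```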

crates/intl_markdown/src/parser/delimiter.rs

Lines changed: 14 additions & 12 deletions
```diff
@@ -1,8 +1,8 @@
 use std::ops::Range;

-use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};
 use crate::delimiter::Delimiter;
 use crate::parser::emphasis::process_emphasis;
+use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};

 use super::ICUMarkdownParser;

@@ -59,22 +59,24 @@ pub(super) fn parse_delimiter_run(p: &mut ICUMarkdownParser, kind: SyntaxKind) -
         !first_flags.has_preceding_whitespace()
         // 2. Either:
         && (
-            // - not preceded by a punctuation. OR
-            !first_flags.has_preceding_punctuation()
+            // - not preceded by a punctuation. OR
+            // (CJK extension: preceding CJK punctuation is allowed)
+            (!first_flags.has_preceding_punctuation() || first_flags.has_preceding_cjk_punctuation())
             // - preceded by punctuation but followed by whitespace or punctuation
-            || (last_flags.has_following_whitespace() || last_flags.has_following_punctuation())
-        );
+            // (CJK extension: following CJK characters are allowed)
+            || (last_flags.has_following_whitespace() || last_flags.has_following_punctuation() || last_flags.has_following_cjk())
+        );

     // Left-flanking definition
     // 1. Not followed by whitespace AND
     let is_left_flanking = !last_flags.has_following_whitespace()
-        // 2. Either:
-        && (
-            // - not followed by a punctuation. OR
-            !last_flags.has_following_punctuation()
-            // - followed by punctuation but preceded by whitespace or punctuation.
-            || (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
-        );
+        // 2. Either:
+        && (
+            // - not followed by a punctuation. OR
+            !last_flags.has_following_punctuation()
+            // - followed by punctuation but preceded by whitespace or punctuation.
+            || (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
+        );

     // Using the determined flanking and context flags and the `kind` of the
     // token, determine if it can be used to open and/or close emphasis.
```
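To see why this makes `**{value}**件の投稿` parse, the extended right-flanking rule can be evaluated by hand for the closing `**` run. This is a sketch, not code from the commit; the booleans stand in for the `TokenFlags` accessors used above and assume the preceding `}` counts as punctuation while `件` sets only the new CJK flag:

```rust
fn main() {
    let has_preceding_whitespace = false; // preceded by `}`
    let has_preceding_punctuation = true; // `}` is punctuation
    let has_preceding_cjk_punctuation = false; // `}` is plain ASCII
    let has_following_whitespace = false; // followed by `件`
    let has_following_punctuation = false; // `件` is not punctuation
    let has_following_cjk = true; // `件` is a CJK codepoint

    // The extended rule from parse_delimiter_run above:
    let is_right_flanking = !has_preceding_whitespace
        && ((!has_preceding_punctuation || has_preceding_cjk_punctuation)
            || (has_following_whitespace || has_following_punctuation || has_following_cjk));
    assert!(is_right_flanking); // true only because of has_following_cjk

    // Without the CJK terms the same inputs fail the check, which is why
    // plain CommonMark ignores the `**` pair entirely.
    let plain_commonmark = !has_preceding_whitespace
        && (!has_preceding_punctuation
            || (has_following_whitespace || has_following_punctuation));
    assert!(!plain_commonmark);
}
```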

crates/intl_markdown/src/token.rs

Lines changed: 13 additions & 0 deletions
```diff
@@ -19,6 +19,12 @@ bitflags! {
         // Only used for some delimiters currently. `ESCAPED` kinds will also
         // always have this set.
         const IS_ESCAPED = 1 << 6;
+
+        // Extension to support delimiters around CJK script characters.
+        // https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
+        // https://github.com/commonmark/cmark/pull/208
+        const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 5;
+        const HAS_FOLLOWING_CJK = 1 << 6;
     }
 }

@@ -39,6 +45,13 @@ impl TokenFlags {
     pub fn is_escaped(&self) -> bool {
         self.contains(TokenFlags::IS_ESCAPED)
     }
+
+    pub fn has_preceding_cjk_punctuation(&self) -> bool {
+        self.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
+    }
+    pub fn has_following_cjk(&self) -> bool {
+        self.contains(TokenFlags::HAS_FOLLOWING_CJK)
+    }
 }

 #[derive(Clone, PartialEq, Eq)]
```
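For readers unfamiliar with `bitflags`, here is a reduced stand-in (not the real `TokenFlags` definition) showing how flags like these are set and queried, assuming `bitflags` 2.x as declared in the workspace:

```rust
use bitflags::bitflags;

bitflags! {
    // Reduced, illustrative stand-in for the real TokenFlags.
    #[derive(Clone, Copy)]
    pub struct TokenFlags: u8 {
        const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 0;
        const HAS_FOLLOWING_CJK = 1 << 1;
    }
}

fn main() {
    let mut flags = TokenFlags::empty();
    flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
    assert!(flags.contains(TokenFlags::HAS_FOLLOWING_CJK));
    assert!(!flags.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION));
}
```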

crates/intl_markdown/tests/md_extensions.rs

Lines changed: 32 additions & 0 deletions
```diff
@@ -3,6 +3,38 @@

 mod harness;

+/// Chinese and Japanese content usually does _not_ include spaces between formatted and unformatted
+/// segments of a single phrase, such as `**{value}**件の投稿`. But this is technically not valid
+/// `strong` formatting according to the CommonMark spec, since the right flank of the ending
+/// delimiter is a non-space Unicode character.
+///
+/// See more information in the CommonMark discussion here:
+/// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
+/// https://github.com/commonmark/cmark/pull/208
+///
+/// Because this library is explicitly intended to support many languages including most Asian
+/// languages, we are adding an extension to the Markdown rules to accommodate these situations.
+/// The following tests assert that the special cases for East Asian languages function in a
+/// logically-similar way to Western languages.
+mod asian_punctuation {
+    use crate::harness::icu_string_test;
+    icu_string_test!(
+        japanese_adjacent_formatting,
+        "**{value}**件の投稿",
+        r#"<b>{value}</b>件の投稿"#
+    );
+    icu_string_test!(
+        japanese_spaced_formatting,
+        "**{value}** 件の投稿",
+        r#"<b>{value}</b> 件の投稿"#
+    );
+    icu_string_test!(
+        korean_western_punctuation,
+        "*스크립트(script)*라고",
+        r#"<i>스크립트(script)</i>라고"#
+    );
+}
+
 mod hooks {
     use crate::harness::ast_test;
     ast_test!(
```