25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/intl_markdown/Cargo.toml
@@ -13,6 +13,7 @@ doctest = false

[dependencies]
bitflags = "2"
cjk = "0.2.5"
intl_markdown_macros = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
22 changes: 19 additions & 3 deletions crates/intl_markdown/src/lexer.rs
@@ -23,6 +23,7 @@ pub(super) struct LexerState {
pub last_was_whitespace: bool,
pub last_was_punctuation: bool,
pub last_was_newline: bool,
pub last_was_cjk_punctuation: bool,
/// True if the last token was entirely an escaped token, which has an
/// effect on whether the next token is considered punctuation or not when
/// computing delimiters.
@@ -45,6 +46,7 @@ impl LexerState {
last_was_newline: true,
last_was_whitespace: true,
last_was_punctuation: false,
last_was_cjk_punctuation: false,
last_token_was_escape: false,
is_after_newline: true,
}
@@ -56,6 +58,7 @@ impl LexerState {
self.last_was_whitespace = true;
self.last_was_newline = true;
self.last_was_punctuation = false;
self.last_was_cjk_punctuation = false;
self.last_token_was_escape = false;
self.is_after_newline = true;
}
@@ -215,9 +218,9 @@ impl<'source> Lexer<'source> {
c if self.state.last_was_newline
&& c.is_ascii_whitespace()
&& self.state.indent_depth > 0 =>
{
self.consume_leading_whitespace()
}
b'\0' => self.consume_byte(SyntaxKind::EOF),
_ => self.consume_verbatim_line(),
}
@@ -585,6 +588,9 @@ impl<'source> Lexer<'source> {
GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Symbol
)
});
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
let next_is_cjk = next.map_or(false, |c| !c.is_ascii() && cjk::is_cjk_codepoint(c));
let next_is_escaped = matches!(next, Some('\\'));

let mut flags = TokenFlags::default();
@@ -594,12 +600,18 @@
if self.state.last_was_punctuation && !self.state.last_token_was_escape {
flags.insert(TokenFlags::HAS_PRECEDING_PUNCTUATION);
}
if self.state.last_was_cjk_punctuation {
flags.insert(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION);
}
if next_is_whitespace {
flags.insert(TokenFlags::HAS_FOLLOWING_WHITESPACE);
}
if next_is_punctuation && !next_is_escaped {
flags.insert(TokenFlags::HAS_FOLLOWING_PUNCTUATION);
}
if next_is_cjk {
flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
}

self.advance();

@@ -1043,6 +1055,10 @@ impl<'source> Lexer<'source> {
} else {
last_char.is_ascii_punctuation()
};
// The [cjk] crate includes all ASCII characters as CJK punctuation for some reason, which we
// specifically do not want to match here, so the check is additionally guarded to require
// that the character is not plain ASCII.
self.state.last_was_cjk_punctuation = !last_char.is_ascii() && cjk::is_cjk_punctuation_codepoint(last_char);

self.state.last_was_newline = last_char == '\n';
self.state.last_was_whitespace = last_char.is_whitespace();
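The guard above is worth a standalone look, since the `cjk` crate's classifiers are broader than their names suggest. A minimal sketch of the same check in isolation, assuming `cjk = "0.2.5"` behaves as the comment describes (matching plain ASCII as CJK punctuation):

fn is_cjk_punct(c: char) -> bool {
    // Mirror the lexer's guard: require a non-ASCII character before asking
    // the `cjk` crate, since it also reports plain ASCII as CJK punctuation.
    !c.is_ascii() && cjk::is_cjk_punctuation_codepoint(c)
}

fn main() {
    assert!(is_cjk_punct('。')); // U+3002 IDEOGRAPHIC FULL STOP
    assert!(is_cjk_punct('、')); // U+3001 IDEOGRAPHIC COMMA
    assert!(!is_cjk_punct('.')); // plain ASCII period is excluded by the guard
}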
26 changes: 14 additions & 12 deletions crates/intl_markdown/src/parser/delimiter.rs
@@ -1,8 +1,8 @@
use std::ops::Range;

use crate::delimiter::Delimiter;
use crate::parser::emphasis::process_emphasis;
use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};

use super::ICUMarkdownParser;

@@ -59,22 +59,24 @@ pub(super) fn parse_delimiter_run(p: &mut ICUMarkdownParser, kind: SyntaxKind) -
!first_flags.has_preceding_whitespace()
// 2. Either:
&& (
// - not preceded by a punctuation. OR
// (CJK extension: preceding CJK punctuation is allowed)
(!first_flags.has_preceding_punctuation() || first_flags.has_preceding_cjk_punctuation())
// - preceded by punctuation but followed by whitespace or punctuation
// (CJK extension: following CJK characters are allowed)
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation() || last_flags.has_following_cjk())
);

// Left-flanking definition
// 1. Not followed by whitespace AND
let is_left_flanking = !last_flags.has_following_whitespace()
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);

// Using the determined flanking and context flags and the `kind` of the
// token, determine if it can be used to open and/or close emphasis.
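Read as standalone predicates, the flanking computation above reduces to the shape below. This is an illustrative sketch, not the crate's actual API; the `Flags` struct and its fields stand in for the `TokenFlags` accessors used in the real code:

struct Flags {
    preceding_whitespace: bool,
    preceding_punctuation: bool,
    preceding_cjk_punctuation: bool,
    following_whitespace: bool,
    following_punctuation: bool,
    following_cjk: bool,
}

// Right-flanking (may close emphasis): the CJK extension relaxes rule 2 so
// that preceding CJK punctuation, or a following CJK character, also counts.
fn is_right_flanking(first: &Flags, last: &Flags) -> bool {
    !first.preceding_whitespace
        && (!first.preceding_punctuation
            || first.preceding_cjk_punctuation
            || last.following_whitespace
            || last.following_punctuation
            || last.following_cjk)
}

// Left-flanking (may open emphasis): unchanged from the CommonMark definition.
fn is_left_flanking(first: &Flags, last: &Flags) -> bool {
    !last.following_whitespace
        && (!last.following_punctuation
            || first.preceding_whitespace
            || first.preceding_punctuation)
}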
13 changes: 13 additions & 0 deletions crates/intl_markdown/src/token.rs
@@ -19,6 +19,12 @@ bitflags! {
// Only used for some delimiters currently. `ESCAPED` kinds will also
// always have this set.
const IS_ESCAPED = 1 << 6;

// Extension to support delimiters around CJK script characters.
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
const HAS_FOLLOWING_CJK = 1 << 8;
}
}

@@ -39,6 +45,13 @@ impl TokenFlags {
pub fn is_escaped(&self) -> bool {
self.contains(TokenFlags::IS_ESCAPED)
}

pub fn has_preceding_cjk_punctuation(&self) -> bool {
self.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}

pub fn has_following_cjk(&self) -> bool {
self.contains(TokenFlags::HAS_FOLLOWING_CJK)
}
}

#[derive(Clone, PartialEq, Eq)]
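As a quick standalone check that the new flags combine and query independently of `IS_ESCAPED`, here is a self-contained sketch; the `u16` backing type and the reduced set of constants are assumptions for illustration, not the crate's actual definition:

use bitflags::bitflags;

bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct DemoFlags: u16 {
        const IS_ESCAPED = 1 << 6;
        const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
        const HAS_FOLLOWING_CJK = 1 << 8;
    }
}

fn main() {
    let flags = DemoFlags::IS_ESCAPED | DemoFlags::HAS_FOLLOWING_CJK;
    // Distinct bit positions keep each query independent of the others.
    assert!(flags.contains(DemoFlags::HAS_FOLLOWING_CJK));
    assert!(flags.contains(DemoFlags::IS_ESCAPED));
    assert!(!flags.contains(DemoFlags::HAS_PRECEDING_CJK_PUNCTUATION));
}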
32 changes: 32 additions & 0 deletions crates/intl_markdown/tests/md_extensions.rs
@@ -3,6 +3,38 @@

mod harness;

/// Chinese and Japanese content usually does _not_ include spaces between formatted and
/// unformatted segments of a single phrase, such as `**{value}**件の投稿`. But this is technically
/// not valid `strong` emphasis according to the CommonMark spec: the closing delimiter run is
/// preceded by punctuation (`}`) and followed by a character that is neither whitespace nor
/// punctuation, so it is not right-flanking and cannot close the emphasis.
///
/// See more information in the CommonMark discussion here:
/// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
/// https://github.com/commonmark/cmark/pull/208
///
/// Because this library is explicitly intended to support many languages including most Asian
/// languages, we are adding an extension to the Markdown rules to accommodate these situations.
/// The following tests assert that the special cases for East Asian languages function in a
/// logically similar way to Western languages.
mod asian_punctuation {
use crate::harness::icu_string_test;
icu_string_test!(
japanese_adjacent_formatting,
"**{value}**件の投稿",
r#"<b>{value}</b>件の投稿"#
);
icu_string_test!(
japanese_spaced_formatting,
"**{value}** 件の投稿",
r#"<b>{value}</b> 件の投稿"#
);
icu_string_test!(
korean_western_punctuation,
"*스크립트(script)*라고",
r#"<i>스크립트(script)</i>라고"#
);
}
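As a worked reading of the first test: in `**{value}**件の投稿`, the closing `**` is preceded by `}` (punctuation) and followed by `件` (neither whitespace nor Unicode punctuation). CommonMark's right-flanking rule demands either no preceding punctuation or a following whitespace/punctuation character, so the run fails both branches and the `<b>` could never close. The extension's extra branch, a following CJK character, is exactly what `件` satisfies.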

mod hooks {
use crate::harness::ast_test;
ast_test!(