25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/intl_markdown/Cargo.toml
@@ -13,6 +13,7 @@ doctest = false

[dependencies]
bitflags = "2"
cjk = "0.2.5"
intl_markdown_macros = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
22 changes: 19 additions & 3 deletions crates/intl_markdown/src/lexer.rs
@@ -23,6 +23,7 @@ pub(super) struct LexerState {
pub last_was_whitespace: bool,
pub last_was_punctuation: bool,
pub last_was_newline: bool,
pub last_was_cjk_punctuation: bool,
/// True if the last token was entirely an escaped token, which has an
/// effect on whether the next token is considered punctuation or not when
/// computing delimiters.
@@ -45,6 +46,7 @@ impl LexerState {
last_was_newline: true,
last_was_whitespace: true,
last_was_punctuation: false,
last_was_cjk_punctuation: false,
last_token_was_escape: false,
is_after_newline: true,
}
@@ -56,6 +58,7 @@ impl LexerState {
self.last_was_whitespace = true;
self.last_was_newline = true;
self.last_was_punctuation = false;
self.last_was_cjk_punctuation = false;
self.last_token_was_escape = false;
self.is_after_newline = true;
}
@@ -215,9 +218,9 @@ impl<'source> Lexer<'source> {
c if self.state.last_was_newline
&& c.is_ascii_whitespace()
&& self.state.indent_depth > 0 =>
{
self.consume_leading_whitespace()
}
b'\0' => self.consume_byte(SyntaxKind::EOF),
_ => self.consume_verbatim_line(),
}
@@ -585,6 +588,9 @@ impl<'source> Lexer<'source> {
GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Symbol
)
});
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
let next_is_cjk = next.map_or(false, |c| !c.is_ascii() && cjk::is_cjk_codepoint(c));
let next_is_escaped = matches!(next, Some('\\'));

let mut flags = TokenFlags::default();
@@ -594,12 +600,18 @@
if self.state.last_was_punctuation && !self.state.last_token_was_escape {
flags.insert(TokenFlags::HAS_PRECEDING_PUNCTUATION);
}
if self.state.last_was_cjk_punctuation {
flags.insert(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION);
}
if next_is_whitespace {
flags.insert(TokenFlags::HAS_FOLLOWING_WHITESPACE);
}
if next_is_punctuation && !next_is_escaped {
flags.insert(TokenFlags::HAS_FOLLOWING_PUNCTUATION);
}
if next_is_cjk {
flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
}

self.advance();

@@ -1043,6 +1055,10 @@ impl<'source> Lexer<'source> {
} else {
last_char.is_ascii_punctuation()
};
// The [cjk] crate includes all ASCII characters as CJK punctuation for some reason, which we
// specifically do not want to match here, so the check is additionally guarded to require
// that the character is not plain ASCII.
self.state.last_was_cjk_punctuation = !last_char.is_ascii() && cjk::is_cjk_punctuation_codepoint(last_char);

self.state.last_was_newline = last_char == '\n';
self.state.last_was_whitespace = last_char.is_whitespace();
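The guard above is worth a standalone look, since the `cjk` crate's classifiers are broader than their names suggest. A minimal sketch of the same check in isolation, assuming `cjk = "0.2.5"` behaves as the comment describes (matching plain ASCII as CJK punctuation):

fn is_cjk_punct(c: char) -> bool {
    // Mirror the lexer's guard: require a non-ASCII character before asking
    // the `cjk` crate, since it also reports plain ASCII as CJK punctuation.
    !c.is_ascii() && cjk::is_cjk_punctuation_codepoint(c)
}

fn main() {
    assert!(is_cjk_punct('。')); // U+3002 IDEOGRAPHIC FULL STOP
    assert!(is_cjk_punct('、')); // U+3001 IDEOGRAPHIC COMMA
    assert!(!is_cjk_punct('.')); // plain ASCII period is excluded by the guard
}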
26 changes: 14 additions & 12 deletions crates/intl_markdown/src/parser/delimiter.rs
@@ -1,8 +1,8 @@
use std::ops::Range;

use crate::delimiter::Delimiter;
use crate::parser::emphasis::process_emphasis;
use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};

use super::ICUMarkdownParser;

@@ -59,22 +59,24 @@ pub(super) fn parse_delimiter_run(p: &mut ICUMarkdownParser, kind: SyntaxKind) -
!first_flags.has_preceding_whitespace()
// 2. Either:
&& (
// - not preceded by a punctuation. OR
// (CJK extension: preceding CJK punctuation is allowed)
(!first_flags.has_preceding_punctuation() || first_flags.has_preceding_cjk_punctuation())
// - preceded by punctuation but followed by whitespace or punctuation
// (CJK extension: following CJK characters are allowed)
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation() || last_flags.has_following_cjk())
);

// Left-flanking definition
// 1. Not followed by whitespace AND
let is_left_flanking = !last_flags.has_following_whitespace()
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);

// Using the determined flanking and context flags and the `kind` of the
// token, determine if it can be used to open and/or close emphasis.
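Read as standalone predicates, the flanking computation above reduces to the shape below. This is an illustrative sketch, not the crate's actual API; the `Flags` struct and its fields stand in for the `TokenFlags` accessors used in the real code:

struct Flags {
    preceding_whitespace: bool,
    preceding_punctuation: bool,
    preceding_cjk_punctuation: bool,
    following_whitespace: bool,
    following_punctuation: bool,
    following_cjk: bool,
}

// Right-flanking (may close emphasis): the CJK extension relaxes rule 2 so
// that preceding CJK punctuation, or a following CJK character, also counts.
fn is_right_flanking(first: &Flags, last: &Flags) -> bool {
    !first.preceding_whitespace
        && (!first.preceding_punctuation
            || first.preceding_cjk_punctuation
            || last.following_whitespace
            || last.following_punctuation
            || last.following_cjk)
}

// Left-flanking (may open emphasis): unchanged from the CommonMark definition.
fn is_left_flanking(first: &Flags, last: &Flags) -> bool {
    !last.following_whitespace
        && (!last.following_punctuation
            || first.preceding_whitespace
            || first.preceding_punctuation)
}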
13 changes: 13 additions & 0 deletions crates/intl_markdown/src/token.rs
@@ -19,6 +19,12 @@ bitflags! {
// Only used for some delimiters currently. `ESCAPED` kinds will also
// always have this set.
const IS_ESCAPED = 1 << 6;

// Extension to support delimiters around CJK script characters.
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
const HAS_FOLLOWING_CJK = 1 << 8;
}
}

@@ -39,6 +45,13 @@ impl TokenFlags {
pub fn is_escaped(&self) -> bool {
self.contains(TokenFlags::IS_ESCAPED)
}

pub fn has_preceding_cjk_punctuation(&self) -> bool {
self.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}

pub fn has_following_cjk(&self) -> bool {
self.contains(TokenFlags::HAS_FOLLOWING_CJK)
}
}

#[derive(Clone, PartialEq, Eq)]
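As a quick standalone check that the new flags combine and query independently of `IS_ESCAPED`, here is a self-contained sketch; the `u16` backing type and the reduced set of constants are assumptions for illustration, not the crate's actual definition:

use bitflags::bitflags;

bitflags! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct DemoFlags: u16 {
        const IS_ESCAPED = 1 << 6;
        const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
        const HAS_FOLLOWING_CJK = 1 << 8;
    }
}

fn main() {
    let flags = DemoFlags::IS_ESCAPED | DemoFlags::HAS_FOLLOWING_CJK;
    // Distinct bit positions keep each query independent of the others.
    assert!(flags.contains(DemoFlags::HAS_FOLLOWING_CJK));
    assert!(flags.contains(DemoFlags::IS_ESCAPED));
    assert!(!flags.contains(DemoFlags::HAS_PRECEDING_CJK_PUNCTUATION));
}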
32 changes: 32 additions & 0 deletions crates/intl_markdown/tests/md_extensions.rs
@@ -3,6 +3,38 @@

mod harness;

/// Chinese and Japanese content usually does _not_ include spaces between formatted and
/// unformatted segments of a single phrase, such as `**{value}**件の投稿`. But this is technically
/// not valid `strong` emphasis according to the CommonMark spec: the closing delimiter run is
/// preceded by punctuation (`}`) and followed by a character that is neither whitespace nor
/// punctuation, so it is not right-flanking and cannot close the emphasis.
///
/// See more information in the CommonMark discussion here:
/// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
/// https://github.com/commonmark/cmark/pull/208
///
/// Because this library is explicitly intended to support many languages including most Asian
/// languages, we are adding an extension to the Markdown rules to accommodate these situations.
/// The following tests assert that the special cases for East Asian languages function in a
/// logically similar way to Western languages.
mod asian_punctuation {
use crate::harness::icu_string_test;
icu_string_test!(
japanese_adjacent_formatting,
"**{value}**件の投稿",
r#"<b>{value}</b>件の投稿"#
);
icu_string_test!(
japanese_spaced_formatting,
"**{value}** 件の投稿",
r#"<b>{value}</b> 件の投稿"#
);
icu_string_test!(
korean_western_punctuation,
"*스크립트(script)*라고",
r#"<i>스크립트(script)</i>라고"#
);
}
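As a worked reading of the first test: in `**{value}**件の投稿`, the closing `**` is preceded by `}` (punctuation) and followed by `件` (neither whitespace nor Unicode punctuation). CommonMark's right-flanking rule demands either no preceding punctuation or a following whitespace/punctuation character, so the run fails both branches and the `<b>` could never close. The extension's extra branch, a following CJK character, is exactly what `件` satisfies.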

mod hooks {
use crate::harness::ast_test;
ast_test!(