Skip to content

Commit 902eea8

Browse files
authored
feat: implement ngram tokenizer with token_chars and custom_token_chars [2.6] (#45046)
pr: #45040 issue: #45039 Signed-off-by: SpadeA <[email protected]>
1 parent 1be0dcf commit 902eea8

File tree

5 files changed

+778
-0
lines changed

5 files changed

+778
-0
lines changed

internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ url = "2.5.4"
48 48   prost = "0.13.5"
49 49   once_cell = "1.20.3"
50 50   unicode-general-category = "1.0.0"
   51 + unicode_categories = "0.1.1"
51 52
52 53   # lindera dependencies for fetch and prepare dictionary online.
53 54   tokio = { version = "1.45.0", features = [

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ mod icu_tokneizer;
 4  4   mod jieba_tokenizer;
 5  5   mod lang_ident_tokenizer;
 6  6   mod lindera_tokenizer;
    7 + mod ngram_tokenizer_with_chars;
 7  8   mod tokenizer;
 8  9
 9 10   pub use self::char_group_tokenizer::CharGroupTokenizer;
@@ -12,5 +13,6 @@ pub use self::icu_tokneizer::IcuTokenizer;
12 13   pub use self::jieba_tokenizer::JiebaTokenizer;
13 14   pub use self::lang_ident_tokenizer::LangIdentTokenizer;
14 15   pub use self::lindera_tokenizer::LinderaTokenizer;
   16 + pub use self::ngram_tokenizer_with_chars::NgramTokenizerWithChars;
15 17
16 18   pub(crate) use self::tokenizer::*;

0 commit comments

Comments
 (0)