Skip to content

Commit 504af4c

Browse files
authored
feat: support ngram token filter (#10)
* feat: support ngram token filter Signed-off-by: Mingzhuo Yin <yinmingzhuo@gmail.com> * bump sccache-action version Signed-off-by: Mingzhuo Yin <yinmingzhuo@gmail.com> * update tools/setup.sh Signed-off-by: Mingzhuo Yin <yinmingzhuo@gmail.com> * bump lindera version Signed-off-by: Mingzhuo Yin <yinmingzhuo@gmail.com> --------- Signed-off-by: Mingzhuo Yin <yinmingzhuo@gmail.com>
1 parent 724d7c4 commit 504af4c

File tree

8 files changed

+121
-10
lines changed

8 files changed

+121
-10
lines changed

.github/workflows/check.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ on:
88
- "assets/**"
99
- "src/**"
1010
- "tests/**"
11+
- "tools/**"
1112
- ".taplo.toml"
1213
- ".typos.toml"
1314
- "Cargo.lock"
@@ -68,7 +69,7 @@ jobs:
6869
- name: Set up Environment
6970
run: ./tools/setup.sh ${{ matrix.version }}
7071
- name: Set up Sccache
71-
uses: mozilla-actions/sccache-action@v0.0.7
72+
uses: mozilla-actions/sccache-action@v0.0.9
7273
- name: Clippy
7374
run: cargo clippy --features pg${{ matrix.version }}
7475
- name: Unit Test

Cargo.toml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ name = "pgrx_embed_pg_tokenizer"
1111
path = "./src/bin/pgrx_embed.rs"
1212

1313
[features]
14-
pg12 = ["pgrx/pg12", "pgrx-tests/pg12"]
1514
pg13 = ["pgrx/pg13", "pgrx-tests/pg13"]
1615
pg14 = ["pgrx/pg14", "pgrx-tests/pg14"]
1716
pg15 = ["pgrx/pg15", "pgrx-tests/pg15"]
@@ -28,8 +27,8 @@ lindera-cc-cedict = ["lindera/cc-cedict"]
2827
anyhow = "1.0.97"
2928
dashmap = "6.1.0"
3029
jieba-rs = "0.7.2"
31-
lindera = "0.37.0"
32-
pgrx = "=0.13.1"
30+
lindera = "0.42.2"
31+
pgrx = "=0.14.1"
3332
regex = "1.11.1"
3433
rust-stemmers = { git = "https://github.com/tensorchord/rust-stemmers.git", rev = "51696378e352688b7ffd4fface615370ff5e8768" }
3534
serde = { version = "1.0.218", features = ["derive"] }
@@ -42,10 +41,7 @@ unicode-segmentation = "1.12.0"
4241
validator = { version = "0.20.0", features = ["derive"] }
4342

4443
[dev-dependencies]
45-
pgrx-tests = "=0.13.1"
46-
47-
[patch.crates-io]
48-
pgrx = { git = "https://github.com/tensorchord/pgrx", branch = "patch-to-pg_tokenizer" }
44+
pgrx-tests = "=0.14.1"
4945

5046
[profile.release]
5147
opt-level = 3

docs/00-reference.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,22 @@ You can choose only one of the above options for each character filter.
9292
| stopwords | String | Stopwords name, builtin: `lucene_english`, `nltk_english`, `iso_english` |
9393
| synonym | String | Synonym name |
9494
| pg_dict | String | Using [postgres text search dictionary](https://www.postgresql.org/docs/current/textsearch-dictionaries.html). We currently support all dictionaries except `Thesaurus Dictionary`. |
95+
| ngram | Table | N-gram tokenizer, see [Options for `ngram`](#options-for-ngram) |
9596

9697
You can choose only one of the above options for each token filter.
9798

9899
#### Supported values for `stemmer`
99100

100101
arabic, armenian, basque, catalan, danish, dutch, english_porter, english_porter2, estonian, finnish, french, german, greek, hindi, hungarian, indonesian, irish, italian, lithuanian, nepali, norwegian, portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish, yiddish
101102

103+
#### Options for `ngram`
104+
105+
| Key | Type | Description |
106+
| ----------------- | ------- | -------------------------------------------------------- |
107+
| max_gram | Integer | Maximum n-gram size, range: `1..=255`, default: `2` |
108+
| min_gram | Integer | Minimum n-gram size, range: `1..=255`, default: `1` |
109+
| preserve_original | Boolean | Whether to preserve the original token, default: `false` |
110+
102111
### Options for `tokenizer`
103112

104113
| Key | Type | Description |

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ compile_error!("Target is not supported.");
1515
compiler_error!("PostgreSQL version must be selected.");
1616

1717
#[pgrx::pg_guard]
18-
unsafe extern "C" fn _PG_init() {
18+
extern "C-unwind" fn _PG_init() {
1919
if unsafe { pgrx::pg_sys::IsUnderPostmaster } {
2020
pgrx::error!("pg_tokenizer must be loaded via shared_preload_libraries.");
2121
}

src/token_filter/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod ngram;
12
mod pg_dict;
23
mod skip_non_alphanumeric;
34
mod stemmer;
@@ -6,6 +7,7 @@ mod synonym;
67

78
use std::sync::Arc;
89

10+
use ngram::{Ngram, NgramConfig};
911
use pg_dict::PgDictTokenFilter;
1012
use serde::{Deserialize, Serialize};
1113
use skip_non_alphanumeric::SkipNonAlphanumeric;
@@ -32,6 +34,8 @@ pub enum TokenFilterConfig {
3234
Stopwords(String),
3335
PgDict(String),
3436
Synonym(String),
37+
#[serde(rename = "ngram")]
38+
NGram(NgramConfig),
3539
}
3640

3741
pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
@@ -41,5 +45,6 @@ pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
4145
TokenFilterConfig::Stopwords(name) => stopwords::get_stopwords_token_filter(&name),
4246
TokenFilterConfig::PgDict(name) => Arc::new(PgDictTokenFilter::new(&name)),
4347
TokenFilterConfig::Synonym(name) => synonym::get_synonym_token_filter(&name),
48+
TokenFilterConfig::NGram(config) => Arc::new(Ngram::new(config)),
4449
}
4550
}

src/token_filter/ngram.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
use serde::{Deserialize, Serialize};
2+
use validator::{Validate, ValidationError};
3+
4+
use super::TokenFilter;
5+
6+
#[derive(Clone, Debug, Serialize, Deserialize, Validate)]
7+
#[serde(rename_all = "snake_case")]
8+
#[serde(deny_unknown_fields)]
9+
#[validate(schema(function = "NgramConfig::validate_grams"))]
10+
pub struct NgramConfig {
11+
#[serde(default = "NgramConfig::default_max_gram")]
12+
#[validate(range(min = 1, max = 255))]
13+
pub max_gram: usize,
14+
#[serde(default = "NgramConfig::default_min_gram")]
15+
#[validate(range(min = 1, max = 255))]
16+
pub min_gram: usize,
17+
#[serde(default = "NgramConfig::default_preserve_original")]
18+
pub preserve_original: bool,
19+
}
20+
21+
impl NgramConfig {
22+
fn default_max_gram() -> usize {
23+
2
24+
}
25+
fn default_min_gram() -> usize {
26+
1
27+
}
28+
fn default_preserve_original() -> bool {
29+
false
30+
}
31+
fn validate_grams(&self) -> Result<(), ValidationError> {
32+
if self.min_gram > self.max_gram {
33+
return Err(ValidationError::new(
34+
"min_gram must be less than or equal to max_gram",
35+
));
36+
}
37+
Ok(())
38+
}
39+
}
40+
41+
pub struct Ngram {
42+
config: NgramConfig,
43+
}
44+
45+
impl TokenFilter for Ngram {
46+
fn apply(&self, token: String) -> Vec<String> {
47+
let mut results = Vec::new();
48+
let len = token.len();
49+
for i in 0..=(len - self.config.min_gram) {
50+
for j in (i + self.config.min_gram)..=(i + self.config.max_gram).min(len) {
51+
results.push(token[i..j].to_string());
52+
}
53+
}
54+
if self.config.preserve_original
55+
&& !(self.config.min_gram..=self.config.max_gram).contains(&len)
56+
{
57+
results.push(token);
58+
}
59+
results
60+
}
61+
}
62+
63+
impl Ngram {
64+
pub fn new(config: NgramConfig) -> Self {
65+
if let Err(e) = config.validate() {
66+
panic!("Invalid NgramConfig: {}", e);
67+
}
68+
69+
Ngram { config }
70+
}
71+
}

tests/sqllogictest/ngram.slt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
statement ok
2+
BEGIN;
3+
4+
statement ok
5+
SELECT tokenizer_catalog.create_text_analyzer('test_ngram', $$
6+
pre_tokenizer = "unicode_segmentation"
7+
[[token_filters]]
8+
[token_filters.ngram]
9+
$$);
10+
11+
query T
12+
SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram');
13+
----
14+
{Q,Qu,u,ui,i,ic,c,ck,k,f,fo,o,ox,x}
15+
16+
statement ok
17+
SELECT tokenizer_catalog.create_text_analyzer('test_ngram2', $$
18+
pre_tokenizer = "unicode_segmentation"
19+
[[token_filters]]
20+
[token_filters.ngram]
21+
max_gram = 3
22+
min_gram = 2
23+
preserve_original = true
24+
$$);
25+
26+
query T
27+
SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram2');
28+
----
29+
{Qu,Qui,ui,uic,ic,ick,ck,Quick,fo,fox,ox}

tools/setup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ sudo -iu postgres createdb -O $USER $USER
1919
sudo -iu postgres psql -c 'ALTER SYSTEM SET shared_preload_libraries = "pg_tokenizer.so"'
2020
sudo systemctl stop postgresql
2121

22-
curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.13.1/cargo-pgrx-v0.13.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
22+
curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.14.1/cargo-pgrx-v0.14.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
2323
cargo pgrx init --pg${version}=$(which pg_config)
2424

2525
curl -fsSL https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.26.4/sqllogictest-bin-v0.26.4-$(uname -m)-unknown-linux-musl.tar.gz | tar -xOzf - ./sqllogictest | install -m 755 /dev/stdin /usr/local/bin/sqllogictest

0 commit comments

Comments
 (0)