
Commit 1a07a3a

Author: takaoka_k
Commit: Merge branch 'develop' into release/6.8
2 parents bf5a9ce + 2b1157d, commit 1a07a3a

22 files changed, +1073 -644 lines

README.md

Lines changed: 225 additions & 151 deletions
Large diffs are not rendered by default.

docs/tutorial.md

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
# Sudachi Plugin for Elasticsearch: Tutorial

The Elasticsearch plugin supports the latest versions of the 5.6 and 6.8 series and every minor version of the 7 series.

The following shows how to use Sudachi with Elasticsearch 7.5.0.

First, install the plugin.

```
$ sudo elasticsearch-plugin install https://github.com/WorksApplications/elasticsearch-sudachi/releases/download/v7.5.0-1.3.2/analysis-sudachi-elasticsearch7.5-1.3.2.zip
```

The package does not include a dictionary. Download the latest dictionary from https://github.com/WorksApplications/SudachiDict and place it under `$ES_HOME/sudachi`. Of the three dictionaries, the core dictionary is used below.

```
$ wget https://object-storage.tyo2.conoha.io/v1/nc_2520839e1f9641b08211a5c85243124a/sudachi/sudachi-dictionary-20191030-core.zip
$ unzip sudachi-dictionary-20191030-core.zip
$ sudo mkdir /etc/elasticsearch/sudachi
$ sudo cp sudachi-dictionary-20191030/system_core.dic /etc/elasticsearch/sudachi
```

After placing the dictionary, restart Elasticsearch.
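
The restart command itself is not shown in the original; a minimal sketch, assuming a systemd-managed package install of Elasticsearch:

```
$ sudo systemctl restart elasticsearch   # assumes a systemd service named "elasticsearch"; adjust for your environment
```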

Create a settings file.

```json:analysis_sudachi.json
{
  "settings" : {
    "analysis" : {
      "filter" : {
        "romaji_readingform" : {
          "type" : "sudachi_readingform",
          "use_romaji" : true
        },
        "katakana_readingform" : {
          "type" : "sudachi_readingform",
          "use_romaji" : false
        }
      },
      "analyzer" : {
        "sudachi_baseform_analyzer" : {
          "filter" : [ "sudachi_baseform" ],
          "type" : "custom",
          "tokenizer" : "sudachi_tokenizer"
        },
        "sudachi_normalizedform_analyzer" : {
          "filter" : [ "sudachi_normalizedform" ],
          "type" : "custom",
          "tokenizer" : "sudachi_tokenizer"
        },
        "sudachi_readingform_analyzer" : {
          "filter" : [ "katakana_readingform" ],
          "type" : "custom",
          "tokenizer" : "sudachi_tokenizer"
        },
        "sudachi_romaji_analyzer" : {
          "filter" : [ "romaji_readingform" ],
          "type" : "custom",
          "tokenizer" : "sudachi_tokenizer"
        },
        "sudachi_analyzer": {
          "filter": [],
          "tokenizer": "sudachi_tokenizer",
          "type": "custom"
        }
      },
      "tokenizer" : {
        "sudachi_tokenizer": {
          "type": "sudachi_tokenizer",
          "mode": "search",
          "resources_path": "/etc/elasticsearch/sudachi"
        }
      }
    }
  }
}
```

Create an index.

```
$ curl -X PUT 'localhost:9200/test_sudachi' -H 'Content-Type: application/json' -d @analysis_sudachi.json
{"acknowledged":true,"shards_acknowledged":true,"index":"test_sudachi"}
```

Try analyzing some text.

```
$ curl -X GET "localhost:9200/test_sudachi/_analyze?pretty" -H 'Content-Type: application/json' -d'{"analyzer":"sudachi_analyzer", "text" : "関西国際空港"}'
{
  "tokens" : [
    {
      "token" : "関西国際空港",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "word",
      "position" : 0,
      "positionLength" : 3
    },
    {
      "token" : "関西",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "国際",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "空港",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "word",
      "position" : 2
    }
  ]
}
```

Because `search` mode is specified, both A-unit and C-unit tokens are output.

Output verbs and adjectives in their dictionary (terminal) form.

```
$ curl -X GET "localhost:9200/test_sudachi/_analyze?pretty" -H 'Content-Type: application/json' -d'{"analyzer":"sudachi_baseform_analyzer", "text" : "おおきく"}'
{
  "tokens" : [
    {
      "token" : "おおきい",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    }
  ]
}
```

Output tokens with their spelling normalized.

```
$ curl -X GET "localhost:9200/test_sudachi/_analyze?pretty" -H 'Content-Type: application/json' -d'{"analyzer":"sudachi_normalizedform_analyzer", "text" : "おおきく"}'
{
  "tokens" : [
    {
      "token" : "大きい",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    }
  ]
}
```

Output readings.

```
$ curl -X GET "localhost:9200/test_sudachi/_analyze?pretty" -H 'Content-Type: application/json' -d'{"analyzer":"sudachi_readingform_analyzer", "text" : "おおきく"}'
{
  "tokens" : [
    {
      "token" : "オオキク",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    }
  ]
}
```

Output readings in romaji (Microsoft IME style).

```
$ curl -X GET "localhost:9200/test_sudachi/_analyze?pretty" -H 'Content-Type: application/json' -d'{"analyzer":"sudachi_romaji_analyzer", "text" : "おおきく"}'
{
  "tokens" : [
    {
      "token" : "ookiku",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 0
    }
  ]
}
```
195+
そのほか、品詞によるトークンの除外、ストップワードなどが利用できます。
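
As an illustration, such filters can be added to the settings above. This is a hypothetical sketch: the filter names `pos_filter` and `ja_stop_filter` are arbitrary, and the `stoptags` / `stopwords` parameters follow the Kuromoji-style convention and are assumptions here, so check the plugin README for the exact options.

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "pos_filter": {
          "type": "sudachi_part_of_speech",
          "stoptags": [ "助詞", "助動詞" ]
        },
        "ja_stop_filter": {
          "type": "sudachi_ja_stop",
          "stopwords": [ "_japanese_", "これ" ]
        }
      },
      "analyzer": {
        "sudachi_filtered_analyzer": {
          "type": "custom",
          "tokenizer": "sudachi_tokenizer",
          "filter": [ "pos_filter", "ja_stop_filter" ]
        }
      }
    }
  }
}
```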
196+
197+
こちらもご参照ください: [Elasticsearchのための新しい形態素解析器 「Sudachi」 - Qiita](https://qiita.com/sorami/items/99604ef105f13d2d472b) (Elastic stack Advent Calendar 2017)

pom.xml

Lines changed: 5 additions & 5 deletions
@@ -3,17 +3,17 @@

   <groupId>com.worksap.nlp</groupId>
   <artifactId>analysis-sudachi-elasticsearch6.8</artifactId>
-  <version>1.3.2</version>
+  <version>2.0.0</version>
   <packaging>jar</packaging>

   <name>analysis-sudachi</name>

   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <java.version>1.8</java.version>
-    <elasticsearch.version>6.8.5</elasticsearch.version>
-    <lucene.version>7.7.2</lucene.version>
-    <sudachi.version>0.3.1</sudachi.version>
+    <elasticsearch.version>6.8.9</elasticsearch.version>
+    <lucene.version>7.7.3</lucene.version>
+    <sudachi.version>0.4.1</sudachi.version>
     <sonar.host.url>https://sonarcloud.io</sonar.host.url>
     <sonar.language>java</sonar.language>
     <sonar.organization>worksapplications</sonar.organization>
@@ -153,4 +153,4 @@
     <developerConnection>scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git</developerConnection>
     <url>https://github.com/WorksApplications/elasticsearch-sudachi</url>
   </scm>
-</project>
+</project>
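
For reference, a plugin archive can be built from this pom with the standard Maven lifecycle; a minimal sketch (the output location and name are assumptions based on the assembly descriptor in src/main/assemblies/plugin.xml):

```
$ mvn clean package
$ ls target/*.zip   # assumed location of the assembled plugin archive
```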

src/main/assemblies/plugin.xml

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@
       <source>src/main/extras/plugin-descriptor.properties</source>
       <filtered>true</filtered>
     </file>
+    <file>
+      <source>LICENSE</source>
+      <filtered>false</filtered>
+    </file>
   </files>
   <dependencySets>
     <dependencySet>

src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiAnalyzerProvider.java

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@
 import org.elasticsearch.index.analysis.Analysis;

 import com.worksap.nlp.lucene.sudachi.ja.SudachiAnalyzer;
-import com.worksap.nlp.lucene.sudachi.ja.SudachiTokenizer;
+import com.worksap.nlp.sudachi.Tokenizer.SplitMode;

 public class SudachiAnalyzerProvider extends
         AbstractIndexAnalyzerProvider<SudachiAnalyzer> {
@@ -39,10 +39,10 @@ public SudachiAnalyzerProvider(IndexSettings indexSettings,
         super(indexSettings, name, settings);
         final Set<?> stopWords = Analysis.parseStopWords(env, settings,
                 SudachiAnalyzer.getDefaultStopSet(), false);
-        final SudachiTokenizer.Mode mode = SudachiTokenizerFactory
+        final SplitMode mode = SudachiTokenizerFactory
                 .getMode(settings);
         final String resourcesPath = new SudachiPathResolver(env.configFile()
-                .toString(), settings.get("resources_path", name))
+                .toString(), settings.get("resources_path", "sudachi"))
                 .resolvePathForDirectory();
         final String settingsPath = new SudachiSettingsReader(env.configFile()
                 .toString(), settings.get("settings_path")).read();

src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiSplitFilterFactory.java

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
/*
 * Copyright (c) 2020 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.worksap.nlp.elasticsearch.sudachi.index;

import java.util.Locale;

import com.worksap.nlp.lucene.sudachi.ja.SudachiSplitFilter;
import com.worksap.nlp.lucene.sudachi.ja.SudachiSplitFilter.Mode;

import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

public class SudachiSplitFilterFactory extends AbstractTokenFilterFactory {

    private static final String MODE_PARAM = "mode";

    private final Mode mode;

    public SudachiSplitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        mode = Mode.valueOf(settings.get(MODE_PARAM, SudachiSplitFilter.DEFAULT_MODE.toString()).toUpperCase(Locale.ROOT));
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new SudachiSplitFilter(tokenStream, mode);
    }
}
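
This factory backs the new `sudachi_split` token filter registered in AnalysisSudachiPlugin.java below; it takes over the multi-splitting that the tokenizer's old `mode` setting used to provide. A hypothetical index-settings sketch follows: the filter and analyzer names are arbitrary, and the `"search"` value is an assumption about the values accepted by SudachiSplitFilter.Mode, so check that enum for the actual choices.

```json
{
  "settings": {
    "analysis": {
      "filter": {
        "my_sudachi_split": {
          "type": "sudachi_split",
          "mode": "search"
        }
      },
      "analyzer": {
        "sudachi_split_analyzer": {
          "type": "custom",
          "tokenizer": "sudachi_tokenizer",
          "filter": [ "my_sudachi_split" ]
        }
      }
    }
  }
}
```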

src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiTokenizerFactory.java

Lines changed: 20 additions & 12 deletions
@@ -26,10 +26,13 @@
 import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

 import com.worksap.nlp.lucene.sudachi.ja.SudachiTokenizer;
-import com.worksap.nlp.lucene.sudachi.ja.SudachiTokenizer.Mode;
+import com.worksap.nlp.sudachi.Tokenizer.SplitMode;

 public class SudachiTokenizerFactory extends AbstractTokenizerFactory {
-    private final Mode mode;
+    private static final String SPLIT_MODE_PARAM = "split_mode";
+    private static final String MODE_PARAM = "mode";
+
+    private final SplitMode mode;
     private final boolean discardPunctuation;
     private final String resourcesPath;
     private final String settingsPath;
@@ -41,23 +44,28 @@ public SudachiTokenizerFactory(IndexSettings indexSettings,
         mode = getMode(settings);
         discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
         resourcesPath = new SudachiPathResolver(env.configFile().toString(),
-                settings.get("resources_path", name)).resolvePathForDirectory();
+                settings.get("resources_path", "sudachi")).resolvePathForDirectory();
         settingsPath = new SudachiSettingsReader(env.configFile().toString(),
                 settings.get("settings_path")).read();
     }

-    public static SudachiTokenizer.Mode getMode(Settings settings) {
-        SudachiTokenizer.Mode mode = SudachiTokenizer.DEFAULT_MODE;
-        String modeSetting = settings.get("mode", null);
+    public static SplitMode getMode(Settings settings) {
+        SplitMode mode = SudachiTokenizer.DEFAULT_MODE;
+        String modeSetting = settings.get(SPLIT_MODE_PARAM, null);
         if (modeSetting != null) {
-            if ("search".equalsIgnoreCase(modeSetting)) {
-                mode = SudachiTokenizer.Mode.SEARCH;
-            } else if ("normal".equalsIgnoreCase(modeSetting)) {
-                mode = SudachiTokenizer.Mode.NORMAL;
-            } else if ("extended".equalsIgnoreCase(modeSetting)) {
-                mode = SudachiTokenizer.Mode.EXTENDED;
+            if ("a".equalsIgnoreCase(modeSetting)) {
+                mode = SplitMode.A;
+            } else if ("b".equalsIgnoreCase(modeSetting)) {
+                mode = SplitMode.B;
+            } else if ("c".equalsIgnoreCase(modeSetting)) {
+                mode = SplitMode.C;
             }
         }
+
+        if (settings.hasValue(MODE_PARAM)) {
+            throw new IllegalArgumentException(MODE_PARAM + " is deprecated, use SudachiSplitFilter");
+        }
+
         return mode;
     }
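
With this change the tokenizer's splitting unit is chosen with `split_mode` (`"A"`, `"B"`, or `"C"`, case-insensitive), and setting the old `mode` parameter now raises an error. A hypothetical settings fragment for the new-style configuration; the tokenizer and analyzer names are arbitrary, and the `resources_path` value simply mirrors the tutorial above.

```json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "sudachi_c_tokenizer": {
          "type": "sudachi_tokenizer",
          "split_mode": "C",
          "discard_punctuation": true,
          "resources_path": "/etc/elasticsearch/sudachi"
        }
      },
      "analyzer": {
        "sudachi_c_analyzer": {
          "type": "custom",
          "tokenizer": "sudachi_c_tokenizer"
        }
      }
    }
  }
}
```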

src/main/java/com/worksap/nlp/elasticsearch/sudachi/plugin/AnalysisSudachiPlugin.java

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,7 @@
 import com.worksap.nlp.elasticsearch.sudachi.index.SudachiPartOfSpeechFilterFactory;
 import com.worksap.nlp.elasticsearch.sudachi.index.SudachiReadingFormFilterFactory;
 import com.worksap.nlp.elasticsearch.sudachi.index.SudachiStopTokenFilterFactory;
+import com.worksap.nlp.elasticsearch.sudachi.index.SudachiSplitFilterFactory;
 import com.worksap.nlp.elasticsearch.sudachi.index.SudachiTokenizerFactory;

 public class AnalysisSudachiPlugin extends Plugin implements AnalysisPlugin {
@@ -46,6 +47,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         extra.put("sudachi_part_of_speech",
                 SudachiPartOfSpeechFilterFactory::new);
         extra.put("sudachi_readingform", SudachiReadingFormFilterFactory::new);
+        extra.put("sudachi_split", SudachiSplitFilterFactory::new);
         extra.put("sudachi_ja_stop", SudachiStopTokenFilterFactory::new);
         return extra;
     }
