Skip to content

Commit 1931407

Browse files
committed
release 6.4.3-1.3.0
1 parent 140741d commit 1931407

File tree

8 files changed

+192
-376
lines changed

8 files changed

+192
-376
lines changed

.gitignore

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,14 @@ buildNumber.properties
1212
!/.mvn/wrapper/maven-wrapper.jar
1313

1414
src/test/resources/com/worksap/nlp/lucene/sudachi/ja/system_core.dic
15-
bin/
15+
bin/
16+
17+
# IDE
18+
.metadata
19+
.project
20+
.classpath
21+
.checkstyle
22+
.factorypath
23+
.settings/
24+
*.iml
25+
.idea

README.md

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,13 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
77

88
# What's new?
99

10+
- version 1.3.0
11+
- Upgrade sudachi morphological analyzer to 0.2.0
12+
- Import sudachi from maven central repository
13+
- Minor bug fix
14+
1015
- version 1.2.0
11-
- Upgrading sudachi morphological analyzer to 1.2.0-SNAPSHOT
16+
- Upgrading sudachi morphological analyzer to 0.2.0-SNAPSHOT
1217
- New filter `sudachi_normalizedform` was added; see [sudachi_normalizedform](#sudachi_normalizedform)
1318
- Default normalization behavior was changed; neither the baseform filter nor the normalizedform filter is applied
1419
- `sudachi_readingform` filter was changed with new romaji mappings based on MS-IME
@@ -17,6 +22,9 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
1722
- version 1.1.0
1823
- `part-of-speech forward matching` is available on `stoptags`; see [sudachi_part_of_speech](#sudachi_part_of_speech)
1924

25+
- version 1.0.0
26+
- first release
27+
2028
# Build
2129

2230
1. Build analysis-sudachi.
@@ -26,12 +34,12 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
2634

2735
# Installation
2836

29-
Follow the steps below to install.
30-
31-
1. Change the current directory "/usr/share/elasticsearch".
32-
2. Place the zip file created with "Build" on the moved directory.
33-
3. Command "sudo bin/elasticsearch-plugin install file:///usr/share/elasticsearch/<zipfile-name>"
34-
4. Place files [system_core.dic or system_full.dic] under ES_HOME/sudachi.
37+
1. Download analysis-sudachi-elasticsearch zip archive file
38+
2. Change the current directory to $ES_HOME
39+
3. Execute "bin/elasticsearch-plugin install file:///<plugin-zip-path>"
40+
4. Download sudachi dictionary archive from https://github.com/WorksApplications/SudachiDict
41+
5. Extract the dic file and place it at config/sudachi_tokenizer/system_core.dic
42+
6. Execute "bin/elasticsearch"
3543

3644
# Configuration
3745

@@ -425,25 +433,8 @@ Returns `スシ`.
425433
```
426434
Returns `susi`.
427435

428-
# Releases
429-
430-
**1.1.0**
431-
- POS Filter: Allow forward matching; https://github.com/WorksApplications/elasticsearch-sudachi/issues/21
432-
433-
**1.0.3**
434-
- Elasticsearch 6.1 API migration
435-
436-
**1.0.2**
437-
- Elasticsearch 6.0 API migration
438-
439-
**1.0.1**
440-
- fix exception over 512 characters
441-
442-
**1.0.0**
443-
- first release
444-
445436
# License
446437

447-
Copyright (c) 2017 Works Applications Co., Ltd.
438+
Copyright (c) 2017-2019 Works Applications Co., Ltd.
448439
Originally under elasticsearch, https://www.elastic.co/jp/products/elasticsearch
449440
Originally under lucene, https://lucene.apache.org/

pom.xml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
<groupId>com.worksap.nlp</groupId>
66
<artifactId>analysis-sudachi-elasticsearch6.4</artifactId>
7-
<version>1.2.0-SNAPSHOT</version>
7+
<version>1.3.0</version>
88
<packaging>jar</packaging>
99

1010
<name>analysis-sudachi</name>
@@ -14,7 +14,7 @@
1414
<java.version>1.8</java.version>
1515
<elasticsearch.version>6.4.3</elasticsearch.version>
1616
<lucene.version>7.4.0</lucene.version>
17-
<sudachi.version>0.1.2-SNAPSHOT</sudachi.version>
17+
<sudachi.version>0.2.0</sudachi.version>
1818
<jacoco.skip>true</jacoco.skip>
1919
<sonar.skip>true</sonar.skip>
2020
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
@@ -106,7 +106,7 @@
106106
</plugin>
107107
<plugin>
108108
<artifactId>maven-surefire-plugin</artifactId>
109-
<version>2.20.1</version>
109+
<version>2.22.1</version>
110110
<configuration>
111111
<excludes>
112112
<exclude>**/TestAnalysisSudachi.java</exclude>
@@ -117,6 +117,9 @@
117117
<groupId>org.apache.maven.plugins</groupId>
118118
<artifactId>maven-javadoc-plugin</artifactId>
119119
<version>3.0.0-M1</version>
120+
<configuration>
121+
<source>${java.version}</source>
122+
</configuration>
120123
<executions>
121124
<execution>
122125
<id>attach-javadocs</id>
@@ -152,7 +155,7 @@
152155
</plugins>
153156
</build>
154157
<dependencies>
155-
<!-- Sudachi -->
158+
<!-- https://mvnrepository.com/artifact/com.worksap.nlp/sudachi -->
156159
<dependency>
157160
<groupId>com.worksap.nlp</groupId>
158161
<artifactId>sudachi</artifactId>

src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
public class PartOfSpeechTrie {
2424

2525
static final String EMPTY_SYMBOL = "*";
26+
static final String LEAF = "";
2627

2728
Map<String, Object> root = new HashMap<>();
2829

@@ -37,6 +38,7 @@ public void add(String... items) {
3738
(Map<String, Object>)node.computeIfAbsent(item, k -> new HashMap<>());
3839
node = newNode;
3940
}
41+
node.put(LEAF, LEAF);
4042
}
4143

4244
public boolean isPrefixOf(List<String> items, int begin, int end) {
@@ -47,17 +49,17 @@ public boolean isPrefixOf(List<String> items, int begin, int end) {
4749
for (int i = begin; i < end; i++) {
4850
String item = items.get(i);
4951
if (EMPTY_SYMBOL.equals(item)) {
50-
return node.isEmpty();
52+
return node.containsKey(LEAF);
5153
}
5254
@SuppressWarnings("unchecked")
5355
Map<String, Object> newNode = (Map<String, Object>)node.get(item);
5456
node = newNode;
5557
if (node == null) {
5658
return false;
57-
} else if (node.isEmpty()) {
59+
} else if (node.containsKey(LEAF)) {
5860
return true;
5961
}
6062
}
61-
return true;
63+
return node.containsKey(LEAF);
6264
}
6365
}

src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,9 @@ String readSentences() throws IOException {
227227
if (n < 0) {
228228
if (remainSize != 0) {
229229
String lastSentence = new String(buffer, 0, remainSize);
230-
remainSize = 0;
231230
baseOffset = nextBaseOffset;
231+
nextBaseOffset += remainSize;
232+
remainSize = 0;
232233
return lastSentence;
233234
}
234235
return null;

src/test/java/com/worksap/nlp/lucene/sudachi/ja/ResourceUtil.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.IOException;
2222
import java.io.InputStream;
2323
import java.io.InputStreamReader;
24+
import java.nio.charset.StandardCharsets;
2425
import java.nio.file.Files;
2526

2627
class ResourceUtil {
@@ -30,8 +31,8 @@ private ResourceUtil() {}
3031

3132
static String getSudachiSetting(InputStream is) throws IOException{
3233
String settings;
33-
try (BufferedReader br = new BufferedReader(
34-
new InputStreamReader(is));) {
34+
try (InputStreamReader ir = new InputStreamReader(is, StandardCharsets.UTF_8);
35+
BufferedReader br = new BufferedReader(ir)) {
3536
String sudachiSettingsLine = "";
3637
StringBuilder sb = new StringBuilder();
3738
while ((sudachiSettingsLine = br.readLine()) != null) {

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,21 @@ public void testConjugationForm() throws IOException {
9191
assertTokenStreamContents(tokenStream,
9292
new String[] {"東京都", "東京", "都", "に", "行っ"});
9393
}
94+
95+
public void testPrefixWithUnmatchedSubcategory() throws IOException {
96+
String tags = "助詞,格助詞\n助詞,格助詞,引用\n";
97+
factory.inform(new StringResourceLoader(tags));
98+
tokenStream = factory.create(tokenStream);
99+
assertTokenStreamContents(tokenStream,
100+
new String[] {"東京都", "東京", "都", "行っ", "た"});
101+
}
102+
103+
public void testTooLongCategory() throws IOException {
104+
String tags = "名詞,固有名詞,地名,一般,一般\n";
105+
factory.inform(new StringResourceLoader(tags));
106+
tokenStream = factory.create(tokenStream);
107+
assertTokenStreamContents(tokenStream,
108+
new String[] {"東京都", "東京", "都", "に", "行っ", "た"});
109+
}
110+
94111
}

0 commit comments

Comments
 (0)