release 6.4.3-1.3.0

liu-to · liu-to · commit 19314074052a · 2019-06-26T18:59:23.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,14 @@ buildNumber.properties
 !/.mvn/wrapper/maven-wrapper.jar
 
 src/test/resources/com/worksap/nlp/lucene/sudachi/ja/system_core.dic
-bin/
+bin/
+
+# IDE
+.metadata
+.project
+.classpath
+.checkstyle
+.factorypath
+.settings/
+*.iml
+.idea
diff --git a/README.md b/README.md
@@ -7,8 +7,13 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
 
 # What's new?
 
+- version 1.3.0
+    - Upgrade sudachi morphological analyzer to 0.2.0
+    - Import sudachi from maven central repository
+    - Minor bug fix
+
 - version 1.2.0
-    - Upgrading sudachi morphological analyzer to 1.2.0-SNAPSHOT
+    - Upgrading sudachi morphological analyzer to 0.2.0-SNAPSHOT
     - New filter `sudachi_normalizedform` was added; see [sudachi_normalizedform](#sudachi_normalizedform)
     - Default normalization behavior was changed; neither the baseform filter nor the normalizedform filter is applied
     - `sudachi_readingform` filter was changed with new romaji mappings based on MS-IME
@@ -17,6 +22,9 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
 - version 1.1.0
     - `part-of-speech forward matching` is available on `stoptags`; see [sudachi_part_of_speech](#sudachi_part_of_speech)
 
+- version 1.0.0
+    - first release
+
 # Build
 
 1. Build analysis-sudachi.
@@ -26,12 +34,12 @@ analysis-sudachi is an Elasticsearch plugin for tokenization of Japanese text us
 
 # Installation
 
-Follow the steps below to install.
-
-1. Change the current directory "/usr/share/elasticsearch".
-2. Place the zip file created with "Build" on the moved directory.
-3. Command "sudo bin/elasticsearch-plugin install file:///usr/share/elasticsearch/<zipfile-name>"
-4. Place files [system_core.dic or system_full.dic] under ES_HOME/sudachi.
+1. Download the analysis-sudachi-elasticsearch zip archive file
+2. Change the current directory to $ES_HOME
+3. Execute "bin/elasticsearch-plugin install file:///<plugin-zip-path>"
+4. Download a sudachi dictionary archive from https://github.com/WorksApplications/SudachiDict
+5. Extract the dic file and place it at config/sudachi_tokenizer/system_core.dic
+6. Execute "bin/elasticsearch"
 
 # Configuration
 
@@ -425,25 +433,8 @@ Returns `スシ`.
 ```
 Returns `susi`.
 
-# Releases
-
-**1.1.0**
-- POS Filter: Allow forward matching; https://github.com/WorksApplications/elasticsearch-sudachi/issues/21
-
-**1.0.3**
-- Elasticsearch 6.1 API migration
-
-**1.0.2**
-- Elasticsearch 6.0 API migration
-
-**1.0.1**
-- fix exception over 512 characters
-
-**1.0.0**
-- first release
-
 # License
 
-Copyright (c) 2017 Works Applications Co., Ltd.
+Copyright (c) 2017-2019 Works Applications Co., Ltd.
 Originally under elasticsearch, https://www.elastic.co/jp/products/elasticsearch
 Originally under lucene, https://lucene.apache.org/
diff --git a/pom.xml b/pom.xml
@@ -4,7 +4,7 @@
 
 	<groupId>com.worksap.nlp</groupId>
     <artifactId>analysis-sudachi-elasticsearch6.4</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0</version>
 	<packaging>jar</packaging>
 
 	<name>analysis-sudachi</name>
@@ -14,7 +14,7 @@
 		<java.version>1.8</java.version>
 		<elasticsearch.version>6.4.3</elasticsearch.version>
 		<lucene.version>7.4.0</lucene.version>
-		<sudachi.version>0.1.2-SNAPSHOT</sudachi.version>
+		<sudachi.version>0.2.0</sudachi.version>
 		<jacoco.skip>true</jacoco.skip>
 		<sonar.skip>true</sonar.skip>
 		<sonar.host.url>https://sonarcloud.io</sonar.host.url>
@@ -106,7 +106,7 @@
 			</plugin>
 			<plugin>
 				<artifactId>maven-surefire-plugin</artifactId>
-				<version>2.20.1</version>
+				<version>2.22.1</version>
 				<configuration>
 					<excludes>
 						<exclude>**/TestAnalysisSudachi.java</exclude>
@@ -117,6 +117,9 @@
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-javadoc-plugin</artifactId>
 				<version>3.0.0-M1</version>
+				<configuration>
+					<source>${java.version}</source>
+				</configuration>
 				<executions>
 					<execution>
 						<id>attach-javadocs</id>
@@ -152,7 +155,7 @@
 		</plugins>
 	</build>
 	<dependencies>
-		<!-- Sudachi -->
+		<!-- https://mvnrepository.com/artifact/com.worksap.nlp/sudachi -->
 		<dependency>
 			<groupId>com.worksap.nlp</groupId>
 			<artifactId>sudachi</artifactId>
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java
@@ -23,6 +23,7 @@
 public class PartOfSpeechTrie {
 
     static final String EMPTY_SYMBOL = "*";
+    static final String LEAF = "";
 
     Map<String, Object> root = new HashMap<>();
 
@@ -37,6 +38,7 @@ public void add(String... items) {
                 (Map<String, Object>)node.computeIfAbsent(item, k -> new HashMap<>());
             node = newNode;
         }
+        node.put(LEAF, LEAF);
     }
 
     public boolean isPrefixOf(List<String> items, int begin, int end) {
@@ -47,17 +49,17 @@ public boolean isPrefixOf(List<String> items, int begin, int end) {
         for (int i = begin; i < end; i++) {
             String item = items.get(i);
             if (EMPTY_SYMBOL.equals(item)) {
-                return node.isEmpty();
+                return node.containsKey(LEAF);
             }
             @SuppressWarnings("unchecked")
             Map<String, Object> newNode = (Map<String, Object>)node.get(item);
             node = newNode;
             if (node == null) {
                 return false;
-            } else if (node.isEmpty()) {
+            } else if (node.containsKey(LEAF)) {
                 return true;
             }
         }
-        return true;
+        return node.containsKey(LEAF);
     }
 }
diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java
@@ -227,8 +227,9 @@ String readSentences() throws IOException {
         if (n < 0) {
             if (remainSize != 0) {
                 String lastSentence = new String(buffer, 0, remainSize);
-                remainSize = 0;
                 baseOffset = nextBaseOffset;
+                nextBaseOffset += remainSize;
+                remainSize = 0;
                 return lastSentence;
             }
             return null;
diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/ResourceUtil.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/ResourceUtil.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 
 class ResourceUtil {
@@ -30,8 +31,8 @@ private ResourceUtil() {}
 
     static String getSudachiSetting(InputStream is) throws IOException{
         String settings;
-        try (BufferedReader br = new BufferedReader(
-                        new InputStreamReader(is));) {
+        try (InputStreamReader ir = new InputStreamReader(is, StandardCharsets.UTF_8);
+            BufferedReader br = new BufferedReader(ir)) {
             String sudachiSettingsLine = "";
             StringBuilder sb = new StringBuilder();
             while ((sudachiSettingsLine = br.readLine()) != null) {
diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java
@@ -91,4 +91,21 @@ public void testConjugationForm() throws IOException {
         assertTokenStreamContents(tokenStream,
                                   new String[] {"東京都", "東京", "都", "に", "行っ"});
     }
+
+    public void testPrefixWithUnmatchedSubcategory() throws IOException {
+        String tags = "助詞,格助詞\n助詞,格助詞,引用\n";
+        factory.inform(new StringResourceLoader(tags));
+        tokenStream = factory.create(tokenStream);
+        assertTokenStreamContents(tokenStream,
+                                  new String[] {"東京都", "東京", "都", "行っ", "た"});
+    }
+
+    public void testTooLongCategory() throws IOException {
+        String tags = "名詞,固有名詞,地名,一般,一般\n";
+        factory.inform(new StringResourceLoader(tags));
+        tokenStream = factory.create(tokenStream);
+        assertTokenStreamContents(tokenStream,
+                                  new String[] {"東京都", "東京", "都", "に", "行っ", "た"});
+    }
+
 }
diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@`
`23`	`23`	`public class PartOfSpeechTrie {`
`24`	`24`
`25`	`25`	`static final String EMPTY_SYMBOL = "*";`
	`26`	`+ static final String LEAF = "";`
`26`	`27`
`27`	`28`	`Map<String, Object> root = new HashMap<>();`
`28`	`29`
`@@ -37,6 +38,7 @@ public void add(String... items) {`
`37`	`38`	`(Map<String, Object>)node.computeIfAbsent(item, k -> new HashMap<>());`
`38`	`39`	`node = newNode;`
`39`	`40`	`}`
	`41`	`+ node.put(LEAF, LEAF);`
`40`	`42`	`}`
`41`	`43`
`42`	`44`	`public boolean isPrefixOf(List<String> items, int begin, int end) {`
`@@ -47,17 +49,17 @@ public boolean isPrefixOf(List<String> items, int begin, int end) {`
`47`	`49`	`for (int i = begin; i < end; i++) {`
`48`	`50`	`String item = items.get(i);`
`49`	`51`	`if (EMPTY_SYMBOL.equals(item)) {`
`50`		`- return node.isEmpty();`
	`52`	`+ return node.containsKey(LEAF);`
`51`	`53`	`}`
`52`	`54`	`@SuppressWarnings("unchecked")`
`53`	`55`	`Map<String, Object> newNode = (Map<String, Object>)node.get(item);`
`54`	`56`	`node = newNode;`
`55`	`57`	`if (node == null) {`
`56`	`58`	`return false;`
`57`		`- } else if (node.isEmpty()) {`
	`59`	`+ } else if (node.containsKey(LEAF)) {`
`58`	`60`	`return true;`
`59`	`61`	`}`
`60`	`62`	`}`
`61`		`- return true;`
	`63`	`+ return node.containsKey(LEAF);`
`62`	`64`	`}`
`63`	`65`	`}`