Skip to content

Commit 325402d

Browse files
author
Kazuma TAKAOKA
committed
Merge branch 'develop' into release/5.6
1 parent 3a10a7b commit 325402d

File tree

10 files changed

+310
-56
lines changed

10 files changed

+310
-56
lines changed

pom.xml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,25 @@
33

44
<groupId>com.worksap.nlp</groupId>
55
<artifactId>analysis-sudachi-elasticsearch5.6</artifactId>
6-
<version>1.3.1-SNAPSHOT</version>
6+
<version>1.3.1</version>
77
<packaging>jar</packaging>
88

99
<name>analysis-sudachi</name>
1010

1111
<properties>
1212
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
1313
<java.version>1.8</java.version>
14-
<elasticsearch.version>5.6.16</elasticsearch.version>
15-
<lucene.version>6.6.3</lucene.version>
16-
<sudachi.version>0.2.0</sudachi.version>
17-
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
18-
<sonar.language>java</sonar.language>
19-
<sonar.organization>worksapplications</sonar.organization>
20-
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
21-
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
22-
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
23-
<sonar.junit.reportsPath/>
24-
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
14+
<elasticsearch.version>5.6.16</elasticsearch.version>
15+
<lucene.version>6.6.3</lucene.version>
16+
<sudachi.version>0.3.0</sudachi.version>
17+
<sonar.host.url>https://sonarcloud.io</sonar.host.url>
18+
<sonar.language>java</sonar.language>
19+
<sonar.organization>worksapplications</sonar.organization>
20+
<sonar.links.homepage>https://github.com/WorksApplications/elasticsearch-sudachi</sonar.links.homepage>
21+
<sonar.links.ci>https://travis-ci.org/WorksApplications/elasticsearch-sudachi</sonar.links.ci>
22+
<sonar.links.issue>https://github.com/WorksApplications/elasticsearch-sudachi/issues</sonar.links.issue>
23+
<sonar.junit.reportsPath />
24+
<sonar.junit.reportPaths>${project.build.directory}/surefire-reports</sonar.junit.reportPaths>
2525
</properties>
2626

2727
<build>
@@ -153,4 +153,4 @@
153153
<developerConnection>scm:git:git@github.com:WorksApplications/elasticsearch-sudachi.git</developerConnection>
154154
<url>https://github.com/WorksApplications/elasticsearch-sudachi</url>
155155
</scm>
156-
</project>
156+
</project>

src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -223,18 +223,20 @@ String readSentences() throws IOException {
223223
offset = remainSize;
224224
length -= remainSize;
225225
}
226-
int n = input.read(buffer, offset, length);
227-
if (n < 0) {
228-
if (remainSize != 0) {
229-
String lastSentence = new String(buffer, 0, remainSize);
230-
baseOffset = nextBaseOffset;
231-
nextBaseOffset += remainSize;
232-
remainSize = 0;
233-
return lastSentence;
226+
227+
while (length != 0) {
228+
int ret = input.read(buffer, offset, length);
229+
if (ret < 0) {
230+
break;
234231
}
232+
offset += ret;
233+
length -= ret;
234+
}
235+
int n = offset;
236+
237+
if (n == 0) {
235238
return null;
236239
}
237-
n += offset;
238240

239241
int eos = lastIndexOfEos(buffer, n);
240242
if (eos == n && Character.isHighSurrogate(buffer[n - 1])) {

src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Romanizer.java

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
6868
case 'ト':
6969
builder.append('t');
7070
break main;
71-
case 'ナ':
72-
case 'ニ':
73-
case 'ヌ':
74-
case 'ネ':
75-
case 'ノ':
76-
builder.append('n');
77-
break main;
7871
case 'ハ':
7972
case 'ヒ':
8073
case 'フ':
@@ -132,6 +125,9 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
132125
case 'ポ':
133126
builder.append('p');
134127
break main;
128+
case 'ヴ':
129+
builder.append('v');
130+
break main;
135131
default:
136132
builder.append("ltu");
137133
}
@@ -337,10 +333,6 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
337333
builder.append("tsi");
338334
i++;
339335
break;
340-
case 'ゥ':
341-
builder.append("tsu");
342-
i++;
343-
break;
344336
case 'ェ':
345337
builder.append("tse");
346338
i++;
@@ -512,7 +504,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
512504
i++;
513505
break;
514506
default:
515-
builder.append("ho");
507+
builder.append("hu");
516508
break;
517509
}
518510
break;
@@ -679,7 +671,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
679671
i++;
680672
break;
681673
case 'ゥ':
682-
builder.append("qwu");
674+
builder.append("gwu");
683675
i++;
684676
break;
685677
case 'ェ':
@@ -771,7 +763,7 @@ public static void getRomanization(Appendable builder, CharSequence s) throws IO
771763
}
772764
break;
773765
case 'ヅ':
774-
builder.append("zu");
766+
builder.append("du");
775767
break;
776768
case 'デ':
777769
switch(ch2) {

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiAnalyzer.java

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,10 @@
2626
import java.util.ArrayList;
2727
import java.util.List;
2828

29-
import org.apache.lucene.document.Document;
3029
import org.apache.lucene.document.Field;
30+
import org.apache.lucene.document.Document;
3131
import org.apache.lucene.document.TextField;
32-
import org.apache.lucene.index.DirectoryReader;
33-
import org.apache.lucene.index.FieldInfos;
34-
import org.apache.lucene.index.IndexReader;
35-
import org.apache.lucene.index.IndexWriter;
36-
import org.apache.lucene.index.IndexWriterConfig;
37-
import org.apache.lucene.index.LeafReader;
38-
import org.apache.lucene.index.LeafReaderContext;
39-
import org.apache.lucene.index.Terms;
40-
import org.apache.lucene.index.TermsEnum;
32+
import org.apache.lucene.index.*;
4133
import org.apache.lucene.queryparser.classic.QueryParser;
4234
import org.apache.lucene.search.IndexSearcher;
4335
import org.apache.lucene.search.Query;

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import java.io.InputStream;
2222
import java.io.StringReader;
2323
import java.util.HashMap;
24-
import java.util.Map;
2524

2625
import org.junit.Rule;
2726
import org.junit.rules.TemporaryFolder;

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilterFactory.java

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import java.io.InputStream;
2222
import java.io.StringReader;
2323
import java.util.HashMap;
24-
import java.util.Map;
2524

2625
import org.junit.Rule;
2726
import org.junit.rules.TemporaryFolder;
@@ -52,14 +51,12 @@ public void setUp() throws Exception {
5251

5352
public void testBasics() throws IOException {
5453
String tags = "動詞,非自立可能\n";
55-
TokenStream ts = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
56-
((Tokenizer)ts).setReader(new StringReader("東京都に行った。"));
57-
Map<String, String> args = new HashMap<>();
58-
args.put("tags", "stoptags.txt");
54+
Tokenizer tokenizer = new SudachiTokenizer(true, SudachiTokenizer.Mode.NORMAL, path, settings);
55+
tokenizer.setReader(new StringReader("東京都に行った。"));
5956
SudachiPartOfSpeechStopFilterFactory factory
60-
= new SudachiPartOfSpeechStopFilterFactory(args);
57+
= new SudachiPartOfSpeechStopFilterFactory(new HashMap<String, String>() {{ put("tags", "stoptags.txt"); }});
6158
factory.inform(new StringResourceLoader(tags));
62-
ts = factory.create(ts);
59+
TokenStream ts = factory.create(tokenizer);
6360
assertTokenStreamContents(ts,
6461
new String[] {"東京都", "に", "た"});
6562
}
src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiReadingFormFilter.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (c) 2019 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.lucene.sudachi.ja;
18+
19+
import java.io.File;
20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.io.StringReader;
23+
import java.util.Collections;
24+
import java.util.HashMap;
25+
26+
import org.junit.Rule;
27+
import org.junit.rules.TemporaryFolder;
28+
29+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
30+
import org.apache.lucene.analysis.TokenStream;
31+
import org.apache.lucene.analysis.Tokenizer;
32+
33+
/**
 * Token-stream tests for the filter created by {@code SudachiReadingFormFilterFactory}
 * when it is applied on top of a {@code SudachiTokenizer}.
 *
 * Each test feeds the same Japanese sentence to the tokenizer, wraps it with the
 * factory-created filter and compares the emitted terms with a fixed expected list.
 */
public class TestSudachiReadingFormFilter extends BaseTokenStreamTestCase {
34+
// Starts out as the bare tokenizer built in setUp(); each test reassigns it to the filtered stream.
TokenStream tokenStream;
35+
36+
@Rule
37+
// Temporary directory that receives the "sudachiDictionary" folder handed to the tokenizer.
public TemporaryFolder tempFolderForDictionary = new TemporaryFolder();
38+
39+
// Builds a SEARCH-mode SudachiTokenizer from a temp dictionary folder and the "sudachi.json" resource.
// NOTE(review): this calls super.setUp(), so it appears to override a base-class method;
// consider adding @Override - confirm against BaseTokenStreamTestCase.
public void setUp() throws Exception {
40+
super.setUp();
41+
// create() is invoked explicitly before newFolder() is used.
tempFolderForDictionary.create();
42+
File tempFileForDictionary = tempFolderForDictionary
43+
.newFolder("sudachiDictionary");
44+
// presumably copies the test dictionary resources into the folder - verify in ResourceUtil
ResourceUtil.copy(tempFileForDictionary);
45+
46+
String settings;
47+
// The settings are read from a classpath resource resolved relative to this test class.
try (InputStream is = this.getClass().getResourceAsStream("sudachi.json")) {
48+
settings = ResourceUtil.getSudachiSetting(is);
49+
}
50+
51+
tokenStream = new SudachiTokenizer(true, SudachiTokenizer.Mode.SEARCH, tempFileForDictionary.getPath(), settings);
52+
}
53+
54+
// Factory configured with an empty argument map: the expected terms are katakana strings.
public void testReadingForm() throws IOException {
55+
SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(Collections.emptyMap());
56+
// The reader has to be set on the tokenizer itself, hence the cast before wrapping.
((Tokenizer)tokenStream).setReader(new StringReader("東京都に行った。"));
57+
tokenStream = factory.create(tokenStream);
58+
assertTokenStreamContents(tokenStream, new String[] {"トウキョウト", "トウキョウ", "ト", "ニ", "イッ", "タ"});
59+
}
60+
61+
// Factory configured with useRomaji=true: the expected terms are Latin-letter strings.
public void testRomanizedReadingForm() throws IOException {
62+
// NOTE(review): the {{ ... }} form creates an anonymous HashMap subclass just to hold one entry;
// a single-entry map such as Collections.singletonMap would avoid that - confirm the factory
// accepts an unmodifiable map before changing it.
SudachiReadingFormFilterFactory factory = new SudachiReadingFormFilterFactory(new HashMap<String, String>() {{ put("useRomaji", "true"); }});
63+
((Tokenizer)tokenStream).setReader(new StringReader("東京都に行った。"));
64+
tokenStream = factory.create(tokenStream);
65+
assertTokenStreamContents(tokenStream, new String[] {"toukyouto", "toukyou", "to", "ni", "iltu", "ta"});
66+
}
67+
}

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.java

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@
1818
package com.worksap.nlp.lucene.sudachi.ja;
1919

2020
import static org.hamcrest.CoreMatchers.is;
21-
import static org.junit.Assert.assertThat;
2221

2322
import java.io.File;
2423
import java.io.IOException;
2524
import java.io.InputStream;
25+
import java.io.Reader;
2626
import java.io.StringReader;
2727

2828
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -362,4 +362,46 @@ public void testReadSentencesWithSurrogatePair() throws IOException {
362362
}
363363
}
364364

365+
/**
 * Test helper: a Reader over the characters of a string that hands out at most
 * {@code chunkSize} characters per {@code read} call, so a caller has to call
 * {@code read} repeatedly to obtain the whole input.
 */
private static class ChunkedStringReader extends Reader {
366+
// Characters of the source string.
private char[] in;
367+
// Upper bound on the number of characters returned by a single read call.
private int chunkSize;
368+
// Index into 'in' of the next character to hand out.
private int pos;
369+
public ChunkedStringReader(String in, int chunkSize) {
370+
this.in = in.toCharArray();
371+
this.chunkSize = chunkSize;
372+
this.pos = 0;
373+
}
374+
375+
@Override
376+
// Moves the position to the end, so every later read() reports end of stream (-1).
public void close() throws IOException {
377+
this.pos = this.in.length;
378+
}
379+
380+
@Override
381+
// Copies min(len, chunkSize, remaining) characters into cbuf at off and returns that count;
// returns -1 when that count is 0.
// NOTE(review): the count is also 0 when len == 0 (or chunkSize == 0), so -1 is returned
// even if characters remain; java.io.StringReader returns 0 for len == 0 - confirm
// whether any caller passes a zero length.
public int read(char[] cbuf, int off, int len) throws IOException {
382+
int length = len < this.chunkSize ? len : this.chunkSize;
383+
// Clamp to the number of characters that are still unread.
if (length > this.in.length - this.pos) {
384+
length = this.in.length - this.pos;
385+
}
386+
if (length == 0) {
387+
return -1;
388+
}
389+
System.arraycopy(this.in, this.pos, cbuf, off, length);
390+
this.pos += length;
391+
return length;
392+
}
393+
}
394+
395+
@Test
396+
// Checks that readSentences() returns the complete input when the underlying reader
// delivers it in pieces: "Elasticsearch" is served at most 5 characters per read call.
// 'tokenizer' is a field declared outside this hunk.
public void testReadSentencesFromChunkedCharFilter() throws IOException {
397+
String inputString = "Elasticsearch";
398+
Reader charFilter = new ChunkedStringReader(inputString, 5);
399+
tokenizer.setReader(charFilter);
400+
tokenizer.reset();
401+
// A single expected result: the whole input, not just the first chunk.
String[] answerList = { "Elasticsearch" };
402+
for (int i = 0; i < answerList.length; i++) {
403+
assertThat(tokenizer.readSentences(), is(answerList[i]));
404+
}
405+
}
406+
365407
}

0 commit comments

Comments
 (0)