lucene/CHANGES.txt (2 additions, 0 deletions)
@@ -37,6 +37,8 @@ New Features

* GITHUB#14792: Introduced OffHeapQuantizedFloatVectorValues class to access float vectors when only quantized byte vectors are available in the index. (Pulkit Gupta)

* GITHUB#14969: Add metadata support to Nori Korean analyzer tokens, allowing users to attach additional information to dictionary words. (twosom)
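
For illustration, a user dictionary file that uses this feature might contain entries like the sketch below. The " >> " separator is the METADATA_SEPARATOR introduced in UserDictionary further down in this diff; the concrete entries and the compound segmentation on the last line are hypothetical, chosen to mirror the inputs exercised by the new testMetadataAttribute test.

자바 >> 컴퓨터 언어
java >> 컴퓨터 언어
엘라스틱서치 엘라스틱 서치 >> 검색 엔진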

Improvements
---------------------

@@ -22,6 +22,7 @@
/** A token that was generated from a compound. */
public class DecompoundToken extends Token {
private final POS.Tag posTag;
private final String metadata;

/**
* Creates a new DecompoundToken
@@ -31,11 +32,18 @@ public class DecompoundToken extends Token {
* @param startOffset The start offset of the token in the analyzed text.
* @param endOffset The end offset of the token in the analyzed text.
* @param type The type of this token.
* @param metadata The metadata of this token.
*/
public DecompoundToken(
POS.Tag posTag, String surfaceForm, int startOffset, int endOffset, TokenType type) {
POS.Tag posTag,
String surfaceForm,
int startOffset,
int endOffset,
TokenType type,
String metadata) {
super(surfaceForm.toCharArray(), 0, surfaceForm.length(), startOffset, endOffset, type);
this.posTag = posTag;
this.metadata = metadata;
}

@Override
@@ -77,4 +85,9 @@ public String getReading() {
public KoMorphData.Morpheme[] getMorphemes() {
return null;
}

@Override
public String getMetadata() {
return metadata;
}
}
@@ -23,6 +23,7 @@
public class DictionaryToken extends Token {
private final int wordId;
private final KoMorphData morphAtts;
private String metadata = null;

public DictionaryToken(
TokenType type,
@@ -108,4 +109,21 @@ public String getReading() {
public KoMorphData.Morpheme[] getMorphemes() {
return morphAtts.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength());
}

@Override
public String getMetadata() {
return this.metadata;
}

public void setMetadata(String metadata) {
this.metadata = metadata;
}

public int getWordId() {
return wordId;
}

public KoMorphData getMorphAtts() {
return morphAtts;
}
}
@@ -25,6 +25,7 @@
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -77,6 +78,7 @@ public enum DecompoundMode {
private final Viterbi viterbi;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final MetadataAttribute metadataAtt = addAttribute(MetadataAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
@@ -233,6 +235,7 @@ public boolean incrementToken() throws IOException {
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
// token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
metadataAtt.setToken(token);
offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
posAtt.setToken(token);
readingAtt.setToken(token);
@@ -44,4 +44,6 @@ protected Token(
* token.
*/
public abstract KoMorphData.Morpheme[] getMorphemes();

public abstract String getMetadata();
}
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.dict.UserMorphData;
import org.apache.lucene.analysis.morph.ConnectionCosts;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -248,6 +249,10 @@ protected void backtrace(Position endPosData, int fromIDX) {
if (token.getPOSType() == POS.Type.MORPHEME
|| mode == KoreanTokenizer.DecompoundMode.NONE) {
if (shouldFilterToken(token) == false) {
if (token.getMorphAtts() instanceof UserMorphData userMorphData) {
final String metadata = userMorphData.metadatas[token.getWordId()];
token.setMetadata(metadata);
}
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
@@ -264,9 +269,11 @@ protected void backtrace(Position endPosData, int fromIDX) {
int endOffset = backWordPos + length;
int posLen = 0;
// decompose the compound
String metadata = null;
for (int i = morphemes.length - 1; i >= 0; i--) {
final KoMorphData.Morpheme morpheme = morphemes[i];
final Token compoundToken;
metadata = morpheme.metadata();
if (token.getPOSType() == POS.Type.COMPOUND) {
assert endOffset - morpheme.surfaceForm().length() >= 0;
compoundToken =
@@ -275,15 +282,17 @@ protected void backtrace(Position endPosData, int fromIDX) {
morpheme.surfaceForm(),
endOffset - morpheme.surfaceForm().length(),
endOffset,
backType);
backType,
metadata);
} else {
compoundToken =
new DecompoundToken(
morpheme.posTag(),
morpheme.surfaceForm(),
token.getStartOffset(),
token.getEndOffset(),
backType);
backType,
metadata);
}
if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
compoundToken.setPositionIncrement(0);
@@ -297,6 +306,7 @@ protected void backtrace(Position endPosData, int fromIDX) {
}
if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
token.setPositionLength(Math.max(1, posLen));
token.setMetadata(metadata);
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
@@ -22,7 +22,11 @@
/** Represents Korean morphological information. */
public interface KoMorphData extends MorphData {
/** A morpheme extracted from a compound token. */
record Morpheme(POS.Tag posTag, String surfaceForm) {}
record Morpheme(POS.Tag posTag, String surfaceForm, String metadata) {
public Morpheme(POS.Tag posTag, String surfaceForm) {
this(posTag, surfaceForm, null);
}
}

/**
* Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,
@@ -34,6 +34,7 @@
* (세종시 세종 시).
*/
public final class UserDictionary implements Dictionary<UserMorphData> {
public static final String METADATA_SEPARATOR = " >> ";
// text -> wordID
private final TokenInfoFST fst;

@@ -82,10 +83,19 @@ private UserDictionary(List<String> entries) throws IOException {

String lastToken = null;
List<int[]> _segmentations = new ArrayList<>(entries.size());
List<String> _metadatas = new ArrayList<>(entries.size());
short[] rightIds = new short[entries.size()];
long ord = 0;
int entryIndex = 0;
for (String entry : entries) {
if (entry.contains(METADATA_SEPARATOR)) {
var split = entry.split(METADATA_SEPARATOR);
entry = split[0];
var metadata = split[1];
_metadatas.add(metadata);
} else {
_metadatas.add(null);
}
String[] splits = entry.split("\\s+");
String token = splits[0];
if (token.equals(lastToken)) {
@@ -138,7 +148,8 @@ private UserDictionary(List<String> entries) throws IOException {
this.fst =
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
this.morphAtts = new UserMorphData(segmentations, rightIds);
String[] metadatas = _metadatas.toArray(String[]::new);
this.morphAtts = new UserMorphData(segmentations, rightIds, metadatas);
}

public TokenInfoFST getFST() {
@@ -19,7 +19,7 @@
import org.apache.lucene.analysis.ko.POS;

/** Morphological information for user dictionary. */
final class UserMorphData implements KoMorphData {
public final class UserMorphData implements KoMorphData {
private static final int WORD_COST = -100000;

// NNG left
@@ -28,12 +28,18 @@ final class UserMorphData implements KoMorphData {
// length, length... indexed by compound ID or null for simple noun
private final int[][] segmentations;
private final short[] rightIds;
public String[] metadatas;

UserMorphData(int[][] segmentations, short[] rightIds) {
this.segmentations = segmentations;
this.rightIds = rightIds;
}

UserMorphData(int[][] segmentations, short[] rightIds, String[] metadatas) {
this(segmentations, rightIds);
this.metadatas = metadatas;
}

@Override
public int getLeftId(int morphId) {
return LEFT_ID;
@@ -79,10 +85,12 @@ public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len
if (segs == null) {
return null;
}
String metadata = metadatas[morphId];
int offset = 0;
Morpheme[] morphemes = new Morpheme[segs.length];
for (int i = 0; i < segs.length; i++) {
morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
morphemes[i] =
new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]), metadata);
offset += segs[i];
}
return morphemes;
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.tokenattributes;

import org.apache.lucene.analysis.ko.Token;
import org.apache.lucene.util.Attribute;

/**
* Attribute for Korean token metadata.
*
* <p>This attribute provides access to additional metadata associated with Korean tokens,
* particularly from user dictionaries and compound word morphemes.
*
* <p>Note: in some cases this value may not be applicable, and will be null.
*
* @lucene.experimental
*/
public interface MetadataAttribute extends Attribute {
/** Get the metadata string of the token. */
String getMetadata();

/** Set the current token. */
void setToken(Token token);
}
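
A minimal end-to-end consumer of this attribute could look like the sketch below. It assumes the existing public Nori API surface (UserDictionary.open(Reader), the KoreanAnalyzer constructor taking a user dictionary, decompound mode, stop-tag set, and unknown-unigram flag, and KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS); the dictionary entries, field name, and class name are purely illustrative and mirror the test added at the end of this change.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ko.KoreanAnalyzer;
import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MetadataAttributeDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical user dictionary; metadata follows the " >> " separator.
    UserDictionary userDict =
        UserDictionary.open(new StringReader("자바 >> 컴퓨터 언어\n엘라스틱서치 >> 검색 엔진\n"));
    try (Analyzer analyzer =
        new KoreanAnalyzer(
            userDict,
            KoreanTokenizer.DecompoundMode.NONE,
            KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS,
            false)) {
      try (TokenStream ts = analyzer.tokenStream("ignored", "자바 프로그래밍")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        MetadataAttribute metadataAtt = ts.addAttribute(MetadataAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // Metadata is null for tokens that do not originate from a user dictionary entry.
          System.out.println(termAtt + " -> " + metadataAtt.getMetadata());
        }
        ts.end();
      }
    }
  }
}

Because MetadataAttributeImpl follows Lucene's interface-name-plus-"Impl" convention in the same package, addAttribute(MetadataAttribute.class) resolves to it through the default attribute factory.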
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.tokenattributes;

import org.apache.lucene.analysis.ko.Token;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

/**
* Attribute implementation for Korean token metadata.
*
* @lucene.experimental
*/
public class MetadataAttributeImpl extends AttributeImpl implements MetadataAttribute {
private Token token;

@Override
public String getMetadata() {
if (this.token != null) {
return this.token.getMetadata();
}
return null;
}

@Override
public void setToken(Token token) {
this.token = token;
}

@Override
public void clear() {
this.token = null;
}

@Override
public void copyTo(AttributeImpl target) {
final MetadataAttribute t = (MetadataAttribute) target;
t.setToken(this.token);
}

@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(MetadataAttribute.class, "metadata", getMetadata());
}
}
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
@@ -576,6 +577,27 @@ public void testDuplicate() throws IOException {
}
}

public void testMetadataAttribute() throws IOException {
assertMetadata(analyzer, "자바", "컴퓨터 언어");
assertMetadata(analyzer, "java", "컴퓨터 언어");
assertMetadata(analyzer, "엘라스틱서치", "검색 엔진");

assertMetadata(analyzerDecompoundKeep, "엘라스틱서치", "검색 엔진");
}

private void assertMetadata(Analyzer analyzer, String input, String metadata) throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
final MetadataAttribute metadataAtt = ts.addAttribute(MetadataAttribute.class);
ts.reset();
while (ts.incrementToken()) {
assertNotNull(metadataAtt.getMetadata());
assertEquals(metadata, metadataAtt.getMetadata());
}
assertFalse(ts.incrementToken());
ts.end();
}
}

private void assertReadings(Analyzer analyzer, String input, String... readings)
throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {