lucene/CHANGES.txt (2 additions, 0 deletions)
@@ -37,6 +37,8 @@ New Features

* GITHUB#14792: Introduced OffHeapQuantizedFloatVectorValues class to access float vectors when only quantized byte vectors are available in the index. (Pulkit Gupta)

* GITHUB#14969: Add metadata support to Nori Korean analyzer tokens, allowing users to attach additional information to dictionary words. (twosom)
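
For illustration, a user dictionary file that uses this feature might contain entries like the sketch below. The " >> " separator is the METADATA_SEPARATOR introduced in UserDictionary further down in this diff; the concrete entries and the compound segmentation on the last line are hypothetical, chosen to mirror the inputs exercised by the new testMetadataAttribute test.

자바 >> 컴퓨터 언어
java >> 컴퓨터 언어
엘라스틱서치 엘라스틱 서치 >> 검색 엔진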

Improvements
---------------------

@@ -22,6 +22,7 @@
/** A token that was generated from a compound. */
public class DecompoundToken extends Token {
private final POS.Tag posTag;
private final String metadata;

/**
* Creates a new DecompoundToken
@@ -31,11 +32,18 @@ public class DecompoundToken extends Token {
* @param startOffset The start offset of the token in the analyzed text.
* @param endOffset The end offset of the token in the analyzed text.
* @param type The type of this token.
* @param metadata The metadata of this token.
*/
public DecompoundToken(
POS.Tag posTag, String surfaceForm, int startOffset, int endOffset, TokenType type) {
POS.Tag posTag,
String surfaceForm,
int startOffset,
int endOffset,
TokenType type,
String metadata) {
super(surfaceForm.toCharArray(), 0, surfaceForm.length(), startOffset, endOffset, type);
this.posTag = posTag;
this.metadata = metadata;
}

@Override
@@ -77,4 +85,9 @@ public String getReading() {
public KoMorphData.Morpheme[] getMorphemes() {
return null;
}

@Override
public String getMetadata() {
return metadata;
}
}
@@ -23,6 +23,7 @@
public class DictionaryToken extends Token {
private final int wordId;
private final KoMorphData morphAtts;
private String metadata = null;

public DictionaryToken(
TokenType type,
@@ -108,4 +109,21 @@ public String getReading() {
public KoMorphData.Morpheme[] getMorphemes() {
return morphAtts.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength());
}

@Override
public String getMetadata() {
return this.metadata;
}

public void setMetadata(String metadata) {
this.metadata = metadata;
}

public int getWordId() {
return wordId;
}

public KoMorphData getMorphAtts() {
return morphAtts;
}
}
@@ -25,6 +25,7 @@
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -77,6 +78,7 @@ public enum DecompoundMode {
private final Viterbi viterbi;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final MetadataAttribute metadataAtt = addAttribute(MetadataAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
@@ -233,6 +235,7 @@ public boolean incrementToken() throws IOException {
// System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
// token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
metadataAtt.setToken(token);
offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
posAtt.setToken(token);
readingAtt.setToken(token);
@@ -44,4 +44,6 @@ protected Token(
* token.
*/
public abstract KoMorphData.Morpheme[] getMorphemes();

public abstract String getMetadata();
}
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.dict.UserMorphData;
import org.apache.lucene.analysis.morph.ConnectionCosts;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
@@ -248,6 +249,10 @@ protected void backtrace(Position endPosData, int fromIDX) {
if (token.getPOSType() == POS.Type.MORPHEME
|| mode == KoreanTokenizer.DecompoundMode.NONE) {
if (shouldFilterToken(token) == false) {
if (token.getMorphAtts() instanceof UserMorphData userMorphData) {
final String metadata = userMorphData.metadatas[token.getWordId()];
token.setMetadata(metadata);
}
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
@@ -264,9 +269,11 @@ protected void backtrace(Position endPosData, int fromIDX) {
int endOffset = backWordPos + length;
int posLen = 0;
// decompose the compound
String metadata = null;
for (int i = morphemes.length - 1; i >= 0; i--) {
final KoMorphData.Morpheme morpheme = morphemes[i];
final Token compoundToken;
metadata = morpheme.metadata();
if (token.getPOSType() == POS.Type.COMPOUND) {
assert endOffset - morpheme.surfaceForm().length() >= 0;
compoundToken =
@@ -275,15 +282,17 @@ protected void backtrace(Position endPosData, int fromIDX) {
morpheme.surfaceForm(),
endOffset - morpheme.surfaceForm().length(),
endOffset,
backType);
backType,
metadata);
} else {
compoundToken =
new DecompoundToken(
morpheme.posTag(),
morpheme.surfaceForm(),
token.getStartOffset(),
token.getEndOffset(),
backType);
backType,
metadata);
}
if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
compoundToken.setPositionIncrement(0);
@@ -297,6 +306,7 @@ protected void backtrace(Position endPosData, int fromIDX) {
}
if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
token.setPositionLength(Math.max(1, posLen));
token.setMetadata(metadata);
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
@@ -22,7 +22,11 @@
/** Represents Korean morphological information. */
public interface KoMorphData extends MorphData {
/** A morpheme extracted from a compound token. */
record Morpheme(POS.Tag posTag, String surfaceForm) {}
record Morpheme(POS.Tag posTag, String surfaceForm, String metadata) {
public Morpheme(POS.Tag posTag, String surfaceForm) {
this(posTag, surfaceForm, null);
}
}

/**
* Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,
@@ -34,6 +34,7 @@
* (세종시 세종 시).
*/
public final class UserDictionary implements Dictionary<UserMorphData> {
public static final String METADATA_SEPARATOR = " >> ";
// text -> wordID
private final TokenInfoFST fst;

@@ -82,10 +83,19 @@ private UserDictionary(List<String> entries) throws IOException {

String lastToken = null;
List<int[]> _segmentations = new ArrayList<>(entries.size());
List<String> _metadatas = new ArrayList<>(entries.size());
short[] rightIds = new short[entries.size()];
long ord = 0;
int entryIndex = 0;
for (String entry : entries) {
if (entry.contains(METADATA_SEPARATOR)) {
var split = entry.split(METADATA_SEPARATOR);
entry = split[0];
var metadata = split[1];
_metadatas.add(metadata);
} else {
_metadatas.add(null);
}
String[] splits = entry.split("\\s+");
String token = splits[0];
if (token.equals(lastToken)) {
@@ -138,7 +148,8 @@ private UserDictionary(List<String> entries) throws IOException {
this.fst =
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
this.morphAtts = new UserMorphData(segmentations, rightIds);
String[] metadatas = _metadatas.toArray(String[]::new);
this.morphAtts = new UserMorphData(segmentations, rightIds, metadatas);
}

public TokenInfoFST getFST() {
@@ -19,7 +19,7 @@
import org.apache.lucene.analysis.ko.POS;

/** Morphological information for user dictionary. */
final class UserMorphData implements KoMorphData {
public final class UserMorphData implements KoMorphData {
private static final int WORD_COST = -100000;

// NNG left
@@ -28,12 +28,18 @@ final class UserMorphData implements KoMorphData {
// length, length... indexed by compound ID or null for simple noun
private final int[][] segmentations;
private final short[] rightIds;
public String[] metadatas;

UserMorphData(int[][] segmentations, short[] rightIds) {
this.segmentations = segmentations;
this.rightIds = rightIds;
}

UserMorphData(int[][] segmentations, short[] rightIds, String[] metadatas) {
this(segmentations, rightIds);
this.metadatas = metadatas;
}

@Override
public int getLeftId(int morphId) {
return LEFT_ID;
@@ -79,10 +85,12 @@ public Morpheme[] getMorphemes(int morphId, char[] surfaceForm, int off, int len
if (segs == null) {
return null;
}
String metadata = metadatas[morphId];
int offset = 0;
Morpheme[] morphemes = new Morpheme[segs.length];
for (int i = 0; i < segs.length; i++) {
morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
morphemes[i] =
new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]), metadata);
offset += segs[i];
}
return morphemes;
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.tokenattributes;

import org.apache.lucene.analysis.ko.Token;
import org.apache.lucene.util.Attribute;

/**
* Attribute for Korean token metadata.
*
* <p>This attribute provides access to additional metadata associated with Korean tokens,
* particularly from user dictionaries and compound word morphemes.
*
* <p>Note: in some cases this value may not be applicable, and will be null.
*
* @lucene.experimental
*/
public interface MetadataAttribute extends Attribute {
/** Get the metadata string of the token. */
String getMetadata();

/** Set the current token. */
void setToken(Token token);
}
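
A minimal end-to-end consumer of this attribute could look like the sketch below. It assumes the existing public Nori API surface (UserDictionary.open(Reader), the KoreanAnalyzer constructor taking a user dictionary, decompound mode, stop-tag set, and unknown-unigram flag, and KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS); the dictionary entries, field name, and class name are purely illustrative and mirror the test added at the end of this change.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ko.KoreanAnalyzer;
import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MetadataAttributeDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical user dictionary; metadata follows the " >> " separator.
    UserDictionary userDict =
        UserDictionary.open(new StringReader("자바 >> 컴퓨터 언어\n엘라스틱서치 >> 검색 엔진\n"));
    try (Analyzer analyzer =
        new KoreanAnalyzer(
            userDict,
            KoreanTokenizer.DecompoundMode.NONE,
            KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS,
            false)) {
      try (TokenStream ts = analyzer.tokenStream("ignored", "자바 프로그래밍")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        MetadataAttribute metadataAtt = ts.addAttribute(MetadataAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // Metadata is null for tokens that do not originate from a user dictionary entry.
          System.out.println(termAtt + " -> " + metadataAtt.getMetadata());
        }
        ts.end();
      }
    }
  }
}

Because MetadataAttributeImpl follows Lucene's interface-name-plus-"Impl" convention in the same package, addAttribute(MetadataAttribute.class) resolves to it through the default attribute factory.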
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.tokenattributes;

import org.apache.lucene.analysis.ko.Token;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

/**
* Attribute implementation for Korean token metadata.
*
* @lucene.experimental
*/
public class MetadataAttributeImpl extends AttributeImpl implements MetadataAttribute {
private Token token;

@Override
public String getMetadata() {
if (this.token != null) {
return this.token.getMetadata();
}
return null;
}

@Override
public void setToken(Token token) {
this.token = token;
}

@Override
public void clear() {
this.token = null;
}

@Override
public void copyTo(AttributeImpl target) {
final MetadataAttribute t = (MetadataAttribute) target;
t.setToken(this.token);
}

@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(MetadataAttribute.class, "metadata", getMetadata());
}
}
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.MetadataAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
@@ -576,6 +577,27 @@ public void testDuplicate() throws IOException {
}
}

public void testMetadataAttribute() throws IOException {
assertMetadata(analyzer, "자바", "컴퓨터 언어");
assertMetadata(analyzer, "java", "컴퓨터 언어");
assertMetadata(analyzer, "엘라스틱서치", "검색 엔진");

assertMetadata(analyzerDecompoundKeep, "엘라스틱서치", "검색 엔진");
}

private void assertMetadata(Analyzer analyzer, String input, String metadata) throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
final MetadataAttribute metadataAtt = ts.addAttribute(MetadataAttribute.class);
ts.reset();
while (ts.incrementToken()) {
assertNotNull(metadataAtt.getMetadata());
assertEquals(metadata, metadataAtt.getMetadata());
}
assertFalse(ts.incrementToken());
ts.end();
}
}

private void assertReadings(Analyzer analyzer, String input, String... readings)
throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {