Skip to content
This repository was archived by the owner on Oct 29, 2023. It is now read-only.

Commit 4f893a1

Browse files
committed
Update to Elasticsearch 8.3.3
This also meant removing the lingua implementation due to issues with the security manager.
1 parent 5dbec5e commit 4f893a1

File tree

8 files changed

+27
-118
lines changed

8 files changed

+27
-118
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM docker.elastic.co/elasticsearch/elasticsearch:8.3.2
1+
FROM docker.elastic.co/elasticsearch/elasticsearch:8.3.3
22

33
ADD build/distribution/elasticsearch-ingest-langdetect.zip /elasticsearch-ingest-langdetect.zip
44
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch file:///elasticsearch-ingest-langdetect.zip

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
# Elasticsearch Langdetect Ingest Processor
22

3-
Uses the [langdetect](https://github.com/YouCruit/language-detection/) plugin (or alternatively [lingua](https://github.com/pemistahl/lingua/)) to try to find out the language used in a field.
3+
Uses the [langdetect](https://github.com/YouCruit/language-detection/) plugin.
44

55
Note that Elasticsearch has native support for langdetection nowadays using the
66
`inference` ingest processor. See more in
77
[the documentation](https://www.elastic.co/guide/en/machine-learning/current/ml-lang-ident.html)
88

9+
**Note**: As of Elasticsearch 8.3.3 the lingua implementation has been
10+
removed again due to issues with the security manager. Feel free to check
11+
out previous commits and create a PR if you get it working to include it
12+
again.
13+
914
## Installation
1015

1116
| ES | Command |
1217
| ----- | ------- |
18+
| 8.3.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.3.1/ingest-langdetect-8.3.3.1.zip` |
1319
| 8.3.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.2.1/ingest-langdetect-8.3.2.1.zip` |
1420
| 8.3.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.1.1/ingest-langdetect-8.3.1.1.zip` |
1521
| 8.3.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-langdetect/releases/download/8.3.0.1/ingest-langdetect-8.3.0.1.zip` |
@@ -227,12 +233,6 @@ GET my-index/doc/2
227233
| target_field | Field name to write the language to |
228234
| max_length | Max length of characters to read, defaults to 10kb, requires a byte size value, like 1mb |
229235
| ignore_missing | Ignore a missing source field instead of throwing an exception. Expects a boolean value, defaults to false. |
230-
| implementation | **Exists only from 8.0 onwards**: Can be 'lingua' to use the [lingua](https://github.com/pemistahl/lingua/) language detector library, everything else defaults to use the [langdetect](https://github.com/YouCruit/language-detection/) code. |
231-
232-
**Note**: The `lingua` implementation requires **a lot more** memory
233-
for your nodes having the ingest role. Please test this before using in
234-
production. The memory is only used once the `lingua` processor is actually
235-
used in a pipeline.
236236

237237
## Setup
238238

build.gradle

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import org.apache.tools.ant.filters.ReplaceTokens
44
plugins {
55
// the old co.riiid.gradle is not gradle 7.0 compatible
66
id "com.github.humblerookie.gradle" version "0.4.4"
7-
id "com.github.ben-manes.versions" version '0.41.0'
7+
id "com.github.ben-manes.versions" version '0.42.0'
88
}
99

1010
repositories {
@@ -35,6 +35,7 @@ task copyDependencies(type: Copy) {
3535
from configurations.default
3636
from 'NOTICE.txt'
3737
from 'LICENSE.txt'
38+
from 'src/main/resources/plugin-security.policy'
3839
}
3940

4041
task packageDistribution(type: Zip) {
@@ -109,21 +110,20 @@ githubRelease.doFirst {
109110
githubRelease.dependsOn 'packageDistribution'
110111

111112
dependencies {
112-
def junitVersion = '5.8.2'
113+
def junitVersion = '5.9.0'
113114

114115
implementation 'com.youcruit.com.cybozu.labs:langdetect:1.1.2-20151117'
115-
implementation 'com.github.pemistahl:lingua:1.1.1'
116116
implementation 'net.arnx:jsonic:1.3.10'
117117
compileOnly "org.elasticsearch:elasticsearch:$elasticsearchVersion"
118118

119119
testImplementation "org.elasticsearch:elasticsearch:$elasticsearchVersion"
120120
testImplementation "co.elastic.clients:elasticsearch-java:$elasticsearchVersion"
121-
testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.2'
122-
testImplementation('org.testcontainers:elasticsearch:1.17.1') {
121+
testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
122+
testImplementation('org.testcontainers:elasticsearch:1.17.3') {
123123
exclude group: 'junit', module: 'junit'
124124
}
125-
testImplementation 'org.testcontainers:junit-jupiter:1.17.1'
126-
testImplementation "org.assertj:assertj-core:3.22.0"
125+
testImplementation 'org.testcontainers:junit-jupiter:1.17.3'
126+
testImplementation 'org.assertj:assertj-core:3.23.1'
127127
testImplementation "org.slf4j:slf4j-simple:1.7.36"
128128
testImplementation "org.junit.jupiter:junit-jupiter-api:${junitVersion}"
129129
testImplementation "org.junit.jupiter:junit-jupiter-params:${junitVersion}"

gradle.properties

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
elasticsearchVersion = 8.3.2
2-
org.gradle.jvmargs=-Xmx4g
1+
elasticsearchVersion = 8.3.3

src/main/java/org/elasticsearch/plugin/ingest/langdetect/IngestLangDetectPlugin.java

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919

2020
import com.cybozu.labs.langdetect.LangDetectException;
2121
import com.cybozu.labs.langdetect.SecureDetectorFactory;
22-
import com.github.pemistahl.lingua.api.LanguageDetector;
23-
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2422
import org.elasticsearch.ElasticsearchException;
2523
import org.elasticsearch.ingest.Processor;
2624
import org.elasticsearch.plugins.IngestPlugin;
@@ -30,13 +28,9 @@
3028
import java.net.URISyntaxException;
3129
import java.util.HashMap;
3230
import java.util.Map;
33-
import java.util.concurrent.atomic.AtomicReference;
34-
import java.util.function.Supplier;
3531

3632
public class IngestLangDetectPlugin extends Plugin implements IngestPlugin {
3733

38-
private AtomicReference<LanguageDetector> languageDetector = new AtomicReference<>();
39-
4034
@Override
4135
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
4236
try {
@@ -45,32 +39,8 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
4539
throw new ElasticsearchException(e);
4640
}
4741

48-
// this lazy loads the lingua supplier, as it needs crazy amounts of memory, which should only be used, if the user uses
49-
// the lingua implementation in one of the processors
50-
Supplier<LanguageDetector> supplier = () -> {
51-
final LanguageDetector languageDetector = this.languageDetector.get();
52-
if (languageDetector == null) {
53-
final LanguageDetector detector = LanguageDetectorBuilder.fromAllLanguages().withPreloadedLanguageModels().build();
54-
final boolean updatedSuccessfully = this.languageDetector.compareAndSet(null, detector);
55-
if (updatedSuccessfully == false) {
56-
detector.destroy();
57-
}
58-
return this.languageDetector.get();
59-
}
60-
return languageDetector;
61-
};
62-
6342
Map<String, Processor.Factory> factoryMap = new HashMap<>(1);
64-
factoryMap.put(LangDetectProcessor.TYPE, new LangDetectProcessor.Factory(supplier));
43+
factoryMap.put(LangDetectProcessor.TYPE, new LangDetectProcessor.Factory());
6544
return factoryMap;
6645
}
67-
68-
@Override
69-
public void close() throws IOException {
70-
super.close();
71-
final LanguageDetector detector = this.languageDetector.get();
72-
if (detector != null) {
73-
detector.destroy();
74-
}
75-
}
7646
}

src/main/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessor.java

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919

2020
import com.cybozu.labs.langdetect.Detector;
2121
import com.cybozu.labs.langdetect.DetectorFactory;
22-
import com.github.pemistahl.lingua.api.Language;
23-
import com.github.pemistahl.lingua.api.LanguageDetector;
2422
import org.elasticsearch.common.Strings;
2523
import org.elasticsearch.common.unit.ByteSizeUnit;
2624
import org.elasticsearch.common.unit.ByteSizeValue;
@@ -29,13 +27,9 @@
2927
import org.elasticsearch.ingest.IngestDocument;
3028
import org.elasticsearch.ingest.Processor;
3129

32-
import java.util.Locale;
3330
import java.util.Map;
34-
import java.util.function.Supplier;
3531

36-
import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
37-
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalStringProperty;
38-
import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty;
32+
import static org.elasticsearch.ingest.ConfigurationUtils.*;
3933

4034
public class LangDetectProcessor extends AbstractProcessor {
4135

@@ -84,40 +78,22 @@ public String getType() {
8478
public static final class Factory implements Processor.Factory {
8579

8680
private static final ByteSizeValue DEFAULT_MAX_LENGTH = new ByteSizeValue(10, ByteSizeUnit.KB);
87-
private final Supplier<LanguageDetector> languageDetector;
88-
89-
public Factory(Supplier<LanguageDetector> languageDetector) {
90-
this.languageDetector = languageDetector;
91-
}
9281

9382
@Override
9483
public Processor create(Map<String, Processor.Factory> processorFactories, String tag, String description,
9584
Map<String, Object> config) throws Exception {
9685
String field = readStringProperty(TYPE, tag, config, "field");
9786
String targetField = readStringProperty(TYPE, tag, config, "target_field");
9887
String maxLengthStr = readOptionalStringProperty(TYPE, tag, config, "max_length");
99-
String implementation = readOptionalStringProperty(TYPE, tag, config, "implementation");
10088
ByteSizeValue maxLength = ByteSizeValue.parseBytesSizeValue(maxLengthStr, DEFAULT_MAX_LENGTH, "max_length");
10189
boolean ignoreMissing = readBooleanProperty(TYPE, tag, config, "ignore_missing", false);
10290

103-
CheckedFunction<String, String, Exception> langDetector;
104-
if ("lingua".equals(implementation)) {
105-
langDetector = input -> {
106-
if (maxLength != null && input.length() > maxLength.getBytes()) {
107-
input = input.substring(0, Long.valueOf(maxLength.getBytes()).intValue());
108-
}
109-
110-
Language detectedLanguage = languageDetector.get().detectLanguageOf(input);
111-
return detectedLanguage.getIsoCode639_1().name().toLowerCase(Locale.ROOT);
112-
};
113-
} else {
114-
langDetector = input -> {
115-
Detector detector = DetectorFactory.create();
116-
detector.setMaxTextLength(Long.valueOf(maxLength.getBytes()).intValue());
117-
detector.append(input);
118-
return detector.detect();
119-
};
120-
}
91+
CheckedFunction<String, String, Exception> langDetector = input -> {
92+
Detector detector = DetectorFactory.create();
93+
detector.setMaxTextLength(Long.valueOf(maxLength.getBytes()).intValue());
94+
detector.append(input);
95+
return detector.detect();
96+
};
12197

12298
return new LangDetectProcessor(tag, description, field, targetField, ignoreMissing, langDetector);
12399
}

src/test/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorIntegrationTests.java

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111
import co.elastic.clients.transport.rest_client.RestClientTransport;
1212
import org.apache.http.HttpHost;
1313
import org.elasticsearch.client.RestClient;
14-
import org.junit.jupiter.api.AfterAll;
15-
import org.junit.jupiter.api.BeforeAll;
16-
import org.junit.jupiter.api.Disabled;
17-
import org.junit.jupiter.api.Tag;
18-
import org.junit.jupiter.api.Test;
14+
import org.junit.jupiter.api.*;
1915
import org.slf4j.LoggerFactory;
2016
import org.testcontainers.containers.GenericContainer;
2117
import org.testcontainers.containers.output.Slf4jLogConsumer;
@@ -47,7 +43,7 @@ public static void startContainer() {
4743
container.withEnv("xpack.security.enabled", "false");
4844
container.withEnv("ES_JAVA_OPTS", "-Xms4g -Xmx4g");
4945
container.addExposedPorts(9200);
50-
container.setWaitStrategy(new LogMessageWaitStrategy().withRegEx(".*(\"message\":\\s?\"started[\"| ].*|] started\n$)"));
46+
container.setWaitStrategy(new LogMessageWaitStrategy().withRegEx(".*(\"message\":\\s?\"started[\\s?|\"].*|] started\n$)"));
5147

5248
container.start();
5349
container.followOutput(new Slf4jLogConsumer(LoggerFactory.getLogger(LangDetectProcessorIntegrationTests.class)));
@@ -92,13 +88,6 @@ public void testLangDetectProcessorInPipeline() throws Exception {
9288
"field" : "field1",
9389
"target_field" : "field1_language"
9490
}
95-
},
96-
{
97-
"langdetect" : {
98-
"field" : "field1",
99-
"target_field" : "field1_lingua",
100-
"implementation" : "lingua"
101-
}
10291
}
10392
]
10493
}
@@ -122,6 +111,5 @@ public void testLangDetectProcessorInPipeline() throws Exception {
122111
GetResponse<Map> getResponse = client.get(b -> b.index("test").id("1"), Map.class);
123112
Map<String, Object> source = getResponse.source();
124113
assertThat(source).containsEntry("field1_language", "en");
125-
assertThat(source).containsEntry("field1_lingua", "en");
126114
}
127115
}

src/test/java/org/elasticsearch/plugin/ingest/langdetect/LangDetectProcessorTests.java

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,10 @@
1919

2020
import com.cybozu.labs.langdetect.LangDetectException;
2121
import com.cybozu.labs.langdetect.SecureDetectorFactory;
22-
import com.github.pemistahl.lingua.api.Language;
23-
import com.github.pemistahl.lingua.api.LanguageDetector;
24-
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
2522
import org.elasticsearch.common.settings.Settings;
2623
import org.elasticsearch.env.Environment;
2724
import org.elasticsearch.ingest.IngestDocument;
2825
import org.elasticsearch.ingest.Processor;
29-
import org.junit.jupiter.api.AfterAll;
3026
import org.junit.jupiter.api.BeforeAll;
3127
import org.junit.jupiter.api.Test;
3228
import org.junit.jupiter.api.io.TempDir;
@@ -41,8 +37,6 @@
4137

4238
public class LangDetectProcessorTests {
4339

44-
private static LanguageDetector languageDetector;
45-
4640
@TempDir
4741
public static Path folder;
4842

@@ -51,14 +45,6 @@ public static void loadProfiles() throws Exception {
5145
Settings settings = Settings.builder().put("path.home", folder).build();
5246
Environment environment = new Environment(settings, folder);
5347
SecureDetectorFactory.loadProfileFromClassPath(environment);
54-
55-
// instead of loading all languages, reduce this to the minimum to keep the test fast!
56-
languageDetector = LanguageDetectorBuilder.fromLanguages(Language.ENGLISH, Language.GERMAN).build();
57-
}
58-
59-
@AfterAll
60-
public static void stopLanguageDetector() {
61-
languageDetector.destroy();
6248
}
6349

6450
@Test
@@ -69,16 +55,6 @@ public void testThatProcessorWorks() throws Exception {
6955
assertThat(data).containsEntry("language", "en");
7056
}
7157

72-
@Test
73-
public void testThatLinguaImplementationWorks() throws Exception {
74-
final Map<String, Object> config = config("source_field", "language", false);
75-
config.put("implementation", "lingua");
76-
Map<String, Object> data = ingestDocument(config,
77-
"source_field", "This is hopefully an english text, that will be detected.");
78-
79-
assertThat(data).containsEntry("language", "en");
80-
}
81-
8258
@Test
8359
public void testMaxLengthConfiguration() throws Exception {
8460
Map<String, Object> config = config("source_field", "language", false);
@@ -126,7 +102,7 @@ private Map<String, Object> ingestDocument(Map<String, Object> config, String fi
126102
document.put(field, value);
127103
IngestDocument ingestDocument = new IngestDocument(document, Collections.emptyMap());
128104

129-
Processor processor = new LangDetectProcessor.Factory(() -> languageDetector)
105+
Processor processor = new LangDetectProcessor.Factory()
130106
.create(Collections.emptyMap(), "my-tag", "desc", config);
131107
return processor.execute(ingestDocument).getSourceAndMetadata();
132108
}

0 commit comments

Comments
 (0)