Skip to content

Commit 78b33b9

Browse files
author
Nito
committed
Main functions now return objects
1 parent de0d471 commit 78b33b9

File tree

4 files changed

+383
-0
lines changed

4 files changed

+383
-0
lines changed

eld/languageDetector.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
import regex as re
19+
20+
from .languageData import languageData
21+
from .languageSubset import LanguageSubset
22+
from .languageResult import LanguageResult
23+
24+
25+
class LanguageDetector(LanguageSubset):
    """Detects the language of UTF-8 text via byte-ngram scoring."""

    def __init__(self, subset_file=''):
        super().__init__()
        languageData.load_ngrams(subset_file)
        # When True, detect() strips URLs/emails/codes before scoring.
        self.__do_clean_text = False
        self.VERSION = '1.0.3'  # Has to match setup.py version

    def detect(self, text):
        """Return the language detected for a given UTF-8 string, as an ISO 639-1 code.

        LanguageResult object { language = 'es', scores() = {'es': 0.5, 'et': 0.2}, is_reliable() = True }
        LanguageResult object { language = None|str, scores() = None|dict, is_reliable() = bool }
        """
        if self.__do_clean_text:
            # Removes Urls, emails, alphanumerical & numbers
            text = get_clean_txt(text)
        normalized = _normalize_text(text)
        ngrams = _get_byte_ngrams(normalized)
        total_ngrams = len(ngrams)
        scores = _calculate_scores(ngrams, total_ngrams)

        if not scores:
            return LanguageResult()
        if self.subset:
            scores = self._filter_lang_subset(scores)
        scores.sort(key=lambda entry: entry[1], reverse=True)
        return LanguageResult(scores, total_ngrams)

    def clean_text(self, set_bool):
        """Enable or disable text cleaning before detection."""
        self.__do_clean_text = bool(set_bool)
55+
56+
57+
def _tokenizer(txt):
58+
return filter(None, re.split(b'\x20', txt))
59+
60+
61+
# Removes parts of a string that may be considered "noise" for language detection
def get_clean_txt(txt):
    """Replace URLs, emails, .com domains and alphanumeric codes with spaces."""
    noise_patterns = (
        # URLs (http/https, with or without www)
        (r'[hw]((ttps?://(www\.)?)|ww\.)([^\s/?\.#-]+\.?)+(\/\S*)?', re.IGNORECASE),
        # email addresses
        (r'[a-zA-Z0-9.!$%&?+_`-]+@[A-Za-z0-9.-]+\.[A-Za-z0-9-]{2,64}', 0),
        # bare .com domains (\pL is a `regex`-module Unicode property escape)
        (r'([A-Za-z0-9-]+\.)+com(\/\S*|[^\pL])', 0),
        # alphanumerical/number codes
        (r'[a-zA-Z]*[0-9]+[a-zA-Z0-9]*', 0),
    )
    for pattern, flags in noise_patterns:
        txt = re.sub(pattern, ' ', txt, flags=flags)
    return txt
72+
73+
74+
def _normalize_text(text):
    """Normalize text for ngram extraction.

    Collapses runs of non-letter characters (keeping apostrophe-like marks
    ' ` \u2019 inside words) into single spaces, lowercases, encodes to UTF-8
    bytes, and caps the result at ~350-380 bytes on a word boundary.
    """
    # Normalize special characters/word separators; only the first 1000
    # characters are considered. \pL requires the `regex` module.
    text = re.sub(r'[^\pL]+(?<![\x27\x60\x2019])', ' ', text[:1000], flags=re.UNICODE).strip()
    text = text.lower()
    text = bytes(text, 'utf-8')

    if len(text) > 350:
        # Cut to the first whitespace after byte offset 350, capped at 380.
        # BUG FIX: bytes.find() returns -1 (truthy) when no space exists, so
        # the previous `find(...) or 350` fallback never triggered and
        # text[0:-1] silently dropped the final byte. Fall back to 350
        # explicitly instead.
        cut = text.find(b'\x20', 350)
        text = text[0:min(380, cut if cut != -1 else 350)]
    return text
85+
86+
87+
# Calculate scores for each language from the given Ngrams
def _calculate_scores(txt_ngrams, num_ngrams):
    """Score every language against the extracted ngram frequencies.

    Returns a list of [language_id, score] pairs (unsorted), one per
    language that matched at least one ngram.
    """
    lang_score = languageData.lang_score[:]
    for ngram_bytes, frequency in txt_ngrams.items():
        lang_frequencies = languageData.ngrams.get(ngram_bytes)
        if lang_frequencies is None:
            continue
        lang_count = len(lang_frequencies)
        # Ngram score multiplier: the fewer languages found, the more
        # relevancy. Formula can be fine-tuned.
        if lang_count == 1:
            relevancy = 27
        elif lang_count < 16:
            relevancy = (16 - lang_count) / 2 + 1
        else:
            relevancy = 1

        # Most time-consuming loop, do only the strictly necessary inside
        for lang, global_frequency in lang_frequencies.items():
            if frequency > global_frequency:
                ratio = global_frequency / frequency
            else:
                ratio = frequency / global_frequency
            lang_score[lang] += ratio * relevancy + 2

    # This divisor will produce a final score between 0 - ~1 (score could
    # exceed 1). Can be improved.
    result_divisor = num_ngrams * 3.2
    return [[lang, score / result_divisor]
            for lang, score in enumerate(lang_score) if score]
112+
113+
114+
# Gets Ngrams from a given string.
def _get_byte_ngrams(txt):
    """Extract overlapping 4-byte ngrams (stride 3) per word from a bytes
    string and return {ngram: normalized_frequency}."""
    byte_grams = {}
    count_ngrams = 0

    for word in _tokenizer(txt):
        length = min(len(word), 70)

        has_body = False
        for offset in range(0, length - 4, 3):
            gram = (b' ' if offset == 0 else b'') + word[offset:offset + 4]
            byte_grams[gram] = byte_grams.get(gram, 0) + 1
            count_ngrams += 1
            has_body = True

        # Trailing gram, space-terminated (space-prefixed too for short words).
        # NOTE(review): for words longer than 70 bytes word[start:] extends
        # past the 70-byte cap to the end of the word — confirm intended.
        start = 0 if length == 3 else length - 4
        tail = (b'' if has_body else b' ') + word[start:] + b' '
        byte_grams[tail] = byte_grams.get(tail, 0) + 1
        count_ngrams += 1

    # Frequency is multiplied by 15000 at the ngrams database. A reduced
    # number seems to work better. Linear formulas were tried, decreasing the
    # multiplier for fewer ngram strings, with no meaningful improvement.
    for gram, occurrences in byte_grams.items():
        byte_grams[gram] = occurrences / count_ngrams * 13200

    return byte_grams

eld/languageResult.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
import json
18+
from .languageData import languageData
19+
20+
21+
class LanguageResult:
    """Detection outcome: top ISO 639-1 language plus score details."""

    def __init__(self, results=None, num_ngrams=None):
        # results: list of [language_id, score], sorted best-first, or None.
        self.language = languageData.lang_codes[results[0][0]] if results else None
        self.__results = results
        self.__num_ngrams = num_ngrams

    def __str__(self):
        summary = {
            'language': self.language,
            'scores()': self.scores(),
            'is_reliable()': self.is_reliable(),
        }
        return json.dumps({'<object>': summary})

    def scores(self):
        """Return {iso_code: score} for every scored language."""
        return _get_scores(self.__results)

    def is_reliable(self):
        """Heuristic reliability check for the top detected language."""
        if not self.language or self.__num_ngrams < 3 or not self.__results:
            return False
        top_score = self.__results[0][1]
        runner_up = self.__results[1][1] if len(self.__results) > 1 else 0
        # Require a minimum of 24% of the language's average score...
        if languageData.avg_score[self.language] * 0.24 > (top_score / self.__num_ngrams):
            return False
        # ...and a gap of at least 0.01 to the runner-up.
        if abs(top_score - runner_up) < 0.01:
            return False
        return True
47+
48+
49+
def _get_scores(results):
50+
scores = {}
51+
if results:
52+
for value in results:
53+
scores[languageData.lang_codes[value[0]]] = value[1]
54+
return scores

eld/languageSubset.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
import hashlib
19+
import os
20+
import copy
21+
import importlib.util
22+
import logging
23+
24+
from .languageData import languageData
25+
from .subsetResult import SubsetResult
26+
27+
28+
class LanguageSubset:
    """Mixin providing language-subset support for the detector.

    Two mechanisms:
      - dynamic_lang_subset(): keeps the full ngrams database and filters
        the detection scores afterwards (via _filter_lang_subset()).
      - lang_subset(): removes excluded languages from the in-memory ngrams
        database itself, optionally caching the reduced database to disk.
    """

    def __init__(self):
        # Language IDs allowed by the *dynamic* subset (None = inactive).
        self.subset = None
        # Backup (deep copy) of the full ngrams database, taken before
        # lang_subset() mutates languageData.ngrams, so it can be restored.
        self.defaultNgrams = None
        # Identifier (hash) of the ngrams subset currently loaded, or None.
        self.loadedSubset = None

    # When active, detect() will filter the languages not included at 'subset',
    # from the scores, with _filter_lang_subset().
    # Call dynamic_lang_subset(None) to deactivate.
    def dynamic_lang_subset(self, languages):
        """Activate/deactivate a dynamic language subset; returns SubsetResult."""
        self.subset = None
        if languages:
            self.subset = _make_subset(languages)
            if self.subset is None:
                # None of the requested languages exist in the database.
                return SubsetResult(False, None, 'No language matched this set')
        return SubsetResult(True, _iso_languages(self.subset) if self.subset else None)

    # Sets a subset and removes the excluded languages from the ngrams database.
    # If the 'save' option is True, the new ngrams subset is stored on disk
    # and cached for next time.
    def lang_subset(self, languages, save=True):
        """Restrict the ngrams database to 'languages'; returns SubsetResult.

        Pass a falsy value to restore the full database.
        """
        if not languages:
            # Disable the subset, restoring the full database if one was loaded.
            if self.loadedSubset and self.defaultNgrams:
                languageData.ngrams = copy.deepcopy(self.defaultNgrams)
                self.loadedSubset = None
            return SubsetResult(True)  # if there was already no subset to disable, it also is successful

        lang_array = _make_subset(languages)
        if not lang_array:
            return SubsetResult(False, None, 'No language matched this set')

        if self.defaultNgrams is None:
            # First subset request: keep a pristine copy of the full database.
            self.defaultNgrams = copy.deepcopy(languageData.ngrams)

        # Deterministic cache-file name derived from the subset's language IDs.
        langs_str = [str(lang) for lang in lang_array]
        new_subset = base16_to_base36(
            hashlib.sha1(','.join(langs_str).encode()).hexdigest()
        )
        file_name = 'ngrams' + languageData.type + '-' + str(len(lang_array)) + '_' + new_subset
        file_path = languageData.folder + 'subset/' + file_name + '.py'

        if self.loadedSubset != new_subset:
            self.loadedSubset = new_subset

            if os.path.exists(file_path):
                # A cached subset file exists: execute it as a Python module
                # and use its ngrams directly.
                spec = importlib.util.spec_from_file_location(file_name, file_path)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                languageData.ngrams = module.ngrams_data['ngrams']
                if languageData.ngrams:
                    return SubsetResult(True, _iso_languages(lang_array), None, file_path)

            # No (valid) cache: start again from the full database copy.
            if self.defaultNgrams != languageData.ngrams:
                languageData.ngrams = copy.deepcopy(self.defaultNgrams)

            # Strip every language not in the subset; drop ngrams left empty.
            # Iterates the backup copy so the live dict can be mutated safely.
            for ngram, langsID in self.defaultNgrams.items():
                for lid, value in langsID.items():
                    if lid not in lang_array:
                        del languageData.ngrams[ngram][lid]
                        if not languageData.ngrams[ngram]:
                            del languageData.ngrams[ngram]

        saved = False
        if save:
            saved = _save_ngrams(file_path, lang_array)

        return SubsetResult(True, _iso_languages(lang_array), None, (file_name if saved else None))

    # Filters languages not included in the subset, from the result scores
    def _filter_lang_subset(self, scores):
        """Return only the score entries whose language ID is in self.subset."""
        sub_results = []
        for score in scores:
            if score[0] in self.subset:
                sub_results.append(score)
        return sub_results
103+
104+
105+
def _ngram_export(data):
106+
if isinstance(data, dict):
107+
to_implode = []
108+
for key, value in data.items():
109+
to_implode.append(repr(key) + ':' + _ngram_export(value))
110+
code = '{' + ','.join(to_implode) + '}'
111+
return code
112+
else:
113+
return repr(data)
114+
115+
116+
def _save_ngrams(file_path, lang_array):
    """Write the current (subset) ngrams database to file_path as a Python
    module. Returns True on success or if the file already exists."""
    if os.path.exists(file_path):
        # in case self.loadedSubset != new_subset, and was previously saved
        return True
    try:
        with open(file_path, 'w') as f:
            f.write(
                '# Copyright 2023 Nito T.M. [ Apache 2.0 Licence https://www.apache.org/licenses/LICENSE-2.0 ]\n'
                'ngrams_data = {\n'
                f' "type": "{languageData.type}",\n'
                f' "languages": {_iso_languages(lang_array)},\n'
                ' "is_subset": True,\n'
                f' "ngrams": {_ngram_export(languageData.ngrams)}\n'
                '}'
            )
    except Exception as e:
        # Best-effort cache write: log and report failure, don't raise.
        logging.exception(e)
        return False
    return True
132+
133+
134+
def _make_subset(languages):
    """Map ISO 639-1 codes to sorted database language IDs; None if no match."""
    id_by_code = {code: lang_id for lang_id, code in languageData.lang_codes.items()}
    subset = sorted(id_by_code[code] for code in (languages or ())
                    if code in id_by_code)
    return subset or None
144+
145+
146+
# Converts ngram database language indexes (integer) to ISO 639-1 code
147+
def _iso_languages(lang_set):
148+
lang_codes = {}
149+
for lang_id in lang_set:
150+
lang_codes[lang_id] = languageData.lang_codes[lang_id]
151+
return lang_codes
152+
153+
154+
def base16_to_base36(hex_string):
    """Convert a hexadecimal string to its lowercase base-36 representation.

    Args:
        hex_string: hexadecimal digits (e.g. a sha1 hexdigest).
    Returns:
        The value rendered in base 36 using digits 0-9a-z.

    Fix: the previous loop returned '' for a zero input (the while body
    never ran); zero now correctly yields '0'. Backward compatible here,
    since sha1 hexdigests are effectively never zero.
    """
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    integer_value = int(hex_string, 16)
    if integer_value == 0:
        return '0'

    chunks = []
    while integer_value > 0:
        integer_value, remainder = divmod(integer_value, 36)
        chunks.append(digits[remainder])
    return ''.join(reversed(chunks))

eld/subsetResult.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
19+
class SubsetResult:
    """Outcome of a subset operation.

    Attributes:
        success: True when the operation succeeded.
        languages: list of ISO 639-1 codes in the subset, or None.
        error: error message, or None.
        file: name/path of the saved ngrams subset file, or None.
    """

    def __init__(self, success, languages=None, error=None, file=None):
        self.success = success
        self.error = error
        self.file = file
        self.languages = None
        if languages:
            self.languages = list(languages.values())

0 commit comments

Comments
 (0)