Skip to content

Commit 78b33b9

Browse files
author
Nito
committed
Main functions now return objects
1 parent de0d471 commit 78b33b9

File tree

4 files changed

+383
-0
lines changed

4 files changed

+383
-0
lines changed

eld/languageDetector.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
import regex as re
19+
20+
from .languageData import languageData
21+
from .languageSubset import LanguageSubset
22+
from .languageResult import LanguageResult
23+
24+
25+
class LanguageDetector(LanguageSubset):
    """Detects the language of UTF-8 text via byte-ngram scoring."""

    def __init__(self, subset_file=''):
        super().__init__()
        languageData.load_ngrams(subset_file)
        # When True, detect() strips URLs/emails/codes before scoring.
        self.__do_clean_text = False
        self.VERSION = '1.0.3'  # Has to match setup.py version

    def detect(self, text):
        """Return the language detected for a given UTF-8 string, as an ISO 639-1 code.

        LanguageResult object { language = 'es', scores() = {'es': 0.5, 'et': 0.2}, is_reliable() = True }
        LanguageResult object { language = None|str, scores() = None|dict, is_reliable() = bool }
        """
        if self.__do_clean_text:
            # Removes Urls, emails, alphanumerical & numbers
            text = get_clean_txt(text)
        normalized = _normalize_text(text)
        ngrams = _get_byte_ngrams(normalized)
        total_ngrams = len(ngrams)
        scores = _calculate_scores(ngrams, total_ngrams)

        if not scores:
            return LanguageResult()
        if self.subset:
            scores = self._filter_lang_subset(scores)
        scores.sort(key=lambda entry: entry[1], reverse=True)
        return LanguageResult(scores, total_ngrams)

    def clean_text(self, set_bool):
        """Enable or disable text cleaning before detection."""
        self.__do_clean_text = bool(set_bool)
55+
56+
57+
def _tokenizer(txt):
58+
return filter(None, re.split(b'\x20', txt))
59+
60+
61+
# Removes parts of a string that may be considered "noise" for language detection
def get_clean_txt(txt):
    """Replace URLs, emails, .com domains and alphanumeric codes with spaces."""
    noise_patterns = (
        # URLs (http/https, with or without www)
        (r'[hw]((ttps?://(www\.)?)|ww\.)([^\s/?\.#-]+\.?)+(\/\S*)?', re.IGNORECASE),
        # email addresses
        (r'[a-zA-Z0-9.!$%&?+_`-]+@[A-Za-z0-9.-]+\.[A-Za-z0-9-]{2,64}', 0),
        # bare .com domains (\pL is a `regex`-module Unicode property escape)
        (r'([A-Za-z0-9-]+\.)+com(\/\S*|[^\pL])', 0),
        # alphanumerical/number codes
        (r'[a-zA-Z]*[0-9]+[a-zA-Z0-9]*', 0),
    )
    for pattern, flags in noise_patterns:
        txt = re.sub(pattern, ' ', txt, flags=flags)
    return txt
72+
73+
74+
def _normalize_text(text):
    """Normalize text for ngram extraction.

    Collapses runs of non-letter characters (keeping apostrophe-like marks
    ' ` \u2019 inside words) into single spaces, lowercases, encodes to UTF-8
    bytes, and caps the result at ~350-380 bytes on a word boundary.
    """
    # Normalize special characters/word separators; only the first 1000
    # characters are considered. \pL requires the `regex` module.
    text = re.sub(r'[^\pL]+(?<![\x27\x60\x2019])', ' ', text[:1000], flags=re.UNICODE).strip()
    text = text.lower()
    text = bytes(text, 'utf-8')

    if len(text) > 350:
        # Cut to the first whitespace after byte offset 350, capped at 380.
        # BUG FIX: bytes.find() returns -1 (truthy) when no space exists, so
        # the previous `find(...) or 350` fallback never triggered and
        # text[0:-1] silently dropped the final byte. Fall back to 350
        # explicitly instead.
        cut = text.find(b'\x20', 350)
        text = text[0:min(380, cut if cut != -1 else 350)]
    return text
85+
86+
87+
# Calculate scores for each language from the given Ngrams
def _calculate_scores(txt_ngrams, num_ngrams):
    """Score every language against the extracted ngram frequencies.

    Returns a list of [language_id, score] pairs (unsorted), one per
    language that matched at least one ngram.
    """
    lang_score = languageData.lang_score[:]
    for ngram_bytes, frequency in txt_ngrams.items():
        lang_frequencies = languageData.ngrams.get(ngram_bytes)
        if lang_frequencies is None:
            continue
        lang_count = len(lang_frequencies)
        # Ngram score multiplier: the fewer languages found, the more
        # relevancy. Formula can be fine-tuned.
        if lang_count == 1:
            relevancy = 27
        elif lang_count < 16:
            relevancy = (16 - lang_count) / 2 + 1
        else:
            relevancy = 1

        # Most time-consuming loop, do only the strictly necessary inside
        for lang, global_frequency in lang_frequencies.items():
            if frequency > global_frequency:
                ratio = global_frequency / frequency
            else:
                ratio = frequency / global_frequency
            lang_score[lang] += ratio * relevancy + 2

    # This divisor will produce a final score between 0 - ~1 (score could
    # exceed 1). Can be improved.
    result_divisor = num_ngrams * 3.2
    return [[lang, score / result_divisor]
            for lang, score in enumerate(lang_score) if score]
112+
113+
114+
# Gets Ngrams from a given string.
def _get_byte_ngrams(txt):
    """Extract overlapping 4-byte ngrams (stride 3) per word from a bytes
    string and return {ngram: normalized_frequency}."""
    byte_grams = {}
    count_ngrams = 0

    for word in _tokenizer(txt):
        length = min(len(word), 70)

        has_body = False
        for offset in range(0, length - 4, 3):
            gram = (b' ' if offset == 0 else b'') + word[offset:offset + 4]
            byte_grams[gram] = byte_grams.get(gram, 0) + 1
            count_ngrams += 1
            has_body = True

        # Trailing gram, space-terminated (space-prefixed too for short words).
        # NOTE(review): for words longer than 70 bytes word[start:] extends
        # past the 70-byte cap to the end of the word — confirm intended.
        start = 0 if length == 3 else length - 4
        tail = (b'' if has_body else b' ') + word[start:] + b' '
        byte_grams[tail] = byte_grams.get(tail, 0) + 1
        count_ngrams += 1

    # Frequency is multiplied by 15000 at the ngrams database. A reduced
    # number seems to work better. Linear formulas were tried, decreasing the
    # multiplier for fewer ngram strings, with no meaningful improvement.
    for gram, occurrences in byte_grams.items():
        byte_grams[gram] = occurrences / count_ngrams * 13200

    return byte_grams

eld/languageResult.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
import json
18+
from .languageData import languageData
19+
20+
21+
class LanguageResult:
    """Detection outcome: top ISO 639-1 language plus score details."""

    def __init__(self, results=None, num_ngrams=None):
        # results: list of [language_id, score], sorted best-first, or None.
        self.language = languageData.lang_codes[results[0][0]] if results else None
        self.__results = results
        self.__num_ngrams = num_ngrams

    def __str__(self):
        summary = {
            'language': self.language,
            'scores()': self.scores(),
            'is_reliable()': self.is_reliable(),
        }
        return json.dumps({'<object>': summary})

    def scores(self):
        """Return {iso_code: score} for every scored language."""
        return _get_scores(self.__results)

    def is_reliable(self):
        """Heuristic reliability check for the top detected language."""
        if not self.language or self.__num_ngrams < 3 or not self.__results:
            return False
        top_score = self.__results[0][1]
        runner_up = self.__results[1][1] if len(self.__results) > 1 else 0
        # Require a minimum of 24% of the language's average score...
        if languageData.avg_score[self.language] * 0.24 > (top_score / self.__num_ngrams):
            return False
        # ...and a gap of at least 0.01 to the runner-up.
        if abs(top_score - runner_up) < 0.01:
            return False
        return True
47+
48+
49+
def _get_scores(results):
50+
scores = {}
51+
if results:
52+
for value in results:
53+
scores[languageData.lang_codes[value[0]]] = value[1]
54+
return scores

eld/languageSubset.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
import hashlib
19+
import os
20+
import copy
21+
import importlib.util
22+
import logging
23+
24+
from .languageData import languageData
25+
from .subsetResult import SubsetResult
26+
27+
28+
class LanguageSubset:
    """Mixin providing language-subset support for the detector.

    Two mechanisms:
      - dynamic_lang_subset(): keeps the full ngrams database and filters
        the detection scores afterwards (via _filter_lang_subset()).
      - lang_subset(): removes excluded languages from the in-memory ngrams
        database itself, optionally caching the reduced database to disk.
    """

    def __init__(self):
        # Language IDs allowed by the *dynamic* subset (None = inactive).
        self.subset = None
        # Backup (deep copy) of the full ngrams database, taken before
        # lang_subset() mutates languageData.ngrams, so it can be restored.
        self.defaultNgrams = None
        # Identifier (hash) of the ngrams subset currently loaded, or None.
        self.loadedSubset = None

    # When active, detect() will filter the languages not included at 'subset',
    # from the scores, with _filter_lang_subset().
    # Call dynamic_lang_subset(None) to deactivate.
    def dynamic_lang_subset(self, languages):
        """Activate/deactivate a dynamic language subset; returns SubsetResult."""
        self.subset = None
        if languages:
            self.subset = _make_subset(languages)
            if self.subset is None:
                # None of the requested languages exist in the database.
                return SubsetResult(False, None, 'No language matched this set')
        return SubsetResult(True, _iso_languages(self.subset) if self.subset else None)

    # Sets a subset and removes the excluded languages from the ngrams database.
    # If the 'save' option is True, the new ngrams subset is stored on disk
    # and cached for next time.
    def lang_subset(self, languages, save=True):
        """Restrict the ngrams database to 'languages'; returns SubsetResult.

        Pass a falsy value to restore the full database.
        """
        if not languages:
            # Disable the subset, restoring the full database if one was loaded.
            if self.loadedSubset and self.defaultNgrams:
                languageData.ngrams = copy.deepcopy(self.defaultNgrams)
                self.loadedSubset = None
            return SubsetResult(True)  # if there was already no subset to disable, it also is successful

        lang_array = _make_subset(languages)
        if not lang_array:
            return SubsetResult(False, None, 'No language matched this set')

        if self.defaultNgrams is None:
            # First subset request: keep a pristine copy of the full database.
            self.defaultNgrams = copy.deepcopy(languageData.ngrams)

        # Deterministic cache-file name derived from the subset's language IDs.
        langs_str = [str(lang) for lang in lang_array]
        new_subset = base16_to_base36(
            hashlib.sha1(','.join(langs_str).encode()).hexdigest()
        )
        file_name = 'ngrams' + languageData.type + '-' + str(len(lang_array)) + '_' + new_subset
        file_path = languageData.folder + 'subset/' + file_name + '.py'

        if self.loadedSubset != new_subset:
            self.loadedSubset = new_subset

            if os.path.exists(file_path):
                # A cached subset file exists: execute it as a Python module
                # and use its ngrams directly.
                spec = importlib.util.spec_from_file_location(file_name, file_path)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                languageData.ngrams = module.ngrams_data['ngrams']
                if languageData.ngrams:
                    return SubsetResult(True, _iso_languages(lang_array), None, file_path)

            # No (valid) cache: start again from the full database copy.
            if self.defaultNgrams != languageData.ngrams:
                languageData.ngrams = copy.deepcopy(self.defaultNgrams)

            # Strip every language not in the subset; drop ngrams left empty.
            # Iterates the backup copy so the live dict can be mutated safely.
            for ngram, langsID in self.defaultNgrams.items():
                for lid, value in langsID.items():
                    if lid not in lang_array:
                        del languageData.ngrams[ngram][lid]
                        if not languageData.ngrams[ngram]:
                            del languageData.ngrams[ngram]

        saved = False
        if save:
            saved = _save_ngrams(file_path, lang_array)

        return SubsetResult(True, _iso_languages(lang_array), None, (file_name if saved else None))

    # Filters languages not included in the subset, from the result scores
    def _filter_lang_subset(self, scores):
        """Return only the score entries whose language ID is in self.subset."""
        sub_results = []
        for score in scores:
            if score[0] in self.subset:
                sub_results.append(score)
        return sub_results
103+
104+
105+
def _ngram_export(data):
106+
if isinstance(data, dict):
107+
to_implode = []
108+
for key, value in data.items():
109+
to_implode.append(repr(key) + ':' + _ngram_export(value))
110+
code = '{' + ','.join(to_implode) + '}'
111+
return code
112+
else:
113+
return repr(data)
114+
115+
116+
def _save_ngrams(file_path, lang_array):
    """Write the current (subset) ngrams database to file_path as a Python
    module. Returns True on success or if the file already exists."""
    if os.path.exists(file_path):
        # in case self.loadedSubset != new_subset, and was previously saved
        return True
    try:
        with open(file_path, 'w') as f:
            f.write(
                '# Copyright 2023 Nito T.M. [ Apache 2.0 Licence https://www.apache.org/licenses/LICENSE-2.0 ]\n'
                'ngrams_data = {\n'
                f' "type": "{languageData.type}",\n'
                f' "languages": {_iso_languages(lang_array)},\n'
                ' "is_subset": True,\n'
                f' "ngrams": {_ngram_export(languageData.ngrams)}\n'
                '}'
            )
    except Exception as e:
        # Best-effort cache write: log and report failure, don't raise.
        logging.exception(e)
        return False
    return True
132+
133+
134+
def _make_subset(languages):
    """Map ISO 639-1 codes to sorted database language IDs; None if no match."""
    id_by_code = {code: lang_id for lang_id, code in languageData.lang_codes.items()}
    subset = sorted(id_by_code[code] for code in (languages or ())
                    if code in id_by_code)
    return subset or None
144+
145+
146+
# Converts ngram database language indexes (integer) to ISO 639-1 code
147+
def _iso_languages(lang_set):
148+
lang_codes = {}
149+
for lang_id in lang_set:
150+
lang_codes[lang_id] = languageData.lang_codes[lang_id]
151+
return lang_codes
152+
153+
154+
def base16_to_base36(hex_string):
    """Convert a hexadecimal string to its lowercase base-36 representation.

    Args:
        hex_string: hexadecimal digits (e.g. a sha1 hexdigest).
    Returns:
        The value rendered in base 36 using digits 0-9a-z.

    Fix: the previous loop returned '' for a zero input (the while body
    never ran); zero now correctly yields '0'. Backward compatible here,
    since sha1 hexdigests are effectively never zero.
    """
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    integer_value = int(hex_string, 16)
    if integer_value == 0:
        return '0'

    chunks = []
    while integer_value > 0:
        integer_value, remainder = divmod(integer_value, 36)
        chunks.append(digits[remainder])
    return ''.join(reversed(chunks))

eld/subsetResult.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""
2+
Copyright 2023 Nito T.M.
3+
Author URL: https://github.com/nitotm
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
19+
class SubsetResult:
    """Outcome of a subset operation.

    Attributes:
        success: True when the operation succeeded.
        languages: list of ISO 639-1 codes in the subset, or None.
        error: error message, or None.
        file: name/path of the saved ngrams subset file, or None.
    """

    def __init__(self, success, languages=None, error=None, file=None):
        self.success = success
        self.error = error
        self.file = file
        self.languages = None
        if languages:
            self.languages = list(languages.values())

0 commit comments

Comments
 (0)