
Commit de0d471

Folders restructure

Author: Nito (committed)
1 parent b54f877 commit de0d471

File tree: 6 files changed, 260 additions & 0 deletions


.gitignore

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
__pycache__/
*.pyc
/.idea/
build/
dist/
*.egg-info/
*.egg

eld/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
from .languageDetector import LanguageDetector
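For context, a minimal usage sketch of the exported class, based only on the API exercised by the test files in this commit (detect(), .language, .scores(), .is_reliable(), return_scores); the sample text is illustrative:

from eld import LanguageDetector

detector = LanguageDetector()   # the default load uses the ngramsM60 database
detector.return_scores = True   # as in test_get_multiple_scores

result = detector.detect('Hola, cómo te llamas?')
print(result.language)          # e.g. 'es'
print(result.is_reliable())     # confidence check used by test_check_confidence
print(result.scores())          # scores for more than one candidate language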

eld/languageData.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
"""
Copyright 2023 Nito T.M.
Author URL: https://github.com/nitotm

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import importlib.util
import os


class LanguageData:
    def __init__(self):
        from .resources.avg_score import avg_score
        self.avg_score = avg_score
        self.ngrams = {}
        self.lang_score = []
        self.lang_codes = {}
        self.type = ''
        self.folder = os.path.dirname(__file__) + '/resources/ngrams/'

    """
    ISO 639-1 codes, for the 60 languages set.
    ['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu',
     'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl',
     'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur',
     'vi', 'yo', 'zh']

    ['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish',
     'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati', 'Hebrew',
     'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian', 'Kannada', 'Korean',
     'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)', 'Dutch', 'Norwegian',
     'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene', 'Albanian',
     'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian', 'Urdu', 'Vietnamese',
     'Yoruba', 'Chinese']
    """

    def load_ngrams(self, subset_file=''):
        if subset_file == '':
            from .resources.ngrams.ngramsM60 import ngrams_data
        else:
            # module = importlib.import_module('.ngrams.' + subset_file)
            file_path = self.folder + subset_file + '.py'
            if not os.path.exists(file_path):
                file_path = self.folder + 'subset/' + subset_file + '.py'
            spec = importlib.util.spec_from_file_location(subset_file, file_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            ngrams_data = module.ngrams_data

        self.ngrams = ngrams_data['ngrams']
        self.lang_score = [0] * (max(ngrams_data['languages'].keys()) + 1)
        self.type = ngrams_data['type']
        self.lang_codes = ngrams_data['languages']


languageData = LanguageData()
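A minimal sketch of using this module directly, assuming the bundled ngramsM60 resource (not part of this commit's diff) is installed alongside the package:

from eld.languageData import languageData

# Default load: imports eld/resources/ngrams/ngramsM60.py
languageData.load_ngrams()
print(languageData.type)             # n-gram database type reported by the data file
print(len(languageData.lang_codes))  # 60 languages in the full set
print(len(languageData.lang_score))  # one score slot per language id

# A named file is resolved under resources/ngrams/, falling back to resources/ngrams/subset/
# languageData.load_ngrams('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')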

eld/resources/avg_score.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
"""
Copyright 2023 Nito T.M.
Author URL: https://github.com/nitotm

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

avg_score = {'am': 0.0661, 'ar': 0.0237, 'az': 0.0269, 'be': 0.0227, 'bg': 0.0234, 'bn': 0.1373, 'ca': 0.0246,
             'cs': 0.0242, 'da': 0.0277, 'de': 0.0275, 'el': 0.0369, 'en': 0.0378, 'es': 0.0252, 'et': 0.0253,
             'eu': 0.0369, 'fa': 0.0213, 'fi': 0.026, 'fr': 0.0253, 'gu': 0.1197, 'he': 0.0402, 'hi': 0.0578,
             'hr': 0.0201, 'hu': 0.0208, 'hy': 0.0439, 'is': 0.032, 'it': 0.0251, 'ja': 0.0375, 'ka': 0.1383,
             'kn': 0.1305, 'ko': 0.0222, 'ku': 0.0256, 'lo': 0.3488, 'lt': 0.0246, 'lv': 0.0264, 'ml': 0.1322,
             'mr': 0.0571, 'ms': 0.0251, 'nl': 0.0342, 'no': 0.0266, 'or': 0.1269, 'pa': 0.1338, 'pl': 0.0275,
             'pt': 0.0252, 'ro': 0.0247, 'ru': 0.0184, 'sk': 0.024, 'sl': 0.0253, 'sq': 0.0353, 'sr': 0.0234,
             'sv': 0.033, 'ta': 0.1513, 'te': 0.1547, 'th': 0.0882, 'tl': 0.0368, 'tr': 0.0258, 'uk': 0.0206,
             'ur': 0.0282, 'vi': 0.0467, 'yo': 0.0329, 'zh': 0.0152}

""" Deprecated for now: Some languages score higher with the same amount of text, this multiplier evens it out
for multi-language strings
self.scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1, 0.9,
    1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6,
    0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7]
"""

eld/tests/test_detector.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import unittest
import os
import sys

# Make sure the local package is imported instead of the pip package
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, project_root)  # prioritize the local package
# sys.path.append('../..')

from eld import LanguageDetector
from eld.languageDetector import get_clean_txt


# Mostly functional testing; more unit tests will be added as the functions mature
class TestDetector(unittest.TestCase):
    def test_print_version(self):
        detector = LanguageDetector()
        print(detector.VERSION)

    def test_load_eld(self):
        detector = LanguageDetector()
        self.assertIsInstance(detector, LanguageDetector)

    def test_simple_detect(self):
        detector = LanguageDetector()
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')

    def test_get_multiple_scores(self):
        detector = LanguageDetector()
        detector.return_scores = True
        result = len(detector.detect('Hola, cómo te llamas?').scores())
        message = "Expected: >1 scores"
        self.assertGreater(result, 1, message)

    def test_detect_error_empty_text(self):
        detector = LanguageDetector()
        result = detector.detect('').language
        self.assertEqual(result, None)

    def test_clean_text(self):
        text = "https://www.google.com/\n" \
               "google.com/search?q=search&source=hp\n" \
               "12345 A12345\n"
        result = get_clean_txt(text).strip()
        self.assertEqual(result, '')

    def test_check_confidence(self):
        detector = LanguageDetector('ngramsM60')
        text = 'zxz zcz zvz zbz znz zmz zlz zsz zdz zkz zjz pelo'
        result = detector.detect(text).is_reliable()
        self.assertEqual(result, False)

    def test_load_ngrams_detect(self):
        detector = LanguageDetector('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')

    def test_accuracy_m_bigtest(self):
        # TODO use importlib or pathlib to open txt file as package eld.tests.data resource
        detector = LanguageDetector('ngramsM60')
        file = open('data/big-test.txt', encoding="utf-8")  # '../../benchmark/big-test.txt'
        content = file.read()
        file.close()
        lines = content.strip().split("\n")
        total = 0
        correct = 0
        for line in lines:
            total += 1
            values = line.split("\t")
            if detector.detect(values[1]).language == values[0]:
                correct += 1
        if total < 60000:
            self.skipTest("big-test.txt was not loaded correctly, too few lines")
        result = correct / total * 100
        # Leave a bit of margin: depending on the order of tied scores, the average might change slightly
        self.assertGreater(result, 99.4)


if __name__ == '__main__':
    unittest.main(verbosity=2)

eld/tests/test_subset.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import unittest
import sys
import os

# Make sure the local package is imported instead of the pip package
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, project_root)  # prioritize the local package
# sys.path.append('../..')

from eld import LanguageDetector


# Mostly functional testing; more unit tests will be added as the functions mature
class TestDetector(unittest.TestCase):
    def test_load_eld(self):
        detector = LanguageDetector()
        self.assertIsInstance(detector, LanguageDetector)

    def test_dynamic_subset_detect(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.dynamic_lang_subset(lang_subset)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        message = "Expected: 1 score, subset of only one language"
        self.assertEqual(result, 1, message)

    def test_remove_dynamic_subset(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.dynamic_lang_subset(lang_subset)
        detector.dynamic_lang_subset(None)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        self.assertGreater(result, 1)

    def test_subset_detect(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        message = "Expected: 1 score, subset of only one language"
        self.assertEqual(result, 1, message)

    def test_remove_subset(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        detector.lang_subset(None)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        self.assertGreater(result, 1)

    def test_save_subset_file(self):
        # TODO use importlib or pathlib to check subset file as package resource
        file = os.path.dirname(__file__) + '/../resources/ngrams/subset/ngramsM60-1_2rrx014rx6ypsas6tplo1gtcnmiv5mz.py'
        if os.path.exists(file):
            os.remove(file)
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        result = os.path.exists(file)
        message = "Subset languages file not saved: " + file
        self.assertEqual(result, True, message)

    def test_load_ngrams_detect(self):
        detector = LanguageDetector('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')


if __name__ == '__main__':
    unittest.main(verbosity=2)
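Taken together with languageData.load_ngrams() above, these subset tests suggest a workflow along the following lines; the generated file name is copied from test_save_subset_file and is assumed (not guaranteed by this diff) to be deterministic for the same language list:

from eld import LanguageDetector

# Restrict detection to English only; per test_save_subset_file this saves a
# trimmed n-gram file under eld/resources/ngrams/subset/
detector = LanguageDetector()
detector.lang_subset(['en'])

# A later run could load that saved subset by name (an assumption based on
# test_load_ngrams_detect, which constructs a detector from an n-gram file name)
subset_detector = LanguageDetector('ngramsM60-1_2rrx014rx6ypsas6tplo1gtcnmiv5mz')
print(subset_detector.detect('How are you?').language)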
