
Commit de0d471

Folders restructure

Author: Nito (committed)
1 parent b54f877 commit de0d471

File tree: 6 files changed, 260 additions & 0 deletions


.gitignore

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
__pycache__/
*.pyc
/.idea/
build/
dist/
*.egg-info/
*.egg

eld/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
from .languageDetector import LanguageDetector
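For context, a minimal usage sketch of the exported class, based only on the API exercised by the test files in this commit (detect(), .language, .scores(), .is_reliable(), return_scores); the sample text is illustrative:

from eld import LanguageDetector

detector = LanguageDetector()   # the default load uses the ngramsM60 database
detector.return_scores = True   # as in test_get_multiple_scores

result = detector.detect('Hola, cómo te llamas?')
print(result.language)          # e.g. 'es'
print(result.is_reliable())     # confidence check used by test_check_confidence
print(result.scores())          # scores for more than one candidate language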

eld/languageData.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
"""
Copyright 2023 Nito T.M.
Author URL: https://github.com/nitotm

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import importlib.util
import os


class LanguageData:
    def __init__(self):
        from .resources.avg_score import avg_score
        self.avg_score = avg_score
        self.ngrams = {}
        self.lang_score = []
        self.lang_codes = {}
        self.type = ''
        self.folder = os.path.dirname(__file__) + '/resources/ngrams/'

    """
    ISO 639-1 codes, for the 60 languages set.
    ['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu',
     'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl',
     'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur',
     'vi', 'yo', 'zh']

    ['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish',
     'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati', 'Hebrew',
     'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian', 'Kannada', 'Korean',
     'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)', 'Dutch', 'Norwegian',
     'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene', 'Albanian',
     'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian', 'Urdu', 'Vietnamese',
     'Yoruba', 'Chinese']
    """

    def load_ngrams(self, subset_file=''):
        if subset_file == '':
            from .resources.ngrams.ngramsM60 import ngrams_data
        else:
            # module = importlib.import_module('.ngrams.' + subset_file)
            file_path = self.folder + subset_file + '.py'
            if not os.path.exists(file_path):
                file_path = self.folder + 'subset/' + subset_file + '.py'
            spec = importlib.util.spec_from_file_location(subset_file, file_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            ngrams_data = module.ngrams_data

        self.ngrams = ngrams_data['ngrams']
        self.lang_score = [0] * (max(ngrams_data['languages'].keys()) + 1)
        self.type = ngrams_data['type']
        self.lang_codes = ngrams_data['languages']


languageData = LanguageData()
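A minimal sketch of using this module directly, assuming the bundled ngramsM60 resource (not part of this commit's diff) is installed alongside the package:

from eld.languageData import languageData

# Default load: imports eld/resources/ngrams/ngramsM60.py
languageData.load_ngrams()
print(languageData.type)             # n-gram database type reported by the data file
print(len(languageData.lang_codes))  # 60 languages in the full set
print(len(languageData.lang_score))  # one score slot per language id

# A named file is resolved under resources/ngrams/, falling back to resources/ngrams/subset/
# languageData.load_ngrams('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')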

eld/resources/avg_score.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
"""
Copyright 2023 Nito T.M.
Author URL: https://github.com/nitotm

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

avg_score = {'am': 0.0661, 'ar': 0.0237, 'az': 0.0269, 'be': 0.0227, 'bg': 0.0234, 'bn': 0.1373, 'ca': 0.0246,
             'cs': 0.0242, 'da': 0.0277, 'de': 0.0275, 'el': 0.0369, 'en': 0.0378, 'es': 0.0252, 'et': 0.0253,
             'eu': 0.0369, 'fa': 0.0213, 'fi': 0.026, 'fr': 0.0253, 'gu': 0.1197, 'he': 0.0402, 'hi': 0.0578,
             'hr': 0.0201, 'hu': 0.0208, 'hy': 0.0439, 'is': 0.032, 'it': 0.0251, 'ja': 0.0375, 'ka': 0.1383,
             'kn': 0.1305, 'ko': 0.0222, 'ku': 0.0256, 'lo': 0.3488, 'lt': 0.0246, 'lv': 0.0264, 'ml': 0.1322,
             'mr': 0.0571, 'ms': 0.0251, 'nl': 0.0342, 'no': 0.0266, 'or': 0.1269, 'pa': 0.1338, 'pl': 0.0275,
             'pt': 0.0252, 'ro': 0.0247, 'ru': 0.0184, 'sk': 0.024, 'sl': 0.0253, 'sq': 0.0353, 'sr': 0.0234,
             'sv': 0.033, 'ta': 0.1513, 'te': 0.1547, 'th': 0.0882, 'tl': 0.0368, 'tr': 0.0258, 'uk': 0.0206,
             'ur': 0.0282, 'vi': 0.0467, 'yo': 0.0329, 'zh': 0.0152}

""" Deprecated for now: Some languages score higher with the same amount of text, this multiplier evens it out
for multi-language strings
self.scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1, 0.9,
    1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6,
    0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7]
"""

eld/tests/test_detector.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import unittest
import os
import sys

# Make sure the local package is imported instead of the pip package
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, project_root)  # prioritize the local package
# sys.path.append('../..')

from eld import LanguageDetector
from eld.languageDetector import get_clean_txt


# Mostly functional testing; more unit tests will be added as the functions mature
class TestDetector(unittest.TestCase):
    def test_print_version(self):
        detector = LanguageDetector()
        print(detector.VERSION)

    def test_load_eld(self):
        detector = LanguageDetector()
        self.assertIsInstance(detector, LanguageDetector)

    def test_simple_detect(self):
        detector = LanguageDetector()
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')

    def test_get_multiple_scores(self):
        detector = LanguageDetector()
        detector.return_scores = True
        result = len(detector.detect('Hola, cómo te llamas?').scores())
        message = "Expected: >1 scores"
        self.assertGreater(result, 1, message)

    def test_detect_error_empty_text(self):
        detector = LanguageDetector()
        result = detector.detect('').language
        self.assertEqual(result, None)

    def test_clean_text(self):
        text = "https://www.google.com/\n" \
               "google.com/search?q=search&source=hp\n" \
               "12345 A12345\n"
        result = get_clean_txt(text).strip()
        self.assertEqual(result, '')

    def test_check_confidence(self):
        detector = LanguageDetector('ngramsM60')
        text = 'zxz zcz zvz zbz znz zmz zlz zsz zdz zkz zjz pelo'
        result = detector.detect(text).is_reliable()
        self.assertEqual(result, False)

    def test_load_ngrams_detect(self):
        detector = LanguageDetector('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')

    def test_accuracy_m_bigtest(self):
        # TODO use importlib or pathlib to open txt file as package eld.tests.data resource
        detector = LanguageDetector('ngramsM60')
        file = open('data/big-test.txt', encoding="utf-8")  # '../../benchmark/big-test.txt'
        content = file.read()
        file.close()
        lines = content.strip().split("\n")
        total = 0
        correct = 0
        for line in lines:
            total += 1
            values = line.split("\t")
            if detector.detect(values[1]).language == values[0]:
                correct += 1
        if total < 60000:
            self.skipTest("big-test.txt was not loaded correctly, too few lines")
        result = correct / total * 100
        # Leave a bit of margin: depending on the order of tied scores, the average might change slightly
        self.assertGreater(result, 99.4)


if __name__ == '__main__':
    unittest.main(verbosity=2)

eld/tests/test_subset.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import unittest
import sys
import os

# Make sure the local package is imported instead of the pip package
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, project_root)  # prioritize the local package
# sys.path.append('../..')

from eld import LanguageDetector


# Mostly functional testing; more unit tests will be added as the functions mature
class TestDetector(unittest.TestCase):
    def test_load_eld(self):
        detector = LanguageDetector()
        self.assertIsInstance(detector, LanguageDetector)

    def test_dynamic_subset_detect(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.dynamic_lang_subset(lang_subset)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        message = "Expected: 1 score, subset of only one language"
        self.assertEqual(result, 1, message)

    def test_remove_dynamic_subset(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.dynamic_lang_subset(lang_subset)
        detector.dynamic_lang_subset(None)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        self.assertGreater(result, 1)

    def test_subset_detect(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        message = "Expected: 1 score, subset of only one language"
        self.assertEqual(result, 1, message)

    def test_remove_subset(self):
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        detector.lang_subset(None)
        result = len(detector.detect('How are you? Bien, gracias').scores())
        self.assertGreater(result, 1)

    def test_save_subset_file(self):
        # TODO use importlib or pathlib to check subset file as package resource
        file = os.path.dirname(__file__) + '/../resources/ngrams/subset/ngramsM60-1_2rrx014rx6ypsas6tplo1gtcnmiv5mz.py'
        if os.path.exists(file):
            os.remove(file)
        detector = LanguageDetector()
        lang_subset = ['en']
        detector.lang_subset(lang_subset)
        result = os.path.exists(file)
        message = "Subset languages file not saved: " + file
        self.assertEqual(result, True, message)

    def test_load_ngrams_detect(self):
        detector = LanguageDetector('ngramsM60-6_5ijqhj4oecs310zqtm8u9pgmd9ox2yd')
        result = detector.detect('Hola, cómo te llamas?').language
        self.assertEqual(result, 'es')


if __name__ == '__main__':
    unittest.main(verbosity=2)
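Taken together with languageData.load_ngrams() above, these subset tests suggest a workflow along the following lines; the generated file name is copied from test_save_subset_file and is assumed (not guaranteed by this diff) to be deterministic for the same language list:

from eld import LanguageDetector

# Restrict detection to English only; per test_save_subset_file this saves a
# trimmed n-gram file under eld/resources/ngrams/subset/
detector = LanguageDetector()
detector.lang_subset(['en'])

# A later run could load that saved subset by name (an assumption based on
# test_load_ngrams_detect, which constructs a detector from an n-gram file name)
subset_detector = LanguageDetector('ngramsM60-1_2rrx014rx6ypsas6tplo1gtcnmiv5mz')
print(subset_detector.detect('How are you?').language)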
