Skip to content

Commit 95a759e

Browse files
authored
Merge pull request #972 from bact/add-corpus-tests
Add test_corpus to __init__
2 parents 0e8ee98 + 1666a00 commit 95a759e

File tree

3 files changed

+40
-46
lines changed

3 files changed

+40
-46
lines changed

pythainlp/corpus/core.py

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
Corpus related functions.
66
"""
7+
78
import json
89
import os
910
from typing import Union
@@ -25,16 +26,16 @@ def get_corpus_db(url: str):
2526

2627
corpus_db = None
2728
try:
28-
corpus_db = requests.get(url)
29+
corpus_db = requests.get(url, timeout=10)
2930
except requests.exceptions.HTTPError as http_err:
3031
print(f"HTTP error occurred: {http_err}")
31-
except Exception as err:
32+
except requests.exceptions.RequestException as err:
3233
print(f"Non-HTTP error occurred: {err}")
3334

3435
return corpus_db
3536

3637

37-
def get_corpus_db_detail(name: str, version: str = '') -> dict:
38+
def get_corpus_db_detail(name: str, version: str = "") -> dict:
3839
"""
3940
Get details about a corpus, using information from local catalog.
4041
@@ -176,7 +177,7 @@ def get_corpus_as_is(filename: str) -> list:
176177
return lines
177178

178179

179-
def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]:
180+
def get_corpus_default_db(name: str, version: str = "") -> Union[str, None]:
180181
"""
181182
Get model path from default_db.json
182183
@@ -207,7 +208,7 @@ def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]:
207208

208209

209210
def get_corpus_path(
210-
name: str, version: str = '', force: bool = False
211+
name: str, version: str = "", force: bool = False
211212
) -> Union[str, None]:
212213
"""
213214
Get corpus path.
@@ -251,11 +252,11 @@ def get_corpus_path(
251252
"""
252253
from typing import Dict
253254

254-
_CUSTOMIZE: Dict[str, str] = {
255+
CUSTOMIZE: Dict[str, str] = {
255256
# "the corpus name":"path"
256257
}
257-
if name in list(_CUSTOMIZE):
258-
return _CUSTOMIZE[name]
258+
if name in list(CUSTOMIZE):
259+
return CUSTOMIZE[name]
259260

260261
default_path = get_corpus_default_db(name=name, version=version)
261262
if default_path is not None:
@@ -290,14 +291,14 @@ def _download(url: str, dst: str) -> int:
290291
@param: URL for downloading file
291292
@param: dst place to put the file into
292293
"""
293-
_CHUNK_SIZE = 64 * 1024 # 64 KiB
294+
CHUNK_SIZE = 64 * 1024 # 64 KiB
294295

295296
from urllib.request import urlopen
296297

297298
import requests
298299

299300
file_size = int(urlopen(url).info().get("Content-Length", -1))
300-
r = requests.get(url, stream=True)
301+
r = requests.get(url, stream=True, timeout=10)
301302
with open(get_full_data_path(dst), "wb") as f:
302303
pbar = None
303304
try:
@@ -307,7 +308,7 @@ def _download(url: str, dst: str) -> int:
307308
except ImportError:
308309
pbar = None
309310

310-
for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
311+
for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
311312
if chunk:
312313
f.write(chunk)
313314
if pbar:
@@ -334,7 +335,7 @@ def _check_hash(dst: str, md5: str) -> None:
334335
file_md5 = hashlib.md5(content).hexdigest()
335336

336337
if md5 != file_md5:
337-
raise Exception("Hash does not match expected.")
338+
raise ValueError("Hash does not match expected.")
338339

339340

340341
def _version2int(v: str) -> int:
@@ -401,7 +402,7 @@ def _check_version(cause: str) -> bool:
401402

402403

403404
def download(
404-
name: str, force: bool = False, url: str = '', version: str = ''
405+
name: str, force: bool = False, url: str = "", version: str = ""
405406
) -> bool:
406407
"""
407408
Download corpus.
@@ -422,7 +423,7 @@ def download(
422423
423424
from pythainlp.corpus import download
424425
425-
download('wiki_lm_lstm', force=True)
426+
download("wiki_lm_lstm", force=True)
426427
# output:
427428
# Corpus: wiki_lm_lstm
428429
# - Downloading: wiki_lm_lstm 0.1
@@ -459,10 +460,13 @@ def download(
459460

460461
# version may still be None here
461462
if version not in corpus["versions"]:
462-
print("Not found corpus")
463+
print("Corpus not found.")
463464
return False
464-
elif _check_version(corpus["versions"][version]["pythainlp_version"]) is False:
465-
print("Versions Corpus not support")
465+
elif (
466+
_check_version(corpus["versions"][version]["pythainlp_version"])
467+
is False
468+
):
469+
print("Corpus version not supported.")
466470
return False
467471
corpus_versions = corpus["versions"][version]
468472
file_name = corpus_versions["filename"]
@@ -505,8 +509,10 @@ def download(
505509
foldername = name + "_" + str(version)
506510
if not os.path.exists(get_full_data_path(foldername)):
507511
os.mkdir(get_full_data_path(foldername))
508-
with zipfile.ZipFile(get_full_data_path(file_name), "r") as zip:
509-
zip.extractall(path=get_full_data_path(foldername))
512+
with zipfile.ZipFile(
513+
get_full_data_path(file_name), "r"
514+
) as zip_file:
515+
zip_file.extractall(path=get_full_data_path(foldername))
510516

511517
if found:
512518
local_db["_default"][found]["version"] = version
@@ -517,7 +523,9 @@ def download(
517523
# This awkward behavior is for backward-compatibility with
518524
# database files generated previously using TinyDB
519525
if local_db["_default"]:
520-
corpus_no = max((int(no) for no in local_db["_default"])) + 1
526+
corpus_no = (
527+
max((int(no) for no in local_db["_default"])) + 1
528+
)
521529
else:
522530
corpus_no = 1
523531
local_db["_default"][str(corpus_no)] = {
@@ -564,13 +572,13 @@ def remove(name: str) -> bool:
564572
565573
from pythainlp.corpus import remove, get_corpus_path, get_corpus
566574
567-
print(remove('ttc'))
575+
print(remove("ttc"))
568576
# output: True
569577
570-
print(get_corpus_path('ttc'))
578+
print(get_corpus_path("ttc"))
571579
# output: None
572580
573-
get_corpus('ttc')
581+
get_corpus("ttc")
574582
# output:
575583
# FileNotFoundError: [Errno 2] No such file or directory:
576584
# '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
@@ -580,7 +588,9 @@ def remove(name: str) -> bool:
580588
return False
581589
with open(corpus_db_path(), "r", encoding="utf-8-sig") as f:
582590
db = json.load(f)
583-
data = [corpus for corpus in db["_default"].values() if corpus["name"] == name]
591+
data = [
592+
corpus for corpus in db["_default"].values() if corpus["name"] == name
593+
]
584594

585595
if data:
586596
path = get_corpus_path(name)

tests/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
test_packages: list[str] = [
1414
"tests.test_ancient",
1515
"tests.test_cli",
16-
# "tests.test_corpus",
16+
"tests.test_corpus",
1717
"tests.test_morpheme",
1818
"tests.test_soundex",
1919
"tests.test_spell",

tests/test_corpus.py

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ def test_corpus(self):
7676
self.assertIsInstance(thai_female_names(), frozenset)
7777
self.assertIsInstance(thai_male_names(), frozenset)
7878

79-
self.assertIsNotNone(get_corpus_default_db("thainer", "1.5.1"))
80-
self.assertIsNotNone(get_corpus_default_db("thainer"))
81-
self.assertIsNone(get_corpus_default_db("thainer", "1.2"))
79+
# Trying not to download big files here, as that slows down the test
80+
81+
self.assertIsNone(get_corpus_default_db("3XKg0013", "1.2.345"))
8282

8383
# BEGIN - Test non-exists
8484
self.assertIsInstance(
@@ -112,25 +112,9 @@ def test_corpus(self):
112112
self.assertFalse(remove("test")) # remove non-existing
113113
# END - Test download
114114

115-
# TODO: Need this clean up this "test" download test
116-
# BEGIN - Need to clean up this section
117-
self.assertFalse(download(name="test", version="0.0"))
118-
self.assertFalse(download(name="test", version="0.0.0"))
115+
# This corpus version is not supported by this PyThaiNLP version
116+
# test 0.0.1 is for PyThaiNLP version <2.0
119117
self.assertFalse(download(name="test", version="0.0.1"))
120-
self.assertFalse(download(name="test", version="0.0.2"))
121-
self.assertFalse(download(name="test", version="0.0.3"))
122-
self.assertFalse(download(name="test", version="0.0.4"))
123-
self.assertIsNotNone(download(name="test", version="0.0.5"))
124-
self.assertTrue(download("test"))
125-
self.assertIsNotNone(remove("test")) # remove existing
126-
self.assertIsNotNone(download(name="test", version="0.0.6"))
127-
self.assertIsNotNone(download(name="test", version="0.0.7"))
128-
self.assertIsNotNone(download(name="test", version="0.0.8"))
129-
self.assertIsNotNone(download(name="test", version="0.0.9"))
130-
self.assertIsNotNone(download(name="test", version="0.0.10"))
131-
self.assertIsNotNone(download(name="test", version="0.1"))
132-
self.assertIsNotNone(remove("test"))
133-
# END - Need to clean up this section
134118

135119
def test_oscar(self):
136120
self.assertIsNotNone(oscar.word_freqs())

0 commit comments

Comments
 (0)