4
4
"""
5
5
Corpus related functions.
6
6
"""
7
+
7
8
import json
8
9
import os
9
10
from typing import Union
@@ -25,16 +26,16 @@ def get_corpus_db(url: str):
25
26
26
27
corpus_db = None
27
28
try :
28
- corpus_db = requests .get (url )
29
+ corpus_db = requests .get (url , timeout = 10 )
29
30
except requests .exceptions .HTTPError as http_err :
30
31
print (f"HTTP error occurred: { http_err } " )
31
- except Exception as err :
32
+ except requests . exceptions . RequestException as err :
32
33
print (f"Non-HTTP error occurred: { err } " )
33
34
34
35
return corpus_db
35
36
36
37
37
- def get_corpus_db_detail (name : str , version : str = '' ) -> dict :
38
+ def get_corpus_db_detail (name : str , version : str = "" ) -> dict :
38
39
"""
39
40
Get details about a corpus, using information from local catalog.
40
41
@@ -176,7 +177,7 @@ def get_corpus_as_is(filename: str) -> list:
176
177
return lines
177
178
178
179
179
- def get_corpus_default_db (name : str , version : str = '' ) -> Union [str , None ]:
180
+ def get_corpus_default_db (name : str , version : str = "" ) -> Union [str , None ]:
180
181
"""
181
182
Get model path from default_db.json
182
183
@@ -207,7 +208,7 @@ def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]:
207
208
208
209
209
210
def get_corpus_path (
210
- name : str , version : str = '' , force : bool = False
211
+ name : str , version : str = "" , force : bool = False
211
212
) -> Union [str , None ]:
212
213
"""
213
214
Get corpus path.
@@ -251,11 +252,11 @@ def get_corpus_path(
251
252
"""
252
253
from typing import Dict
253
254
254
- _CUSTOMIZE : Dict [str , str ] = {
255
+ CUSTOMIZE : Dict [str , str ] = {
255
256
# "the corpus name":"path"
256
257
}
257
- if name in list (_CUSTOMIZE ):
258
- return _CUSTOMIZE [name ]
258
+ if name in list (CUSTOMIZE ):
259
+ return CUSTOMIZE [name ]
259
260
260
261
default_path = get_corpus_default_db (name = name , version = version )
261
262
if default_path is not None :
@@ -290,14 +291,14 @@ def _download(url: str, dst: str) -> int:
290
291
@param: URL for downloading file
291
292
@param: dst place to put the file into
292
293
"""
293
- _CHUNK_SIZE = 64 * 1024 # 64 KiB
294
+ CHUNK_SIZE = 64 * 1024 # 64 KiB
294
295
295
296
from urllib .request import urlopen
296
297
297
298
import requests
298
299
299
300
file_size = int (urlopen (url ).info ().get ("Content-Length" , - 1 ))
300
- r = requests .get (url , stream = True )
301
+ r = requests .get (url , stream = True , timeout = 10 )
301
302
with open (get_full_data_path (dst ), "wb" ) as f :
302
303
pbar = None
303
304
try :
@@ -307,7 +308,7 @@ def _download(url: str, dst: str) -> int:
307
308
except ImportError :
308
309
pbar = None
309
310
310
- for chunk in r .iter_content (chunk_size = _CHUNK_SIZE ):
311
+ for chunk in r .iter_content (chunk_size = CHUNK_SIZE ):
311
312
if chunk :
312
313
f .write (chunk )
313
314
if pbar :
@@ -334,7 +335,7 @@ def _check_hash(dst: str, md5: str) -> None:
334
335
file_md5 = hashlib .md5 (content ).hexdigest ()
335
336
336
337
if md5 != file_md5 :
337
- raise Exception ("Hash does not match expected." )
338
+ raise ValueError ("Hash does not match expected." )
338
339
339
340
340
341
def _version2int (v : str ) -> int :
@@ -401,7 +402,7 @@ def _check_version(cause: str) -> bool:
401
402
402
403
403
404
def download (
404
- name : str , force : bool = False , url : str = '' , version : str = ''
405
+ name : str , force : bool = False , url : str = "" , version : str = ""
405
406
) -> bool :
406
407
"""
407
408
Download corpus.
@@ -422,7 +423,7 @@ def download(
422
423
423
424
from pythainlp.corpus import download
424
425
425
- download(' wiki_lm_lstm' , force=True)
426
+ download(" wiki_lm_lstm" , force=True)
426
427
# output:
427
428
# Corpus: wiki_lm_lstm
428
429
# - Downloading: wiki_lm_lstm 0.1
@@ -459,10 +460,13 @@ def download(
459
460
460
461
# version may still be None here
461
462
if version not in corpus ["versions" ]:
462
- print ("Not found corpus " )
463
+ print ("Corpus not found. " )
463
464
return False
464
- elif _check_version (corpus ["versions" ][version ]["pythainlp_version" ]) is False :
465
- print ("Versions Corpus not support" )
465
+ elif (
466
+ _check_version (corpus ["versions" ][version ]["pythainlp_version" ])
467
+ is False
468
+ ):
469
+ print ("Corpus version not supported." )
466
470
return False
467
471
corpus_versions = corpus ["versions" ][version ]
468
472
file_name = corpus_versions ["filename" ]
@@ -505,8 +509,10 @@ def download(
505
509
foldername = name + "_" + str (version )
506
510
if not os .path .exists (get_full_data_path (foldername )):
507
511
os .mkdir (get_full_data_path (foldername ))
508
- with zipfile .ZipFile (get_full_data_path (file_name ), "r" ) as zip :
509
- zip .extractall (path = get_full_data_path (foldername ))
512
+ with zipfile .ZipFile (
513
+ get_full_data_path (file_name ), "r"
514
+ ) as zip_file :
515
+ zip_file .extractall (path = get_full_data_path (foldername ))
510
516
511
517
if found :
512
518
local_db ["_default" ][found ]["version" ] = version
@@ -517,7 +523,9 @@ def download(
517
523
# This awkward behavior is for backward-compatibility with
518
524
# database files generated previously using TinyDB
519
525
if local_db ["_default" ]:
520
- corpus_no = max ((int (no ) for no in local_db ["_default" ])) + 1
526
+ corpus_no = (
527
+ max ((int (no ) for no in local_db ["_default" ])) + 1
528
+ )
521
529
else :
522
530
corpus_no = 1
523
531
local_db ["_default" ][str (corpus_no )] = {
@@ -564,13 +572,13 @@ def remove(name: str) -> bool:
564
572
565
573
from pythainlp.corpus import remove, get_corpus_path, get_corpus
566
574
567
- print(remove(' ttc' ))
575
+ print(remove(" ttc" ))
568
576
# output: True
569
577
570
- print(get_corpus_path(' ttc' ))
578
+ print(get_corpus_path(" ttc" ))
571
579
# output: None
572
580
573
- get_corpus(' ttc' )
581
+ get_corpus(" ttc" )
574
582
# output:
575
583
# FileNotFoundError: [Errno 2] No such file or directory:
576
584
# '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
@@ -580,7 +588,9 @@ def remove(name: str) -> bool:
580
588
return False
581
589
with open (corpus_db_path (), "r" , encoding = "utf-8-sig" ) as f :
582
590
db = json .load (f )
583
- data = [corpus for corpus in db ["_default" ].values () if corpus ["name" ] == name ]
591
+ data = [
592
+ corpus for corpus in db ["_default" ].values () if corpus ["name" ] == name
593
+ ]
584
594
585
595
if data :
586
596
path = get_corpus_path (name )
0 commit comments