Skip to content

Commit e562b55

Browse files
author
Nito
committed
Code improvements with codesniffer
1 parent a2e1c7b commit e562b55

File tree

10 files changed

+202
-133
lines changed

10 files changed

+202
-133
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/vendor/
2+
/.idea/
3+
/composer.lock
4+
/phpcpd.phar
5+
/phpcs.phar

benchmarks/bench.php

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?php
22

3-
require_once __DIR__.'/../src/languageDetector.php';
3+
require_once __DIR__ . '/../src/languageDetector.php';
44

55
use Nitotm\Eld\LanguageDetector;
66

@@ -10,7 +10,6 @@
1010
print (PHP_SAPI === 'cli' ? '' : "<pre>" . PHP_EOL);
1111

1212
foreach ($files as $key => $file) {
13-
1413
$content = file_get_contents(__DIR__ . '/' . $file);
1514
$lines = explode("\n", trim($content));
1615
$texts = [];
@@ -25,30 +24,25 @@
2524

2625
$start = microtime(true);
2726
foreach ($texts as $text) {
28-
if ($eld->detect($text[0], false, false, 0, 1)['language'] == $text[1]) {
27+
if ($eld->detect($text[0], false, false, 0, 1)['language'] === $text[1]) {
2928
$correct++;
3029
}
3130
}
3231
$time = microtime(true) - $start;
33-
print $file . ' - Correct ratio: ' . round(($correct / $total) * 100, 2) . '% Time: ' . $time . PHP_EOL.PHP_EOL;
34-
32+
print $file . ' - Correct ratio: ' . round(($correct / $total) * 100, 2) . '% Time: ' . $time . PHP_EOL . PHP_EOL;
3533
}
3634

3735
print (PHP_SAPI === 'cli' ? '' : "</pre>" . PHP_EOL);
3836

3937
/*
4038
Results for v1.0.0, PHP 7.4.4, ngrams-m.php
4139
42-
tweets.txt - Correct ratio: 99.28% Time: 0.30713295936584
43-
40+
tweets.txt - Correct ratio: 99.28% Time: 0.30713295936584
4441
big-test.txt - Correct ratio: 99.42% Time: 2.4928371906281
45-
4642
sentences.txt - Correct ratio: 98.78% Time: 2.1568570137024
47-
4843
word-pairs.txt - Correct ratio: 87.56% Time: 0.66023302078247
49-
5044
single-words.txt - Correct ratio: 73.31% Time: 0.47791314125061
5145
5246
If the correct ratio is lower, use $eld = new languageDetector('ngrams-m.safe.php'); to see if it fixes the problem.
5347
54-
*/
48+
*/

composer.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,8 @@
3131
"require": {
3232
"php": "^7.3 || ^8.0",
3333
"ext-mbstring": "*"
34+
},
35+
"require-dev": {
36+
"squizlabs/php_codesniffer": "3.*"
3437
}
3538
}

demo.php

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,45 +23,58 @@
2323

2424
$eld = new Nitotm\Eld\LanguageDetector;
2525

26-
// detect() expects a UTF-8 string, and returns an array, with a value named 'language', which will be either an ISO 639-1 code or false
26+
// detect() expects a UTF-8 string, returns an array, with a value (ISO 639-1 code or false) named 'language'
2727
var_dump($eld->detect('Hola, cómo te llamas?'));
28-
// ['language' => 'es'];
29-
// ['language' => false, 'error' => 'Some error', 'scores'=>[]];
28+
// ['language' => 'es'];
29+
// ['language' => false, 'error' => 'Some error', 'scores'=>[]];
3030

3131

3232
// To get the best guess, turn off minimum length, confidence threshold; also used for benchmarking.
3333
var_dump($eld->detect('To', false, false, 0, 1));
3434

3535
/*
36-
To improve readability moving forward, PHP8 Named Parameters can be used
37-
print_r($eld->detect(text: 'To', cleanText: false, checkConfidence: false, minByteLength: 12, minNgrams: 3));
38-
cleanText: true, Removes Urls, domains, emails, alphanumerical & numbers
36+
To improve readability moving forward, PHP8 Named Parameters can be used
37+
print_r($eld->detect(text: 'To', cleanText: false, checkConfidence: false, minByteLength: 12, minNgrams: 3));
38+
cleanText: true, Removes Urls, domains, emails, alphanumerical & numbers
3939
*/
4040

4141
// To retrieve the whole list of languages detected and their score, we will set $returnScores to True, just once
4242
$eld->returnScores = true;
4343
var_dump($eld->detect('How are you? Bien, gracias'));
44-
// ['language' => 'en', 'scores' => ['en' => 0.32, 'es' => 0.31, ...]];
44+
// ['language' => 'en', 'scores' => ['en' => 0.32, 'es' => 0.31, ...]];
4545

4646
/*
47-
To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
47+
To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
4848
49-
This is the complete list on languages for ELD v1, using ISO 639-1 codes:
50-
['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu', 'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl', 'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yo', 'zh']
49+
This is the complete list of languages for ELD v1, using ISO 639-1 codes:
50+
['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu',
51+
'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl',
52+
'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur',
53+
'vi', 'yo', 'zh']
5154
*/
52-
$langSubset = ['en','es','fr','it','nl','de'];
55+
$langSubset = ['en', 'es', 'fr', 'it', 'nl', 'de'];
5356

5457
// dynamicLangSubset() Will execute the detector normally, but at the end will filter the excluded languages.
5558
$eld->dynamicLangSubset($langSubset);
5659
// to remove the subset
5760
$eld->dynamicLangSubset(false);
5861

59-
// langSubset($langs, save: true, safe: false) Will previously remove the excluded languages form the Ngrams database; for a single detection might be slower than dynamicLangSubset(), but for several strings will be faster. If $save option is true (default), the new ngrams subset will be stored, and next loaded for the same language subset, increasing startup speed. Use $safe=true to store Ngram bytes hex encoded.
62+
/*
63+
langSubset($langs, save: true, safe: false) Will first remove the excluded languages from the Ngrams database;
64+
for a single detection might be slower than dynamicLangSubset(), but for several strings will be faster.
65+
If $save is true (default), the new ngrams subset will be stored, and loaded next time for the same language subset,
66+
increasing startup speed. Use $safe=true to store Ngram bytes hex encoded.
67+
*/
6068
$eld->langSubset($langSubset); // returns subset file name if saved
6169
// to remove the subset
6270
$eld->langSubset(false);
6371

64-
// Finally the fastest option to regularly use the same language subset, will be to add as an argument the file stored (and returned) by langSubset(), when creating an instance of the class. In this case the subset Ngrams database will be loaded directly, and not the default database. Also, you can use this option to load different ngram databases stored at src/ngrams/
72+
/*
73+
Finally, the fastest option to regularly use the same language subset is to pass as an argument the file stored
74+
(and returned) by langSubset(), when creating an instance of the class. In this case the subset Ngrams database will
75+
be loaded directly, and not the default database. Also, you can use this option to load different ngram databases
76+
stored at src/ngrams/
77+
*/
6578
$elds = new Nitotm\Eld\LanguageDetector('ngrams.2f37045c74780aba1d36d6717f3244dc025fb935.php');
6679

6780
print (PHP_SAPI === 'cli' ? '' : "</pre>");

src/LanguageData.php

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
See the License for the specific language governing permissions and
1616
limitations under the License.
1717
*/
18-
1918
// This file matches the trained data of the Ngrams database, for new trained databases this file has to be updated.
19+
declare(strict_types=1);
2020

2121
namespace Nitotm\Eld;
2222

@@ -26,7 +26,7 @@ class LanguageData extends LanguageSubset
2626
{
2727
protected $ngrams;
2828

29-
// ISO 639-1 codes
29+
// ISO 639-1 codes
3030
protected $langCodes
3131
= [
3232
'am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
@@ -35,11 +35,22 @@ class LanguageData extends LanguageSubset
3535
'tr', 'uk', 'ur', 'vi', 'yo', 'zh'
3636
];
3737

38-
// ['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish', 'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati', 'Hebrew', 'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian', 'Kannada', 'Korean', 'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)', 'Dutch', 'Norwegian', 'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene', 'Albanian', 'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian', 'Urdu', 'Vietnamese', 'Yoruba', 'Chinese'];
39-
40-
// Deprecated for now. Some languages score higher with the same amount of text, this multiplier evens it out for multi-language strings
41-
//protected $scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1, 0.9, 1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6, 0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7];
38+
/*
39+
['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish',
40+
'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati',
41+
'Hebrew', 'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian',
42+
'Kannada', 'Korean', 'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)',
43+
'Dutch', 'Norwegian', 'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene',
44+
'Albanian', 'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian',
45+
'Urdu', 'Vietnamese', 'Yoruba', 'Chinese'];
46+
*/
4247

48+
/* Deprecated for now.
49+
Some languages score higher with the same amount of text, this multiplier evens it out for multi-language strings
50+
protected $scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1,
51+
0.9, 1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6,
52+
0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7];
53+
*/
4354
protected $langScore;
4455
protected $avgScore
4556
= [
@@ -50,10 +61,10 @@ class LanguageData extends LanguageSubset
5061
0.0882, 0.0368, 0.0258, 0.0206, 0.0282, 0.0467, 0.0329, 0.0152
5162
];
5263

53-
function __construct(string $ngramsFile = 'ngrams-m.php')
64+
public function __construct(?string $ngramsFile = null)
5465
{
5566
// Opcache needs to be active, so the load of the database array does not add overhead.
56-
require __DIR__ . '/ngrams/' . $ngramsFile;
67+
require __DIR__ . '/ngrams/' . ($ngramsFile ?? "ngrams-m.php");
5768
// Internal reference: _ngrams_newAddEnd4gramExtra_1-4_2824 + _ngrams_charUtf8_1-1_2291
5869
$this->langScore = array_fill(0, count($this->langCodes), 0);
5970
}

src/LanguageSubset.php

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,30 +15,32 @@
1515
See the License for the specific language governing permissions and
1616
limitations under the License.
1717
*/
18-
19-
/*
20-
To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
21-
22-
The fastest option to regularly use the same language subset, will be to add as an argument the file stored (and returned) by langSubset(), when creating an instance of the languageDetector class. In this case the subset ngrams database will be loaded directly, and not the default database. Also, you can use this option to load different ngram databases.
23-
*/
18+
declare(strict_types=1);
2419

2520
namespace Nitotm\Eld;
2621

2722
class LanguageSubset
2823
{
2924
protected $subset = false;
3025
protected $loadedSubset = false;
26+
protected $ngrams = [];
27+
protected $langCodes = [];
3128
private $defaultNgrams = false;
3229

33-
// dynamicLangSubset() Will execute the detector normally, but at the end it will filter the excluded languages.
30+
/**
31+
* When active, detect() uses filterLangSubset() to drop languages not included in $subset from the scores
32+
*
33+
* @param array|bool $langs
34+
* @return array|false
35+
*/
3436
public function dynamicLangSubset($langs)
3537
{
3638
if ($langs) {
3739
$this->subset = [];
38-
foreach ($langs as $value) {
39-
$lang = array_search($value, $this->langCodes);
40-
if ($lang !== false) {
41-
$this->subset[] = $lang;
40+
foreach ($langs as $lang) {
41+
$foundLang = array_search($lang, $this->langCodes, true);
42+
if ($foundLang !== false) {
43+
$this->subset[] = $foundLang;
4244
}
4345
}
4446
sort($this->subset);
@@ -49,8 +51,15 @@ public function dynamicLangSubset($langs)
4951
return $this->subset;
5052
}
5153

52-
// langSubset($langs,$save=true) Will previously remove the excluded languages form the ngrams database; for a single detection might be slower than dynamicLangSubset(), but for multiple strings will be faster. if $save option is true (default), the new ngrams subset will be stored, and next loaded for the same language subset, increasing startup speed.
53-
public function langSubset($langs, $save = true, $safe = false)
54+
55+
/**
56+
* Removes the excluded languages from the ngrams database
57+
* If $save is true, the new ngrams subset will be stored, and loaded next time for the same language subset
58+
*
59+
* @param array|bool $langs
60+
* @return string|true
61+
*/
62+
public function langSubset($langs, bool $save = true, bool $safe = false)
5463
{
5564
if (!$langs) {
5665
if ($this->loadedSubset) {
@@ -88,7 +97,7 @@ public function langSubset($langs, $save = true, $safe = false)
8897

8998
foreach ($this->ngrams as $ngram => $langsID) {
9099
foreach ($langsID as $id => $value) {
91-
if (!in_array($id, $langs_array)) {
100+
if (!in_array($id, $langs_array, true)) {
92101
unset($this->ngrams[$ngram][$id]);
93102
}
94103
}
@@ -100,9 +109,10 @@ public function langSubset($langs, $save = true, $safe = false)
100109

101110
if ($save) {
102111
if (!file_exists($file_name)) { // in case $this->loadedSubset !== $new_subset, and was previously saved
103-
file_put_contents($file_name,
112+
file_put_contents(
113+
$file_name,
104114
'<?php' . "\r\n" . '// Do not edit unless you ensure you are using UTF-8 encoding' . "\r\n"
105-
. '$this->ngrams=' . $this->ngram_export($this->ngrams, $safe) . ';'
115+
. '$this->ngrams=' . $this->ngramExport($this->ngrams, $safe) . ';'
106116
);
107117
}
108118

@@ -112,31 +122,35 @@ public function langSubset($langs, $save = true, $safe = false)
112122
return true;
113123
}
114124

115-
protected function filterLangSubset($results)
125+
/**
126+
* Removes from the result scores the languages not included in the subset
127+
*/
128+
protected function filterLangSubset(array $results): array
116129
{
117-
foreach ($results as $key => $value) {
118-
if (!in_array($key, $this->subset)) {
119-
unset($results[$key]);
130+
foreach ($results as $langID => $score) {
131+
if (!in_array($langID, $this->subset, true)) {
132+
unset($results[$langID]);
120133
}
121134
}
122135

123136
return $results;
124137
}
125138

126-
protected function ngram_export($var, $safe = false)
139+
/**
140+
* @param array|int $data
141+
*/
142+
protected function ngramExport($data, bool $safe = false): ?string
127143
{
128-
if (is_array($var)) {
144+
if (is_array($data)) {
129145
$toImplode = array();
130-
foreach ($var as $key => $value) {
146+
foreach ($data as $key => $value) {
131147
$toImplode[] = ($safe === true ? '"\\x' . substr(chunk_split(bin2hex($key), 2, '\\x'), 0, -2) . '"'
132-
: var_export($key, true)) . '=>' . $this->ngram_export($value);
148+
: var_export($key, true)) . '=>' . $this->ngramExport($value);
133149
}
134150

135151
return '[' . implode(',', $toImplode) . ']';
136-
} else {
137-
return var_export($var, true);
138152
}
139-
}
140-
141153

154+
return var_export($data, true);
155+
}
142156
}

0 commit comments

Comments
 (0)