nitotm
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 0 deletions b/‎.gitignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benchmarks/bench.php‎
Lines changed: 5 additions & 11 deletions b/‎benchmarks/bench.php‎
Lines changed: 5 additions & 11 deletions
diff --git a/‎composer.json‎
Lines changed: 3 additions & 0 deletions b/‎composer.json‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎demo.php‎
Lines changed: 26 additions & 13 deletions b/‎demo.php‎
Lines changed: 26 additions & 13 deletions
diff --git a/‎src/LanguageData.php‎
Lines changed: 19 additions & 8 deletions b/‎src/LanguageData.php‎
Lines changed: 19 additions & 8 deletions
diff --git a/‎src/LanguageSubset.php‎
Lines changed: 42 additions & 28 deletions b/‎src/LanguageSubset.php‎
Lines changed: 42 additions & 28 deletions
@@ -0,0 +1,5 @@
+/vendor/
+/.idea/
+/composer.lock
+/phpcpd.phar
+/phpcs.phar
@@ -1,6 +1,6 @@
 <?php
 
-require_once __DIR__.'/../src/languageDetector.php';
+require_once __DIR__ . '/../src/languageDetector.php';
 
 use Nitotm\Eld\LanguageDetector;
 
@@ -10,7 +10,6 @@
 print (PHP_SAPI === 'cli' ? '' : "<pre>" . PHP_EOL);
 
 foreach ($files as $key => $file) {
-
     $content = file_get_contents(__DIR__ . '/' . $file);
     $lines = explode("\n", trim($content));
     $texts = [];
@@ -25,30 +24,25 @@
 
     $start = microtime(true);
     foreach ($texts as $text) {
-        if ($eld->detect($text[0], false, false, 0, 1)['language'] == $text[1]) {
+        if ($eld->detect($text[0], false, false, 0, 1)['language'] === $text[1]) {
             $correct++;
         }
     }
     $time = microtime(true) - $start;
-    print $file . ' - Correct ratio: ' . round(($correct / $total) * 100, 2) . '% Time: ' . $time . PHP_EOL.PHP_EOL;
-
+    print $file . ' - Correct ratio: ' . round(($correct / $total) * 100, 2) . '% Time: ' . $time . PHP_EOL . PHP_EOL;
 }
 
 print (PHP_SAPI === 'cli' ? '' : "</pre>" . PHP_EOL);
 
 /*
 Results for v1.0.0, PHP 7.4.4, ngrams-m.php
 
-tweets.txt - Correct ratio: 99.28% Time: 0.30713295936584
-
+tweets.txt - Correct ratio: 99.28% Time: 0.30713295936584
 big-test.txt - Correct ratio: 99.42% Time: 2.4928371906281
-
 sentences.txt - Correct ratio: 98.78% Time: 2.1568570137024
-
 word-pairs.txt - Correct ratio: 87.56% Time: 0.66023302078247
-
 single-words.txt - Correct ratio: 73.31% Time: 0.47791314125061
 
     If correct ratio is inferior, use $eld = new languageDetector('ngrams-m.safe.php'); to see if it fixes the problem.
 
-*/
+*/
@@ -31,5 +31,8 @@
   "require": {
     "php": "^7.3 || ^8.0",
     "ext-mbstring": "*"
+  },
+  "require-dev": {
+    "squizlabs/php_codesniffer": "3.*"
   }
 }
@@ -23,45 +23,58 @@
 
 $eld = new Nitotm\Eld\LanguageDetector;
 
-// detect() expects a UTF-8 string, and returns an array, with a value named 'language', which will be either an ISO 639-1 code or false
+// detect() expects a UTF-8 string, returns an array, with a value (ISO 639-1 code or false) named 'language'
 var_dump($eld->detect('Hola, cómo te llamas?'));
-	// ['language' => 'es'];
-	// ['language' => false, 'error' => 'Some error', 'scores'=>[]]; 
+// ['language' => 'es'];
+// ['language' => false, 'error' => 'Some error', 'scores'=>[]];
 
 
 // To get the best guess, turn off minimum length, confidence threshold; also used for benchmarking.
 var_dump($eld->detect('To', false, false, 0, 1));
 
 /*
-To improve readability moving forward, PHP8 Named Parameters can be used
-print_r($eld->detect(text: 'To', cleanText: false, checkConfidence: false, minByteLength: 12, minNgrams: 3));
-cleanText: true, Removes Urls, domains, emails, alphanumerical & numbers
+ To improve readability moving forward, PHP8 Named Parameters can be used
+ print_r($eld->detect(text: 'To', cleanText: false, checkConfidence: false, minByteLength: 12, minNgrams: 3));
+ cleanText: true, Removes Urls, domains, emails, alphanumerical & numbers
 */
 
 // To retrieve the whole list of languages detected and their score, we will set $returnScores to True, just once
 $eld->returnScores = true;
 var_dump($eld->detect('How are you? Bien, gracias'));
-	// ['language' => 'en', 'scores' => ['en' => 0.32, 'es' => 0.31, ...]];
+// ['language' => 'en', 'scores' => ['en' => 0.32, 'es' => 0.31, ...]];
 
 /*
-	To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
+ To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
 
-	This is the complete list on languages for ELD v1, using ISO 639-1 codes:
-	['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu', 'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl', 'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yo', 'zh']
+ This is the complete list of languages for ELD v1, using ISO 639-1 codes:
+ ['am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gu',
+ 'he', 'hi', 'hr', 'hu', 'hy', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'ku', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'nl',
+ 'no', 'or', 'pa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur',
+ 'vi', 'yo', 'zh']
 */
-$langSubset = ['en','es','fr','it','nl','de'];
+$langSubset = ['en', 'es', 'fr', 'it', 'nl', 'de'];
 
 // dynamicLangSubset() Will execute the detector normally, but at the end will filter the excluded languages.
 $eld->dynamicLangSubset($langSubset);
 // to remove the subset
 $eld->dynamicLangSubset(false);
 
-// langSubset($langs, save: true, safe: false) Will previously remove the excluded languages form the Ngrams database; for a single detection might be slower than dynamicLangSubset(), but for several strings will be faster. If $save option is true (default), the new ngrams subset will be stored, and next loaded for the same language subset, increasing startup speed. Use $safe=true to store Ngram bytes hex encoded.
+/*
+ langSubset($langs, save: true, safe: false) first removes the excluded languages from the Ngrams database;
+ for a single detection it might be slower than dynamicLangSubset(), but for several strings it will be faster.
+ If the $save option is true (the default), the new ngrams subset is stored and then loaded directly the next time
+ the same language subset is requested, increasing startup speed. Use $safe=true to store Ngram bytes hex encoded.
+*/
 $eld->langSubset($langSubset); // returns subset file name if saved
 // to remove the subset
 $eld->langSubset(false);
 
-// Finally the fastest option to regularly use the same language subset, will be to add as an argument the file stored (and returned) by langSubset(), when creating an instance of the class. In this case the subset Ngrams database will be loaded directly, and not the default database. Also, you can use this option to load different ngram databases stored at src/ngrams/
+/*
+ Finally the fastest option to regularly use the same language subset, will be to add as an argument the file stored
+ (and returned) by langSubset(), when creating an instance of the class. In this case the subset Ngrams database will
+ be loaded directly, and not the default database. Also, you can use this option to load different ngram databases
+ stored at src/ngrams/
+ */
 $elds = new Nitotm\Eld\LanguageDetector('ngrams.2f37045c74780aba1d36d6717f3244dc025fb935.php');
 
 print (PHP_SAPI === 'cli' ? '' : "</pre>");
@@ -15,8 +15,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-
 // This file matches the trained data of the Ngrams database, for new trained databases this file has to be updated.
+declare(strict_types=1);
 
 namespace Nitotm\Eld;
 
@@ -26,7 +26,7 @@ class LanguageData extends LanguageSubset
 {
     protected $ngrams;
 
-    // ISO 639-1 codes 
+    // ISO 639-1 codes
     protected $langCodes
         = [
             'am', 'ar', 'az', 'be', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
@@ -35,11 +35,22 @@ class LanguageData extends LanguageSubset
             'tr', 'uk', 'ur', 'vi', 'yo', 'zh'
         ];
 
-    //  ['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish', 'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati', 'Hebrew', 'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian', 'Kannada', 'Korean', 'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)', 'Dutch', 'Norwegian', 'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene', 'Albanian', 'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian', 'Urdu', 'Vietnamese', 'Yoruba', 'Chinese'];
-
-    // Deprecated for now. Some languages score higher with the same amount of text, this multiplier evens it out for multi-language strings
-    //protected $scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1, 0.9, 1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6, 0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7];
+    /*
+      ['Amharic', 'Arabic', 'Azerbaijani (Latin)', 'Belarusian', 'Bulgarian', 'Bengali', 'Catalan', 'Czech', 'Danish',
+      'German', 'Greek', 'English', 'Spanish', 'Estonian', 'Basque', 'Persian', 'Finnish', 'French', 'Gujarati',
+      'Hebrew', 'Hindi', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Italian', 'Japanese', 'Georgian',
+      'Kannada', 'Korean', 'Kurdish (Arabic)', 'Lao', 'Lithuanian', 'Latvian', 'Malayalam', 'Marathi', 'Malay (Latin)',
+      'Dutch', 'Norwegian', 'Oriya', 'Punjabi', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak', 'Slovene',
+      'Albanian', 'Serbian (Cyrillic)', 'Swedish', 'Tamil', 'Telugu', 'Thai', 'Tagalog', 'Turkish', 'Ukrainian',
+      'Urdu', 'Vietnamese', 'Yoruba', 'Chinese'];
+     */
 
+    /* Deprecated for now.
+      Some languages score higher with the same amount of text, this multiplier evens it out for multi-language strings
+      protected $scoreNormalizer = [0.7, 1, 1, 1, 1, 0.6, 0.98, 1, 1, 1, 0.9, 1, 1, 1, 1, 1, 1, 1, 0.6, 1, 0.7, 1, 1,
+      0.9, 1, 1, 0.8, 0.6, 0.6, 1, 1, 0.5, 1, 1, 0.6, 0.7, 1, 0.95, 1, 0.6, 0.6, 1, 1, 1, 1, 1, 1, 0.9, 1, 1, 0.6, 0.6,
+      0.7, 0.9, 1, 1, 1, 0.8, 1, 1.7];
+     */
     protected $langScore;
     protected $avgScore
         = [
@@ -50,10 +61,10 @@ class LanguageData extends LanguageSubset
             0.0882, 0.0368, 0.0258, 0.0206, 0.0282, 0.0467, 0.0329, 0.0152
         ];
 
-    function __construct(string $ngramsFile = 'ngrams-m.php')
+    public function __construct(?string $ngramsFile = null)
     {
         // Opcache needs to be active, so the load of the database array does not add overhead.
-        require __DIR__ . '/ngrams/' . $ngramsFile;
+        require __DIR__ . '/ngrams/' . ($ngramsFile ?? "ngrams-m.php");
         // Internal reference: _ngrams_newAddEnd4gramExtra_1-4_2824 + _ngrams_charUtf8_1-1_2291
         $this->langScore = array_fill(0, count($this->langCodes), 0);
     }
 
@@ -15,30 +15,32 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-
-/* 
-To reduce the languages to be detected, there are 3 different options, they only need to be executed once.
-
-The fastest option to regularly use the same language subset, will be to add as an argument the file stored (and returned) by langSubset(), when creating an instance of the languageDetector class. In this case the subset ngrams database will be loaded directly, and not the default database. Also, you can use this option to load different ngram databases.
-*/
+declare(strict_types=1);
 
 namespace Nitotm\Eld;
 
 class LanguageSubset
 {
     protected $subset = false;
     protected $loadedSubset = false;
+    protected $ngrams = [];
+    protected $langCodes = [];
     private $defaultNgrams = false;
 
-    // dynamicLangSubset() Will execute the detector normally, but at the end it will filter the excluded languages.
+    /**
+     * When active, detect() will filter the languages not included in $subset from the scores, with filterLangSubset()
+     *
+     * @param array|bool $langs
+     * @return array|false
+     */
     public function dynamicLangSubset($langs)
     {
         if ($langs) {
             $this->subset = [];
-            foreach ($langs as $value) {
-                $lang = array_search($value, $this->langCodes);
-                if ($lang !== false) {
-                    $this->subset[] = $lang;
+            foreach ($langs as $lang) {
+                $foundLang = array_search($lang, $this->langCodes, true);
+                if ($foundLang !== false) {
+                    $this->subset[] = $foundLang;
                 }
             }
             sort($this->subset);
@@ -49,8 +51,15 @@ public function dynamicLangSubset($langs)
         return $this->subset;
     }
 
-    // langSubset($langs,$save=true) Will previously remove the excluded languages form the ngrams database; for a single detection might be slower than dynamicLangSubset(), but for multiple strings will be faster. if $save option is true (default), the new ngrams subset will be stored, and next loaded for the same language subset, increasing startup speed.
-    public function langSubset($langs, $save = true, $safe = false)
+
+    /**
+     * Removes the excluded languages from the ngrams database
+     * if $save option is true, the new ngrams subset will be stored, and next loaded for the same language subset
+     *
+     * @param array|bool $langs
+     * @return string|true
+     */
+    public function langSubset($langs, bool $save = true, bool $safe = false)
     {
         if (!$langs) {
             if ($this->loadedSubset) {
@@ -88,7 +97,7 @@ public function langSubset($langs, $save = true, $safe = false)
 
             foreach ($this->ngrams as $ngram => $langsID) {
                 foreach ($langsID as $id => $value) {
-                    if (!in_array($id, $langs_array)) {
+                    if (!in_array($id, $langs_array, true)) {
                         unset($this->ngrams[$ngram][$id]);
                     }
                 }
@@ -100,9 +109,10 @@ public function langSubset($langs, $save = true, $safe = false)
 
         if ($save) {
             if (!file_exists($file_name)) { // in case $this->loadedSubset !== $new_subset, and was previously saved
-                file_put_contents($file_name,
+                file_put_contents(
+                    $file_name,
                     '<?php' . "\r\n" . '// Do not edit unless you ensure you are using UTF-8 encoding' . "\r\n"
-                    . '$this->ngrams=' . $this->ngram_export($this->ngrams, $safe) . ';'
+                    . '$this->ngrams=' . $this->ngramExport($this->ngrams, $safe) . ';'
                 );
             }
 
@@ -112,31 +122,35 @@ public function langSubset($langs, $save = true, $safe = false)
         return true;
     }
 
-    protected function filterLangSubset($results)
+    /**
+     * Filters languages not included in the subset, from the results scores
+     */
+    protected function filterLangSubset(array $results): array
     {
-        foreach ($results as $key => $value) {
-            if (!in_array($key, $this->subset)) {
-                unset($results[$key]);
+        foreach ($results as $langID => $score) {
+            if (!in_array($langID, $this->subset, true)) {
+                unset($results[$langID]);
             }
         }
 
         return $results;
     }
 
-    protected function ngram_export($var, $safe = false)
+    /**
+     * @param array|int $data
+     */
+    protected function ngramExport($data, bool $safe = false): ?string
     {
-        if (is_array($var)) {
+        if (is_array($data)) {
             $toImplode = array();
-            foreach ($var as $key => $value) {
+            foreach ($data as $key => $value) {
                 $toImplode[] = ($safe === true ? '"\\x' . substr(chunk_split(bin2hex($key), 2, '\\x'), 0, -2) . '"'
-                        : var_export($key, true)) . '=>' . $this->ngram_export($value);
+                        : var_export($key, true)) . '=>' . $this->ngramExport($value);
             }
 
             return '[' . implode(',', $toImplode) . ']';
-        } else {
-            return var_export($var, true);
         }
-    }
-
 
+        return var_export($data, true);
+    }
 }
Original file line number	Diff line number	Diff line change
`@@ -31,5 +31,8 @@`
`31`	`31`	`"require": {`
`32`	`32`	`"php": "^7.3 \|\| ^8.0",`
`33`	`33`	`"ext-mbstring": "*"`
	`34`	`+ },`
	`35`	`+ "require-dev": {`
	`36`	`+ "squizlabs/php_codesniffer": "3.*"`
`34`	`37`	`}`
`35`	`38`	`}`