Skip to content

Commit a2e1c7b

Browse files
author
Nito
committed
Improved $start array definition. Code reformat.
1 parent 51ac98f commit a2e1c7b

File tree

1 file changed

+18
-14
lines changed

1 file changed

+18
-14
lines changed

src/languageDetector.php

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,18 @@
1818

1919
namespace Nitotm\Eld;
2020

21-
require_once __DIR__.'/LanguageData.php';
21+
require_once __DIR__ . '/LanguageData.php';
2222

2323
class LanguageDetector extends LanguageData
2424
{
2525
public $returnScores = false;
26+
protected $wordStart;
27+
28+
public function __construct()
29+
{
30+
parent::__construct();
31+
$this->wordStart = [' '] + array_fill(1, 70, '');
32+
}
2633

2734
protected function tokenizer($str)
2835
{
@@ -57,13 +64,10 @@ protected function getScores($array)
5764

5865
protected function getByteNgrams($str)
5966
{
60-
$str = mb_strtolower($str, 'UTF-8');
61-
$tokens = [];
67+
$str = mb_strtolower($str, 'UTF-8');
68+
$tokens = [];
6269
$countNgrams = 0;
63-
// Word start. Local declaration improves speed. Much faster than ($j==0 ? ' ' : '')
64-
$start = [' ','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','',
65-
'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','',
66-
'','',''];
70+
$start = $this->wordStart;
6771

6872
foreach ($this->tokenizer($str) as $word) {
6973
$len = strlen($word);
@@ -72,9 +76,9 @@ protected function getByteNgrams($str)
7276
}
7377

7478
for ($j = 0; ($j + 4) < $len; $j += 3, ++$tmp, ++$countNgrams) {
75-
$tmp = &$tokens[$start[$j].substr($word, $j, 4)];
79+
$tmp = &$tokens[$start[$j] . substr($word, $j, 4)];
7680
}
77-
$tmp = &$tokens[$start[$j].substr($word, ($len != 3 ? $len - 4 : 0)).' '];
81+
$tmp = &$tokens[$start[$j] . substr($word, ($len != 3 ? $len - 4 : 0)) . ' '];
7882
$tmp++;
7983
$countNgrams++;
8084
}
@@ -91,7 +95,7 @@ protected function getByteNgrams($str)
9195
protected function calcScores($txtNgrams, $numNgrams)
9296
{
9397
$langScore = $this->langScore;
94-
$results = [];
98+
$results = [];
9599

96100
foreach ($txtNgrams as $bytes => $frequency) {
97101
if (isset($this->ngrams[$bytes])) {
@@ -140,7 +144,7 @@ public function detect($text, $cleanText = false, $checkConfidence = false, $min
140144
}
141145
$minNgrams = ($minNgrams > 0 ? $minNgrams : 1);
142146
// Normalize special characters/word separators
143-
$text = trim(preg_replace('/[^\pL]+(?<![\x27\x60\x{2019}])/u', ' ', mb_substr($text, 0, 1000, 'UTF-8')));
147+
$text = trim(preg_replace('/[^\pL]+(?<![\x27\x60\x{2019}])/u', ' ', mb_substr($text, 0, 1000, 'UTF-8')));
144148
$thisLength = strlen($text);
145149

146150
if ($thisLength > 350) {
@@ -170,13 +174,13 @@ public function detect($text, $cleanText = false, $checkConfidence = false, $min
170174
|| 0.01 > abs($results[$top_lang] - next($results))) {
171175
return [
172176
'language' => false,
173-
'error' => 'No language has been identified with sufficient confidence, set checkConfidence to false to avoid this error',
174-
'scores' => []
177+
'error' => 'No language has been identified with sufficient confidence, set checkConfidence to false to avoid this error',
178+
'scores' => []
175179
];
176180
}
177181
}
178182

179-
if ( ! $this->returnScores) {
183+
if (!$this->returnScores) {
180184
return ['language' => $this->langCodes[$top_lang]];
181185
} else {
182186
return ['language' => $this->langCodes[$top_lang], 'scores' => $this->getScores($results)];

0 commit comments

Comments
 (0)