1818
1919namespace Nitotm \Eld ;
2020
21- require_once __DIR__ . '/LanguageData.php ' ;
21+ require_once __DIR__ . '/LanguageData.php ' ;
2222
2323class LanguageDetector extends LanguageData
2424{
2525 public $ returnScores = false ;
26+ protected $ wordStart ;
27+
/**
 * Initialise the parent language data and build the word-start prefix table.
 *
 * The table has keys 0..70: key 0 holds a single space and every other key
 * holds an empty string. getByteNgrams() indexes it with the byte offset of
 * an n-gram inside a word, so only the n-gram at offset 0 gets the leading
 * space prepended. Building it once here means it is not re-declared on
 * every getByteNgrams() call.
 */
public function __construct()
{
    parent::__construct();

    // Start with 71 empty prefixes (keys 0..70), then mark offset 0 as the word start.
    $prefixes = array_fill(0, 71, '');
    $prefixes[0] = ' ';

    $this->wordStart = $prefixes;
}
2633
2734 protected function tokenizer ($ str )
2835 {
@@ -57,13 +64,10 @@ protected function getScores($array)
5764
5865 protected function getByteNgrams ($ str )
5966 {
60- $ str = mb_strtolower ($ str , 'UTF-8 ' );
61- $ tokens = [];
67+ $ str = mb_strtolower ($ str , 'UTF-8 ' );
68+ $ tokens = [];
6269 $ countNgrams = 0 ;
63- // Word start. Local declaration improves speed. Much faster than ($j==0 ? ' ' : '')
64- $ start = [' ' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,
65- '' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,
66- '' ,'' ,'' ];
70+ $ start = $ this ->wordStart ;
6771
6872 foreach ($ this ->tokenizer ($ str ) as $ word ) {
6973 $ len = strlen ($ word );
@@ -72,9 +76,9 @@ protected function getByteNgrams($str)
7276 }
7377
7478 for ($ j = 0 ; ($ j + 4 ) < $ len ; $ j += 3 , ++$ tmp , ++$ countNgrams ) {
75- $ tmp = &$ tokens [$ start [$ j ]. substr ($ word , $ j , 4 )];
79+ $ tmp = &$ tokens [$ start [$ j ] . substr ($ word , $ j , 4 )];
7680 }
77- $ tmp = &$ tokens [$ start [$ j ]. substr ($ word , ($ len != 3 ? $ len - 4 : 0 )). ' ' ];
81+ $ tmp = &$ tokens [$ start [$ j ] . substr ($ word , ($ len != 3 ? $ len - 4 : 0 )) . ' ' ];
7882 $ tmp ++;
7983 $ countNgrams ++;
8084 }
@@ -91,7 +95,7 @@ protected function getByteNgrams($str)
9195 protected function calcScores ($ txtNgrams , $ numNgrams )
9296 {
9397 $ langScore = $ this ->langScore ;
94- $ results = [];
98+ $ results = [];
9599
96100 foreach ($ txtNgrams as $ bytes => $ frequency ) {
97101 if (isset ($ this ->ngrams [$ bytes ])) {
@@ -140,7 +144,7 @@ public function detect($text, $cleanText = false, $checkConfidence = false, $min
140144 }
141145 $ minNgrams = ($ minNgrams > 0 ? $ minNgrams : 1 );
142146 // Normalize special characters/word separators
143- $ text = trim (preg_replace ('/[^\pL]+(?<![\x27\x60\x{2019}])/u ' , ' ' , mb_substr ($ text , 0 , 1000 , 'UTF-8 ' )));
147+ $ text = trim (preg_replace ('/[^\pL]+(?<![\x27\x60\x{2019}])/u ' , ' ' , mb_substr ($ text , 0 , 1000 , 'UTF-8 ' )));
144148 $ thisLength = strlen ($ text );
145149
146150 if ($ thisLength > 350 ) {
@@ -170,13 +174,13 @@ public function detect($text, $cleanText = false, $checkConfidence = false, $min
170174 || 0.01 > abs ($ results [$ top_lang ] - next ($ results ))) {
171175 return [
172176 'language ' => false ,
173- 'error ' => 'No language has been identified with sufficient confidence, set checkConfidence to false to avoid this error ' ,
174- 'scores ' => []
177+ 'error ' => 'No language has been identified with sufficient confidence, set checkConfidence to false to avoid this error ' ,
178+ 'scores ' => []
175179 ];
176180 }
177181 }
178182
179- if ( ! $ this ->returnScores ) {
183+ if (! $ this ->returnScores ) {
180184 return ['language ' => $ this ->langCodes [$ top_lang ]];
181185 } else {
182186 return ['language ' => $ this ->langCodes [$ top_lang ], 'scores ' => $ this ->getScores ($ results )];
0 commit comments