Skip to content

Commit 4cf3d34

Browse files
author
Nito
committed
Rename
1 parent 442c718 commit 4cf3d34

File tree

1 file changed

+217
-0
lines changed

1 file changed

+217
-0
lines changed

src/LanguageDetector.php

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
<?php
2+
/*
3+
Copyright 2019 Nito T.M.
4+
Author URL: https://github.com/nitotm
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
// declare(strict_types = 1);
20+
21+
namespace Nitotm\Eld;
22+
23+
require_once __DIR__ . '/LanguageData.php';
24+
25+
/**
 * Detects the language of a UTF-8 text by scoring its byte n-grams against the n-gram
 * database exposed by the parent LanguageData class.
 */
class LanguageDetector extends LanguageData
{
    /**
     * When true, a successful detect() result also carries a 'scores' entry.
     *
     * @var bool
     */
    public $returnScores = false;

    /**
     * Prefix looked up by byte offset inside a word: ' ' at offset 0, '' for offsets 1..70.
     * getByteNgrams() prepends it so that only n-grams starting a word get a leading space.
     *
     * @var string[]
     */
    protected $wordStart;

    /**
     * @param string|null $ngramsFile Forwarded unchanged to the LanguageData constructor.
     */
    public function __construct(?string $ngramsFile = null)
    {
        parent::__construct($ngramsFile);
        $this->wordStart = [' '] + array_fill(1, 70, '');
    }

    /**
     * Splits a string into words on single spaces, discarding empty pieces.
     *
     * @return string[]
     */
    protected function tokenizer(string $str): array
    {
        return preg_split('/ /', $str, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Removes parts of a string that may be considered "noise" for language detection:
     * URLs, e-mail addresses, .com domains and alphanumerical/number codes.
     *
     * preg_replace() returns null on failure (e.g. invalid UTF-8 with the /u modifier); every step,
     * including the last one, falls back to an empty string so the declared string return type holds.
     */
    public function cleanTxt(string $str): string
    {
        // Remove URLS
        $str = preg_replace('@[hw]((ttps?://(www\.)?)|ww\.)([^\s/?.#-]+\.?)+(/\S*)?@i', ' ', $str);
        // Remove emails
        $str = preg_replace('/[a-zA-Z0-9.!$%&’+_`-]+@[A-Za-z0-9.-]+\.[A-Za-z0-9-]{2,64}/u', ' ', $str ?? '');
        // Remove .com domains
        $str = preg_replace('/([A-Za-z0-9-]+\.)+com(\/\S*|[^\pL])/u', ' ', $str ?? '');

        // Remove alphanumerical/number codes
        return preg_replace('/[a-zA-Z]*\d+[a-zA-Z0-9]*+/', ' ', $str ?? '') ?? '';
    }

    /**
     * Converts scores index keys to standard ISO 639-1 code.
     *
     * Iteration stops at the first zero score, so the input is expected to be sorted in
     * descending order (detect() calls arsort() before passing results here).
     *
     * @param array $results Scores keyed by the internal language index
     *
     * @return array Scores keyed by the code found in $this->langCodes
     */
    protected function isoScores(array $results): array
    {
        $scores = [];
        foreach ($results as $key => $score) {
            // Scores are floats; a strict comparison against int 0 would never match
            if ((float) $score === 0.0) {
                break;
            }
            $scores[$this->langCodes[$key]] = $score;
        }

        return $scores;
    }

    /**
     * Gets Ngrams from a given string.
     *
     * Each word yields up to 4-byte n-grams taken every 3 bytes, plus one final n-gram that ends in a
     * space. N-grams at the start of a word are prefixed with a space. Only the first 70 bytes of a
     * word drive the stepping loop.
     *
     * @return array N-gram bytes => relative frequency (occurrences / total n-grams * 13200)
     */
    protected function getByteNgrams(string $str): array
    {
        $str = mb_strtolower($str, 'UTF-8');
        $tokens = [];
        $countNgrams = 0;
        $start = $this->wordStart;

        foreach ($this->tokenizer($str) as $word) {
            $len = strlen($word);
            if ($len > 70) {
                $len = 70;
            }

            for ($j = 0; ($j + 4) < $len; $j += 3) {
                $ngram = $start[$j] . substr($word, $j, 4);
                $tokens[$ngram] = ($tokens[$ngram] ?? 0) + 1;
                ++$countNgrams;
            }

            // Word-ending n-gram. A 3 byte word is taken whole; an offset of -1 would keep only its last byte.
            // NOTE(review): substr() has no length limit here, so for words longer than 70 bytes it returns
            // everything from byte 66 up to the real end of the word - confirm this is intended.
            $ngram = $start[$j] . substr($word, ($len !== 3 ? $len - 4 : 0)) . ' ';
            $tokens[$ngram] = ($tokens[$ngram] ?? 0) + 1;
            ++$countNgrams;
        }

        // Frequency is multiplied by 15000 at the ngrams database. A reduced number seems to work better.
        // Linear formulas were tried, decreasing the multiplier for fewer ngram strings, no meaningful improvement.
        foreach ($tokens as $bytes => $count) {
            $tokens[$bytes] = $count / $countNgrams * 13200;
        }

        return $tokens;
    }

    /**
     * Calculate scores for each language from the given Ngrams.
     *
     * @param array $txtNgrams N-gram bytes => frequency, as returned by getByteNgrams()
     * @param int   $numNgrams Number of distinct n-grams, used to scale the final scores
     *
     * @return array Language index => score; languages whose score is empty are omitted
     */
    protected function calcScores(array $txtNgrams, int $numNgrams): array
    {
        // Working copy of the inherited accumulator (presumably one zero entry per language - verify in LanguageData)
        $langScore = $this->langScore;
        $results = [];

        foreach ($txtNgrams as $bytes => $frequency) {
            if (!isset($this->ngrams[$bytes])) {
                continue;
            }
            $numLangs = count($this->ngrams[$bytes]);
            // Ngram score multiplier, the fewer languages found the more relevancy. Formula can be fine-tuned.
            if ($numLangs === 1) {
                $relevancy = 27;
            } elseif ($numLangs < 16) {
                $relevancy = (16 - $numLangs) / 2 + 1;
            } else {
                $relevancy = 1;
            }
            // Most time-consuming loop, do only the strictly necessary inside
            foreach ($this->ngrams[$bytes] as $lang => $ngramFrequency) {
                $langScore[$lang] += ($frequency > $ngramFrequency ? $ngramFrequency / $frequency
                        : $frequency / $ngramFrequency) * $relevancy + 2;
            }
        }

        // This divisor will produce a final score between 0 - ~1, score could be >1. Can be improved.
        $resultDivisor = $numNgrams * 3.2;
        foreach ($langScore as $lang => $score) {
            if ($score) {
                $results[$lang] = $score / $resultDivisor;
            }
        }

        return $results;
    }

    /**
     * Returns the language detected for a given string, as an ISO 639-1 code or false
     * ['language' => 'en'];
     * ['language' => false, 'error' => 'Some error', 'scores'=>[]];
     * When returnScores = true;
     * ['language' => 'en', 'scores' => ['en' => 0.6, 'es' => 0.2]];
     *
     * @param string $text            Text to analyse; only its first 1000 characters are considered
     * @param bool   $cleanText       Run cleanTxt() on the text first
     * @param bool   $checkConfidence Reject results whose top score is too low or too close to the runner-up
     * @param int    $minByteLength   Minimum byte length of the normalized text
     * @param int    $minNgrams       Minimum number of distinct n-grams; values below 1 are treated as 1
     *
     * @return array 'language' (string|false), plus 'error' (string) and/or 'scores' (array) as shown above
     */
    public function detect(
        string $text,
        bool $cleanText = false,
        bool $checkConfidence = false,
        int $minByteLength = 12,
        int $minNgrams = 3
    ): array {
        // TODO return object, or not mutable return
        if ($cleanText) {
            // Removes Urls, emails, alphanumerical & numbers
            $text = $this->cleanTxt($text);
        }
        $minNgrams = ($minNgrams > 0 ? $minNgrams : 1); // faster than max()

        // Normalize special characters/word separators.
        // preg_replace() yields null on failure (e.g. invalid UTF-8); treat that as an empty text.
        $normalized = preg_replace('/[^\pL]+(?<![\x27\x60\x{2019}])/u', ' ', mb_substr($text, 0, 1000, 'UTF-8'));
        $text = trim($normalized ?? '');
        $thisLength = strlen($text);

        if ($thisLength > 350) {
            // Cut to first whitespace after 350 byte length offset
            $text = substr($text, 0, min(380, (strpos($text, ' ', 350) ?: 350)));
        } elseif ($thisLength < $minByteLength) {
            // Message spelling kept as is, callers may compare against it
            return ['language' => false, 'error' => 'Text to short', 'scores' => []];
        }

        $txtNgrams = $this->getByteNgrams($text);
        $numNgrams = count($txtNgrams);

        if ($numNgrams < $minNgrams) {
            return ['language' => false, 'error' => 'Not enough distinct ngrams', 'scores' => []];
        }

        $results = $this->calcScores($txtNgrams, $numNgrams);
        if ($this->subset) {
            $results = $this->filterLangSubset($results);
        }
        arsort($results);

        if (!$results) {
            return ['language' => false, 'error' => 'Language not detected', 'scores' => []];
        }

        $top_lang = key($results);

        if ($checkConfidence) {
            $topScore = $results[$top_lang];
            // next() returns false when only one language scored; count that as a runner-up score of 0
            $runnerUp = next($results);
            if ($runnerUp === false) {
                $runnerUp = 0;
            }
            // A minimum of a 24% per ngram score from average
            if ($this->avgScore[$top_lang] * 0.24 > ($topScore / $numNgrams)
                || 0.01 > abs($topScore - $runnerUp)) {
                return [
                    'language' => false,
                    'error' => 'No language has been identified with sufficient confidence, set checkConfidence to false to avoid this error',
                    'scores' => []
                ];
            }
        }

        if (!$this->returnScores) {
            return ['language' => $this->langCodes[$top_lang]];
        }

        return ['language' => $this->langCodes[$top_lang], 'scores' => $this->isoScores($results)];
    }
}

0 commit comments

Comments
 (0)