value(frequency, int)... key(ngram, string)
$ng_frequency = array_count_values($array_ngram);
//sort array by value(frequency) desc
arsort($ng_frequency);
//use only top frequent ngrams
$most_frequent = array_slice($ng_frequency, 0, $ng_number);
$sub_ng = array();
foreach ($most_frequent as $ng => $number_frequencey){
$sub_ng[] = $ng;
}
return $sub_ng;
}
/**
 * Score candidate languages by comparing the n-grams of a text with each language model.
 *
 * For every language, each n-gram of the text adds to a distance: the absolute
 * difference between its index in $sub_ng and its index in the language model
 * when the model contains it, or a flat penalty of 400 when it does not.
 * Scoring of a language stops as soon as its distance exceeds $max_delta, and
 * only languages whose distance is below ($max_delta - 400) are kept.
 *
 * @param array $sub_ng
 *   N-grams found in the text; the array index of each entry is used as its rank
 * @param array $lm_ng
 *   Candidate models: language identifier => list of n-grams, index used as rank
 * @param int $max_delta
 *   Distance above which a language is abandoned
 * @return array|string
 *   Map language identifier => distance, sorted by ascending distance,
 *   or an empty string when no language qualifies
 */
function compareNGrams($sub_ng, $lm_ng, $max_delta = 140000){
	$result = array();
	foreach ($lm_ng as $lm_basename => $language){
		// A model that is not an array (e.g. never loaded) cannot be scored
		if (!is_array($language)){
			continue;
		}
		// Index the model once (ngram => first rank) so that every lookup is an
		// exact key match instead of two loose linear scans per n-gram
		$ranks = array();
		foreach ($language as $rank => $ngram){
			if (!isset($ranks[$ngram])){
				$ranks[$ngram] = $rank;
			}
		}
		$delta = 0;
		// Compare each n-gram of the input text with the current model
		foreach ($sub_ng as $key => $existing_ngram){
			if (isset($ranks[$existing_ngram])){
				// Match: penalise by the rank displacement
				$delta += abs($key-$ranks[$existing_ngram]);
			} else {
				// No match: flat penalty
				$delta += 400;
			}
			// Abort: this language already differs too much
			if ($delta>$max_delta){
				break;
			}
		}
		// Keep only languages that were not aborted
		if ($delta<($max_delta-400)){
			$result[$lm_basename] = $delta;
		}
	}
	// Callers test the result with is_array(), so "nothing found" stays an empty string
	if (!$result){
		return '';
	}
	asort($result);
	return $result;
}
/**
 * Return the proportion of a text that lies outside a given UTF-8 character range.
 *
 * @param string $texte
 *   Text to inspect, expected to be UTF-8
 * @param string $plage
 *   PCRE fragment describing the characters looked for; it is wrapped in "/"
 *   delimiters and applied with the "u" and "i" modifiers
 * @return float
 *   Number of characters left after removing every match, divided by the total
 *   number of characters. 1.0 is returned when the text is empty or when the
 *   pattern cannot be applied (for instance on invalid UTF-8).
 */
function tester_plage_utf($texte, $plage){
	$texte = (string)$texte;
	$total = mb_strlen($texte, "UTF-8");
	// Nothing to measure: report "entirely outside the range" rather than dividing by zero
	if ($total<=0){
		return 1.0;
	}
	$reste = preg_replace("/" . $plage . "/ui", "", $texte);
	// preg_replace() yields null on failure; counting that as an empty remainder
	// would wrongly report the whole text as being inside the range
	if ($reste===null){
		return 1.0;
	}
	return mb_strlen($reste, "UTF-8")/$total;
}
/**
 * Narrow down the set of possible languages from the UTF-8 ranges used in the text.
 *
 * The ranges are tried in a fixed order and the first one that passes wins:
 * a range passes when the share of the text lying outside of it, as reported
 * by tester_plage_utf(), is below the threshold attached to that range.
 *
 * @param string $texte
 * @return array|bool|string
 *   A list of language codes when several languages share the script,
 *   a single language code when the script identifies the language,
 *   false when no range passes or the text is empty
 */
function detecter_plages_utf($texte){
	// An empty text cannot be attributed to any script
	if ((string)$texte===''){
		return false;
	}

	// Ordered table: pattern of the range, maximum share of characters allowed
	// outside of it, value returned when the range passes
	$plages = array(
		// Latin
		array(
			"[\x{0041}-\x{024F}\x{1E00}-\x{1EFF}]",
			0.5,
			array("fr", "en", "de", "it", "es", 'af', 'br', 'ca', 'ceb', "da", "fi", "nl", "nr", "pt", "pt_BR", "pt_PT", "sk", "ha", "haw", "hr", "pl", "cs", "az", "cy", "et", "ro", "eu", "hu", "id", "is", "la", "lt", "lv", "nb", "nso", "sl", "so", "sq", "ss", "st", "sv", "sw", "tl", "tlh", "tn", "tr", "ts", "ve", "xh", "zu")
		),
		// Cyrillic
		array("[\x{0400}-\x{0523}]", 0.5, array("ru", "bg", "kk", "uk", "ky", "mn", "uz", "mk", "sr")),
		// Greek/Coptic
		array("[\x{0370}-\x{03ff}\x{1F00}-\x{1FFE}]", 0.5, "el"),
		// Armenian
		array("[\x{0530}-\x{058A}]", 0.5, "hy"),
		// Hebrew
		array("[\x{0590}-\x{05F4}]", 0.5, "he"),
		// Arabic, Farsi, Pashto, Urdu
		array("[\x{0600}-\x{077F}]", 0.5, array("ar", "fa", "ps", "ur")),
		// Japanese (hiragana / katakana)
		array("[\x{3040}-\x{30FF}]", 0.7, "ja"),
		// Chinese; the range is written with a plain hyphen: an en dash inside
		// the class would be matched literally instead of forming a range
		array("[\x{4E00}-\x{9FBF}]", 0.7, "zh"),
		// Thai
		array("[\x{0E00}-\x{0E5B}]", 0.5, "th"),
		// Hangul - Korean
		array("[\x{1100}-\x{11F8}\x{3130}-\x{318E}\x{AC00}-\x{D7A3}]", 0.5, "ko"),
	);

	foreach ($plages as $plage){
		list($motif, $seuil, $langues) = $plage;
		if (tester_plage_utf($texte, $motif)<$seuil){
			return $langues;
		}
	}

	return false;
}
/**
 * Detect the language of a text.
 *
 * First pass: find out which script the text is written in, which limits the
 * number of possible answers (for example, "Arabic" characters can only be
 * Arabic, Farsi, Pashto or Urdu).
 * Second pass: run the n-gram comparison, restricted to the languages kept by
 * the first pass.
 *
 * The n-gram models are read from $GLOBALS["ngrams"], indexed by language code.
 *
 * @param string $texte
 * @return bool|string
 *   Language code, or false when the language cannot be determined
 */
function _detecter_langue($texte){
	$texte = strip_tags($texte);
	// Typographic apostrophes become plain ones, double quotes become spaces
	$texte = str_replace(array("’", "\""), array("'", " "), $texte);

	// Without the mb_ functions, or with too short a text, detection is impossible
	if (!function_exists('mb_strlen')
		or !function_exists('mb_substr')
		or mb_strlen($texte, "UTF-8")<6){
		return false;
	}

	$possibles = detecter_plages_utf($texte);
	if (!$possibles){
		return false;
	}
	// The script alone identifies the language
	if (!is_array($possibles)){
		return $possibles;
	}

	// Keep only the candidates whose n-gram model is actually loaded; reading a
	// missing entry would raise a notice and hand an unusable model to the comparison
	$ngrams = array();
	foreach ($possibles as $lang){
		if (isset($GLOBALS["ngrams"][$lang]) and is_array($GLOBALS["ngrams"][$lang])){
			$ngrams[$lang] = $GLOBALS["ngrams"][$lang];
		}
	}
	if (!$ngrams){
		return false;
	}

	$result_array = compareNGrams(createNGrams($texte), $ngrams, 140000);

	// The result is sorted by ascending distance: the first non-empty key is the best match
	if (is_array($result_array)){
		foreach ($result_array as $lang => $score){
			if ($lang){
				return $lang;
			}
		}
	}

	return false;
}