Rmm 分词算法代码片段
<?php
/**
 * Segment the current source string, walking its fragments from last to first
 * (reverse maximum matching, RMM).
 *
 * The source text is passed through ReviseString(), split on single spaces,
 * and each fragment is prepended to $this->ResultString, normally followed by
 * $this->SplitChar. Fragments longer than $this->SplitLen bytes are delegated
 * to RunRMM().
 *
 * Pattern matching uses PCRE (preg_*) because the POSIX ereg()/ereg_replace()
 * functions were removed in PHP 7.0. No /u modifier is used, so matching
 * stays byte-oriented.
 *
 * @param string $str Optional text to segment; when non-empty it is trimmed
 *                    and handed to SetSource() before processing.
 * @return string $this->ResultString after processing, or "" when
 *                $this->SourceString is empty.
 */
function SplitRMM($str = "") {
    if ($str != "") $this->SetSource(trim($str));
    if ($this->SourceString == "") return "";

    // Coarse pre-segmentation of the text.
    $this->SourceString = $this->ReviseString($this->SourceString);

    // Break the text into space-separated fragments.
    $spwords = explode(" ", $this->SourceString);
    $spLen = count($spwords);
    $spc = $this->SplitChar;

    // Loop-invariant PCRE patterns built from the configured regex fragments.
    // "#" is the delimiter, so a literal "#" inside a fragment is escaped.
    // The D modifier makes "$" match only at the very end of the subject.
    // NOTE(review): if CommonUnit / EspecialChar are bare alternations
    // ("a|b|c"), the "^" / "$" anchor binds only to the first / last
    // alternative, exactly as in the original ereg() calls - confirm intent.
    $unitPattern = '#^' . str_replace('#', '\#', $this->CommonUnit) . '#';
    $tailPattern = '#' . str_replace('#', '\#', $this->EspecialChar) . '$#D';

    for ($i = ($spLen - 1); $i >= 0; $i--) {
        $word = $spwords[$i];
        if (trim($word) == "") continue;

        if ($this->NotGBK($word)) {
            if (preg_match('/[^0-9.+-]/', $word)) {
                // Contains something other than digits, ".", "+" or "-":
                // emit it as its own token.
                $this->ResultString = $word . $spc . $this->ResultString;
            } else {
                // Only digits/sign/dot: look at the token that currently
                // leads the result (text before its first space).
                // NOTE(review): this searches for a literal " " rather than
                // $this->SplitChar - confirm the two are the same.
                $spacePos = strpos($this->ResultString, " ");
                $nextword = ($spacePos === false) ? "" : substr($this->ResultString, 0, $spacePos);
                if (preg_match($unitPattern, $nextword)) {
                    // Next token starts with a common unit: join without a separator.
                    $this->ResultString = $word . $this->ResultString;
                } else {
                    $this->ResultString = $word . $spc . $this->ResultString;
                }
            }
            continue;
        }

        // First two bytes of the fragment and their value as one 16-bit code.
        // substr() avoids an out-of-range offset read on a one-byte fragment.
        $c = substr($word, 0, 2);
        $n = hexdec(bin2hex($c));

        if ($c == "《" || ($n > 0xA13F && $n < 0xAA40)) {
            // Book-title mark, or a code in the range the original comments
            // label as punctuation: emit the fragment unchanged.
            $this->ResultString = $word . $spc . $this->ResultString;
        } else if (strlen($word) <= $this->SplitLen) {
            // Ordinary short phrase. If it matches the special-split pattern,
            // strip the matched text from the end and append it as a
            // separate token. The match is quoted so it is treated literally.
            if (preg_match($tailPattern, $word, $regs)) {
                $word = preg_replace('#' . preg_quote($regs[0], '#') . '$#D', "", $word) . $spc . $regs[0];
            }
            // Does the fragment start with a common unit?
            if (!preg_match($unitPattern, $word) || $i == 0) {
                $this->ResultString = $word . $spc . $this->ResultString;
            } else {
                // Merge with the preceding fragment and skip that fragment
                // on the next iteration.
                $this->ResultString = $spwords[$i - 1] . $word . $spc . $this->ResultString;
                $i--;
            }
        } else {
            // Longer than SplitLen bytes: delegate to RunRMM().
            $this->ResultString = $this->RunRMM($word) . $spc . $this->ResultString;
        }
    }
    return $this->ResultString;
}
// Source article: http://www.phprm.com/code/ddf7b919924cc013f8c5bf8ec661c94b.html
转载随意,但请附上文章地址:-)