RMM(逆向最大匹配)分词算法代码片段
<?php
/**
 * Segment a text with a reverse (right-to-left) matching pass.
 *
 * This is a method body: it relies on `$this` and must live inside the
 * segmenter class that provides SetSource(), ReviseString(), NotGBK(),
 * RunRMM() and the properties SourceString, ResultString, SplitChar,
 * SplitLen, CommonUnit and EspecialChar.
 *
 * The source string is split on single spaces and the resulting tokens are
 * walked from last to first; every handled token is prepended to
 * $this->ResultString, followed by $this->SplitChar (except when a numeric
 * token is glued directly onto a following unit word).
 *
 * NOTE(review): ResultString is only ever prepended to here. Whether it is
 * reset between calls depends on SetSource(), which is not visible in this
 * snippet - confirm.
 *
 * @param string $str Optional text to segment. When non-empty it is trimmed
 *                    and passed to SetSource(); otherwise the current
 *                    SourceString is used.
 *
 * @return string "" when there is nothing to segment, otherwise
 *                $this->ResultString.
 */
function SplitRMM($str = "")
{
    if ($str != "") {
        $this->SetSource(trim($str));
    }
    if ($this->SourceString == "") {
        return "";
    }

    // Coarse pre-segmentation of the text; the result is expected to be
    // space-separated since it is exploded on " " right below.
    $this->SourceString = $this->ReviseString($this->SourceString);

    // Split the revised text into tokens that are handled one by one.
    $spwords = explode(" ", $this->SourceString);
    $spc = $this->SplitChar;

    // ereg()/ereg_replace() were removed in PHP 7, so the POSIX patterns are
    // rebuilt as PCRE ones. Matching stays byte-wise (no "u" modifier), as it
    // was with ereg(). The property value is wrapped in a non-capturing group
    // so the anchor applies to the whole expression: if the property is an
    // alternation ("a|b|c"), the original `"^" . $pattern` anchored only the
    // first alternative and `$pattern . "$"` only the last one.
    // The "D" modifier makes "$" match only at the very end of the subject,
    // which is how ereg() treated it.
    $unitPattern = '/^(?:' . str_replace('/', '\/', $this->CommonUnit) . ')/';
    $especialPattern = '/(?:' . str_replace('/', '\/', $this->EspecialChar) . ')$/D';

    for ($i = count($spwords) - 1; $i >= 0; $i--) {
        $word = $spwords[$i];
        if (trim($word) === "") {
            continue;
        }

        if ($this->NotGBK($word)) {
            if (preg_match('/[^0-9.+-]/', $word)) {
                // Contains something other than digits, ".", "+" or "-":
                // emit the token unchanged.
                $this->ResultString = $word . $spc . $this->ResultString;
                continue;
            }

            // Token made only of digits/sign/dot: look at the word that
            // currently leads the result (i.e. the word following this token
            // in the text) and glue the number onto it when it starts with a
            // common unit.
            // NOTE(review): the lookup uses a literal " " rather than
            // $this->SplitChar, as the original did - confirm that is
            // intended when SplitChar is not a space.
            $spacePos = strpos($this->ResultString, " ");
            $nextword = ($spacePos === false)
                ? ""
                : substr($this->ResultString, 0, $spacePos);

            if (preg_match($unitPattern, $nextword)) {
                $this->ResultString = $word . $this->ResultString;
            } else {
                $this->ResultString = $word . $spc . $this->ResultString;
            }
            continue;
        }

        // First two bytes of the token, read as one big-endian 16-bit code.
        // substr() is used instead of $word[0] . $word[1] so a one-byte token
        // does not raise an "uninitialized string offset" warning.
        $c = substr($word, 0, 2);
        $n = hexdec(bin2hex($c));

        // Book-title mark or a code in the punctuation range: emit as is.
        // (Both cases had identical bodies in the original and are merged.)
        if ($c === "《" || ($n > 0xA13F && $n < 0xAA40)) {
            $this->ResultString = $word . $spc . $this->ResultString;
            continue;
        }

        // Long phrase: delegate to the dictionary matcher.
        if (strlen($word) > $this->SplitLen) {
            $this->ResultString = $this->RunRMM($word) . $spc . $this->ResultString;
            continue;
        }

        // Short phrase. If it ends with a special split word, detach that
        // suffix. The match is anchored at the end, so it can be cut off by
        // length instead of re-using the matched text as an (unescaped)
        // regular expression.
        if (preg_match($especialPattern, $word, $regs)) {
            $word = substr($word, 0, strlen($word) - strlen($regs[0])) . $spc . $regs[0];
        }

        // A phrase that starts with a common unit is merged with the token
        // before it, which is then skipped by the extra decrement.
        if ($i == 0 || !preg_match($unitPattern, $word)) {
            $this->ResultString = $word . $spc . $this->ResultString;
        } else {
            $this->ResultString = $spwords[$i - 1] . $word . $spc . $this->ResultString;
            $i--;
        }
    }

    return $this->ResultString;
}
本文地址:http://www.phprm.com/code/ddf7b919924cc013f8c5bf8ec661c94b.html
转载随意,但请附上文章地址:-)