龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

RMM分词算法类

时间:2014-07-22 14:48来源: 作者: 点击:
分享到:
RMM分词算法类
RMM分词算法类
//RMM分词算法
/**
 * Reverse-maximum-matching (RMM) word segmenter.
 *
 * All length arithmetic in this class is byte based: a byte >= 0x81 is taken
 * as the lead byte of a two-byte (GBK-style) character and is always consumed
 * together with the byte that follows it. Text in a multi-byte encoding with
 * a different layout (for example UTF-8) will not be segmented correctly.
 *
 * Typical use:
 *   $sw = new SplitWord();            // loads ppldic.csv next to this file
 *   echo $sw->SplitRMM($gbkText);
 */
class SplitWord{
    /** Dictionary: word => second column of the dictionary line. */
    public $TagDic = array();
    /** Dictionary: byte length => (word => third column of the dictionary line). */
    public $RankDic = array();
    /** Normalised text waiting to be segmented. */
    public $SourceStr = '';
    /** Output of the most recent SplitRMM() call. */
    public $ResultStr = '';
    /** Separator inserted between tokens. */
    public $SplitChar = ' ';
    /** Two-byte-character tokens of at most this many bytes are kept whole. */
    public $SplitLen = 4;
    /** Largest byte INDEX of a dictionary word (longest word is MaxLen + 1 bytes). */
    public $MaxLen = 7;
    /** Byte INDEX at or below which RunRMM() switches to its tail handling. */
    public $MinLen = 3;
    /**
     * Optional stream handle. This class never opens it; Clear() closes it
     * when a caller has attached one.
     */
    public $QuickDic = null;

    /**
     * PHP 4 style constructor, kept so legacy callers keep working.
     *
     * @param string|null $dicfile see __construct()
     */
    function SplitWord($dicfile = null){
        $this->__construct($dicfile);
    }

    /**
     * Loads the dictionary into memory so lookups during segmentation are fast.
     *
     * Every line is split on single spaces into: word, tag, rank. Missing
     * columns are stored as empty strings and the line terminator is stripped.
     * When the file cannot be read a warning is raised and the dictionaries
     * stay empty, in which case RunRMM() degrades to one token per character.
     *
     * @param string|null $dicfile dictionary path; defaults to ppldic.csv
     *                             in the directory of this file
     */
    function __construct($dicfile = null){
        if($dicfile === null){
            $dicfile = dirname(__FILE__)."/ppldic.csv";
        }
        $fp = is_readable($dicfile) ? fopen($dicfile,'r') : false;
        if($fp === false){
            trigger_error('SplitWord: cannot read dictionary file '.$dicfile, E_USER_WARNING);
            return;
        }
        // Compare against false explicitly: a line holding just "0" is falsy
        // and would otherwise end the load early.
        while(($line = fgets($fp)) !== false){
            $ws = explode(' ', rtrim($line, "\r\n"));
            $word = $ws[0];
            if($word === '') continue;
            $this->TagDic[$word] = isset($ws[1]) ? $ws[1] : '';
            $this->RankDic[strlen($word)][$word] = isset($ws[2]) ? $ws[2] : '';
        }
        fclose($fp);
    }

    /**
     * Releases resources. The dictionary file is already closed by the
     * constructor, so the only thing left to release is a QuickDic handle
     * attached by the caller.
     */
    function Clear(){
        if(is_resource($this->QuickDic)){
            fclose($this->QuickDic);
        }
        $this->QuickDic = null;
    }

    /**
     * Sets the text to segment (normalised through UpdateStr()) and clears
     * the previous result.
     *
     * @param string $str raw input text
     */
    function SetSource($str){
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    /**
     * Tells whether a token does NOT start with a two-byte character.
     * Only the first byte is inspected.
     *
     * @param string $str token to test
     * @return bool true when the first byte is <= 0x80; false when it is a
     *              lead byte or when $str is empty
     */
    function NotGBK($str)
    {
        if($str=="") return false;
        return ord($str[0]) <= 0x80;
    }

    /**
     * Segments text with the RMM algorithm.
     *
     * The normalised source is cut on spaces and walked from the last token
     * to the first. Tokens starting with a two-byte character and longer than
     * SplitLen bytes are segmented by RunRMM(); every other token (ASCII
     * words, numbers, short two-byte tokens) is kept unchanged. Each token is
     * followed by SplitChar in the result.
     *
     * Note: the cut is made on a literal space while UpdateStr() inserts
     * SplitChar, so tokens are only separated when SplitChar is a space.
     *
     * @param string $str text to segment; when empty, the text previously
     *                    given to SetSource() is used
     * @return string segmented text, or "" when there is nothing to segment
     */
    function SplitRMM($str=""){
        if($str!="") $this->SetSource($str);
        if($this->SourceStr=="") return "";
        $this->SourceStr = $this->UpdateStr($this->SourceStr);
        // Start from a clean result so repeated calls do not accumulate output.
        $this->ResultStr = "";
        $spc = $this->SplitChar;
        $spwords = explode(" ",$this->SourceStr);
        for($i=count($spwords)-1;$i>=0;$i--){
            $word = $spwords[$i];
            if($word=="") continue;
            if(!$this->NotGBK($word) && strlen($word) > $this->SplitLen){
                $word = $this->RunRMM($word);
            }
            $this->ResultStr = $word.$spc.$this->ResultStr;
        }
        return $this->ResultStr;
    }

    /**
     * Segments a run of two-byte characters by reverse maximum matching.
     *
     * Starting at the end of the string, the longest dictionary word (at
     * most MaxLen + 1 bytes) ending at the current position is taken, then
     * the position moves in front of it. When nothing matches, the last
     * character is emitted on its own so the scan always advances. Once at
     * most MinLen + 1 bytes remain, they are emitted as one word if the
     * dictionary knows them, otherwise character by character.
     *
     * @param string $str run of two-byte characters
     * @return string the words formatted by otherword()
     */
    function RunRMM($str){
        $WordArray = array();   // words in reverse order (last word first)
        for($i=strlen($str)-1;$i>=0;){
            if($i<=$this->MinLen){
                // Tail: only bytes 0..$i are left.
                $head = substr($str,0,$i+1);
                if($this->IsWord($head)){
                    $WordArray[] = $head;
                }else{
                    $end = strlen($head);
                    while($end>0){
                        $start = max(0,$end-2);
                        $WordArray[] = substr($head,$start,$end-$start);
                        $end = $start;
                    }
                }
                break;
            }
            $maxPos = min($i,$this->MaxLen);
            $isMatch = false;
            // Try the longest candidate first, shrinking by one character.
            for($j=$maxPos;$j>=0;$j-=2){
                $w = substr($str,$i-$j,$j+1);
                if($this->IsWord($w)){
                    $WordArray[] = $w;
                    $i = $i-$j-1;
                    $isMatch = true;
                    break;
                }
            }
            if(!$isMatch){
                // Unknown character: emit it alone; without this step $i
                // would never change and the loop would never end.
                $start = max(0,$i-1);
                $WordArray[] = substr($str,$start,$i-$start+1);
                $i = $start-1;
            }
        }
        return $this->otherword($WordArray);
    }

    /**
     * Formats words collected in reverse order into reading order.
     *
     * Each word is written as SplitChar + word + "、"; a leading SplitChar
     * of the whole result is then replaced by "、".
     *
     * @param array $WordArray words, last word of the text first
     * @return string formatted words
     */
    function otherword($WordArray){
        $spc = $this->SplitChar;
        $rsStr = "";
        for($i=count($WordArray)-1;$i>=0;$i--)
        {
            $rsStr .= $spc.$WordArray[$i]."、";
        }
        // Quote the separator so characters such as "/" or "|" cannot break
        // or alter the pattern.
        return preg_replace("/^".preg_quote($spc,"/")."/","、",$rsStr);
    }

    /**
     * Tells whether the dictionary contains a word.
     *
     * @param string $okWord candidate word
     * @return bool true when the word was loaded from the dictionary
     */
    function IsWord($okWord){
        $slen = strlen($okWord);
        // MaxLen is a byte index, so the longest word has MaxLen + 1 bytes.
        if($slen > $this->MaxLen + 1) return false;
        return isset($this->RankDic[$slen][$okWord]);
    }

    /**
     * Normalises text before segmentation by inserting SplitChar between
     * runs of different kinds: ASCII words, ASCII symbols, two-byte
     * punctuation and two-byte text.
     *
     * ASCII control characters and spaces collapse into one SplitChar,
     * except CR/LF which are removed without leaving a separator. The ASCII
     * characters 0-9 a-z A-Z @ . % # : & _ - are treated as word characters.
     * Two-byte codes strictly between 0xA13F and 0xAA40 are treated as
     * punctuation. A lead byte with no following byte is dropped.
     *
     * @param string $str raw text
     * @return string normalised text
     */
    function UpdateStr($str){
        $spc = $this->SplitChar;
        $slen = strlen($str);
        if($slen==0) return '';
        $okstr = '';
        $prechar = 0; // previous kind: 0 blank, 1 ASCII word, 2 two-byte text, 3 symbol
        for($i=0;$i<$slen;$i++){
            $ch = $str[$i];
            $code = ord($ch);
            if($code < 0x81){
                if($code < 33){
                    // Whitespace / control character.
                    if($prechar!=0 && $ch!="\r" && $ch!="\n") $okstr .= $spc;
                    $prechar = 0;
                }else if(preg_match('/[^0-9a-zA-Z@.%#:&_-]/',$ch)){
                    // ASCII symbol: always its own token.
                    $okstr .= ($prechar==0 ? '' : $spc).$ch;
                    $prechar = 3;
                }else{
                    // ASCII word character: separate from two-byte text and symbols.
                    $okstr .= (($prechar==2 || $prechar==3) ? $spc : '').$ch;
                    $prechar = 1;
                }
                continue;
            }
            // Lead byte of a two-byte character.
            if($prechar!=0 && $prechar!=2) $okstr .= $spc;
            if(isset($str[$i+1])){
                $c = $ch.$str[$i+1];
                $n = ($code << 8) | ord($str[$i+1]);
                if($n > 0xA13F && $n < 0xAA40){
                    // Two-byte punctuation: split it off the preceding text.
                    if($prechar==2) $okstr .= $spc;
                    $okstr .= $c;
                    $prechar = 3;
                }else{
                    $okstr .= $c;
                    $prechar = 2;
                }
                $i++;
            }
        }
        return $okstr;
    }
}

// Usage example: build the segmenter (this loads the dictionary), segment a
// sample string and print the result.
$split = new SplitWord();
$segmented = $split->SplitRMM("php搜索技术");

echo $segmented;

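// Note: each line of the ppldic.csv dictionary holds space-separated fields: word, ID number, then a third field (written "n" in the original note; its exact meaning should be confirmed). The constructor reads them as $ws[0], $ws[1] and $ws[2].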
精彩图集

赞助商链接