PHP获取可以用GBK编码的汉字拼音首字母
使用时需满足: 汉字转码为GBK时在GBK编码中存在。 目前可支持20902个汉字未考虑多音字情形20131224:修正一处错误
使用时需满足:
汉字转码为GBK时在GBK编码中存在。
目前可支持20902个汉字
未考虑多音字情形
20131224:修正一处错误
汉字转码为GBK时在GBK编码中存在。
目前可支持20902个汉字
未考虑多音字情形
20131224:修正一处错误
<?php /** * 中文拼音 * 目前仅支持可以以GBK编码的20902个汉字提取拼音首字母 */ class Spell{ //当前字符 protected $word=''; //资源 protected $source = array(); protected $pos = array(); //运行期间数据 protected $temp = array(); //数据暂存标识//在运行期间多次使用时有明显效果 protected $flag = true; //输入输出字符集 protected $charset = array('in'=>'gbk','out'=>'utf-8'); //例外处理//当没有拼音首字母other===false的情况下,返回原字串否则设置为该值 protected $other = '!';//other=false; public function __construct($flag=true){ $this->flag = $flag; //加载资源 $this->source['gk221']=file_get_contents(DATA_PATH.'word/gk2-2-1.txt'); $this->source['gk31']=file_get_contents(DATA_PATH.'word/gk3-1.txt'); $this->source['gk41']=file_get_contents(DATA_PATH.'word/gk4-1.txt'); $this->pos=json_decode(file_get_contents(DATA_PATH.'word/pos.txt'),true); } /** * 获得拼音首字母 * @param string/arr $data 数据,可以是字符串和数组 * @param string $in 数据的编码 * @param string $out 输出的编码 * @return arr 返回数组 */ public function getInitial($data,$in='gbk',$out='utf-8'){ if(is_string($data)){ return self::getInitialByStr($data,$in,$out); }elseif(is_array($data)){ return self::getInitialByArr($data,$in,$out); } } /** * 获得拼音首字母 * @param string $data 字符串数据 * @param string $in 数据的编码 * @param string $out 输出的编码 * @return arr 返回数组 */ public function getInitialByStr($str,$in='gbk',$out='utf-8'){ $this->charset['in'] = strtolower($in); $this->charset['out'] = strtolower($out); if($this->flag!=true){ $this->temp['fws']= array(); } switch ($this->charset['in']) { case 'gbk': return self::_getInitialInGBK($str); break; case 'utf-8': return self::_getInitialInUTF8($str); break; default: # code... break; } //历史数据 if($this->flag!=true){ unset($this->temp['fws']); } } /** * 获得拼音首字母 * @param array $data 数组数据 * @param string $in 数据的编码 * @param string $out 输出的编码 * @return arr 返回数组 */ public function getInitialByArr($arr,$in='gbk',$out='utf-8'){ $this->charset['in'] = strtolower($in); $this->charset['out'] = strtolower($out); if($this->flag!=true){ $this->temp['fws']= array(); } switch ($this->charset['in']) { case 'gbk': return self::_getInitialInGBKArr($arr); break; case 'utf-8': return self::_getInitialInUTF8Arr($arr); break; default: # code... break; } //历史数据 if($this->flag!=true){ unset($this->temp['fws']); } } /** * 处理gbk编码字符串的首字母 * @param string $str 字符串 * @return array 数组 */ protected function _getInitialInGBK($str){ //存放字符串拼音 $w = array(); $i = 0; $str_length = strlen($str); //字符串的字节数 while ($i<$str_length){ //得到字符串中第$i位字符的ASCII码 $ascnum = ord($str[$i]); if($ascnum >= 0x81){//gbk区域 $nstr = substr($str, $i, 2); $i = $i + 2; }else{ $nstr = substr($str, $i, 1); $i = $i + 1; } $this->word = iconv('gbk','utf-8',$nstr); if(isset($this->temp['fws'][$nstr])){ $w[] = $this->temp['fws'][$nstr]; }else{ $w[] = self::_preGetInitial($nstr); } } return $w; } /** * 处理gbk编码数组的首字母 * @param array $arr 字符串单字数组 * @return array 数组 */ protected function _getInitialInGBKArr($arr){ //存放字符串拼音 $w = array(); foreach ($arr as $key => $word) { $this->word = iconv('gbk','utf-8',$word); if(isset($this->temp['fws'][$word])){ $w[] = $this->temp['fws'][$word]; }else{ $w[] = self::_preGetInitial($word); } } return $w; } /** * 处理utf-8编码字符串的首字母 * @param string $str 字符串 * @return array 数组 */ protected function _getInitialInUTF8($str){ //存放字符串拼音 $w = array(); $nstr = ''; $i = 0; $str_length = strlen($str); //字符串的字节数 while ($i<$str_length){ $ascnum = ord($str[$i]); //得到字符串中第$i位字符的ASCII码 if ( $ascnum >= 252){//如果ASCII位高与252 $nstr = substr($str, $i, 6); //根据UTF-8编码规范,将6个连续的字符计为单个字符 $i = $i + 6; //实际Byte计为6 }elseif ( $ascnum >= 248 ){//如果ASCII位高与248 $nstr = substr($str, $i, 5); //根据UTF-8编码规范,将5个连续的字符计为单个字符 $i = $i + 5; //实际Byte计为5 }elseif ( $ascnum >= 240 ){//如果ASCII位高与240 $nstr = substr($str, $i, 4); //根据UTF-8编码规范,将4个连续的字符计为单个字符 $i = $i + 4; //实际Byte计为4 }elseif ( $ascnum >= 224 ){//如果ASCII位高与224 $nstr = substr($str, $i, 3); //根据UTF-8编码规范,将3个连续的字符计为单个字符 $i = $i + 3 ; //实际Byte计为3 }elseif ( $ascnum >= 192 ){//如果ASCII位高与192 $nstr = substr($str, $i, 2); //根据UTF-8编码规范,将2个连续的字符计为单个字符 $i = $i + 2; //实际Byte计为2 }else{//其他情况下,包括大写字母,小写字母和半角标点符号,%,&,@,m,w等 $nstr = substr($str, $i, 1); $i = $i + 1; //实际的Byte数计1个 } $this->word = $nstr; //编码转换至GBK $nstr = iconv('utf-8','gbk',$nstr); if(isset($this->temp['fws'][$nstr])){ $w[] = $this->temp['fws'][$nstr]; }else{ $w[] = self::_preGetInitial($nstr); } } return $w; } /** * 处理utf-8编码数组的首字母 * @param array $arr 字符串单字数组 * @return array 数组 */ protected function _getInitialInUTF8Arr($arr){ //存放字符串拼音 $w = array(); foreach ($arr as $key => $word) { $this->word = $word; $nword = iconv('utf-8','GBK',$word); if(isset($this->temp['fws'][$nword])){ $w[] = $this->temp['fws'][$nword]; }else{ $w[] = self::_preGetInitial($nword); } } return $w; } /** * 对单字预处理 * @param string $word 单字,gbk编码 * @return string 拼音首字母,编码视$this->charset['out'] */ protected function _preGetInitial($word){ $fw = self::_getInitial($word);//返回的utf-8编码数据首字母 if($fw!==false){ $nstr=$this->temp['fws'][$word]=iconv('utf-8',$this->charset['out'],$fw); }else{ $nstr=$this->temp['fws'][$word]=iconv('gbk',$this->charset['out'],$word); } return $nstr; } /** * 获得汉字拼音首字母的核心函数 * @param string $word 单字,gbk编码 * @return string 首字母,utf-8编码 */ protected function _getInitial($word){ $high = ord($word{0}); $low = ord($word{1}); //对20902个汉字支持拼音首字母提取 $hexc = $high * 256 + $low; //GBK/2:gb2312汉字表(拼音排序),低位a0开始 if($hexc >= 0xB0A1 and $hexc <= 0xD7F9 and $low>=0xA0){ //共3755个字 return self::_getInGBK21($hexc); } //GBK/2:gb2312汉字表,低位a0开始 if($hexc >= 0xD8A1 and $hexc <= 0xF7FE and $low>=0xA0){ //共3008个字 return self::_getInGBK('gk221'); } //GBK/3:扩充汉字表(UCS 代码大小排列) if($hexc >= 0x8140 and $hexc <= 0xA0FE){ //共6080个字 return self::_getInGBK('gk31'); } //GBK/4:扩充汉字表(按康熙字典的页码/字位排列) //低位00开始 if($hexc >= 0xAA40 and $hexc <= 0xFEA0){ //共8059个字 return self::_getInGBK('gk41'); } //都没有 return $this->other; } /** * 获取首字母 * GBK/2:gb2312汉字表(拼音序) * 共3755个字 * @param int $hexc 单字GBK编码值 * @return string 首字母,utf-8编码 */ protected function _getInGBK21($hexc){ //无i,u,v开始的拼音 $char = array("",//填充位置 "A","B","C","D","E","F", "G","H","J","K","L","M", "N","O","P","Q","R","S", "T","W","X","Y","Z" ); $hcs = array( 0xB0A1,0xb0c5,0xb2c1,0xb4ee,0xb6ea,0xb7a2, 0xb8c1,0xb9fe,0xbbf7,0xbfa6,0xc0ac,0xc2e8, 0xc4c3,0xc5b6,0xc5be,0xc6da,0xc8bb,0xc8f6, 0xcbfa,0xcdda,0xcef4,0xd1b9,0xd4d1 ); if($key=array_search($hexc,$hcs)){ return $char[$key]; }else{ $hcs[] = $hexc; sort($hcs); return $char[array_search($hexc,$hcs)]; } } /** * 获取首字母 * @param string $type 单字所属GBK区域类型 * @return string 首字母,utf-8编码 */ protected function _getInGBK($type){ //无i,u,v开始的拼音 $char = array("",//填充位置 "A","B","C","D","E","F", "G","H","J","K","L","M", "N","O","P","Q","R","S", "T","W","X","Y","Z" ); $str = str_replace("\r\n",'',$this->source[$type]); $p = stripos($str,$this->word)+3;//居右//stripos($str,$word),居左 $str = ''; if($key=array_search($p,$this->pos[$type])){ return $char[$key]; }else{ $pos = $this->pos[$type]; $pos[] = $p; sort($pos); return $char[array_search($p,$pos)]; } } } ?>
2. [代码]使用示例 跳至 [1] [2] [全屏预览]
<?php define('DATA_PATH',__DIR__.'/'); $t1 = microtime(true); $py = new Spell(); $str =file_get_contents(DATA_PATH.'test/18.txt'); $r = $py->getInitial($str,'utf-8','utf-8'); $t2 = microtime(true); echo $t2.'<br>'; echo intval(($t2-$t1)*1000),'毫秒'; exit; define('DATA_PATH',__DIR__.'/'); $t1 = microtime(true); $py = new Spell(); $str =file_get_contents(DATA_PATH.'test/18-gbk.txt'); $r = $py->getInitial($str,'gbk','utf-8'); $t2 = microtime(true); echo $t2.'<br>'; echo intval(($t2-$t1)*1000),'毫秒'; ?>
- 上一篇:生成excel 列序号
- 下一篇:CI 批量插入数据
精彩图集
精彩文章