龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

基于Snoopy的PHP近似完美获取网站编码

时间:2014-07-22 14:51来源: 作者: 点击:
分享到:
基于Snoopy的PHP近似完美获取网站编码。用于php爬虫,获取编码准确率99.9%,还有部分不能获取,求大牛完善。代码来源:站云网(http://www.siteyun.com)
基于Snoopy的PHP近似完美获取网站编码
用于php爬虫,获取编码准确率99.9%, 还有部分不能获取,求大牛完善
代码来源: 站云网 www.siteyun.com
先要到网上下载Snoopy.class.php
调用方法:<?php
// Usage example: load Snoopy and the WebCrawl class, then print the charset detected for a URL.
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php';// contains the WebCrawl class listed below
$go=new WebCrawl('http://www.baidu.com');
echo $go->getCharset();
?>



<?php


/**
 * Fetches a single URL with Snoopy and reports the page's character set,
 * title, meta keywords/description and the IP address of its host.
 *
 * The page is fetched at most once per instance. Its body is lower-cased and,
 * when the detected charset is not UTF-8, converted to UTF-8 before the other
 * methods inspect it.
 */
class WebCrawl
{
	/** URL passed to the constructor. */
	private $url;

	/** Snoopy instance holding the fetched response; null until the first fetch. */
	private $request;

	/**
	 * Charset detected from the raw (not yet converted) body.
	 * null before the fetch, false when detection failed, otherwise a lower-case name.
	 */
	private $charset = null;

	/** Lower-case charset names accepted when they appear in a <meta> declaration. */
	public $charset_arr=array(
		'gb2312',
		'utf-8',
		'big5',
		'gbk',
		'ascii',
		'cp936',
		'ibm037',
		'ibm437',
		'ibm500',
		'asmo-708',
		'dos-720',
		'ibm737',
		'ibm775',
		'ibm850',
		'ibm852',
		'ibm855',
		'ibm857',
		'ibm00858',
		'ibm861',
		'ibm860',
		'dos-862',
		'ibm863',
		'ibm864',
		'ibm865',
		'cp866',
		'ibm869',
		'ibm870',
		'windows-874',
		'cp875',
		'shift_jis',
		'ks_c_5601-1987',
		'ibm1026',
		'ibm01047',
		'ibm01047',
		'ibm01040',
		'ibm01041',
		'ibm01042',
		'ibm01043',
		'ibm01044',
		'ibm01045',
		'ibm01046',
		'ibm01047',
		'ibm01048',
		'ibm01049',
		'utf-16',
		'unicodefffe',
		'windows-1250',
		'windows-1251',
		'windows-1252',
		'windows-1253',
		'windows-1254',
		'windows-1255',
		'windows-1256',
		'windows-1257',
		'windows-1258',
		'johab',
		'macintosh',
		'x-mac-japanese',
		'x-mac-chinesetrad',
		'x-mac-korean',
		'x-mac-arabic',
		'x-mac-hebrew',
		'x-mac-greek',
		'x-mac-cyrillic',
		'x-mac-chinesesimp',
		'x-mac-romanian',
		'x-mac-ukrainian',
		'x-mac-thai',
		'x-mac-ce',
		'x-mac-icelandic',
		'x-mac-turkish',
		'x-mac-croatian',
		'x-chinese-cns',
		'x-cp20001',
		'x-chinese-eten',
		'x-cp20003',
		'x-cp20004',
		'x-cp20005',
		'x-ia5',
		'x-ia5-german',
		'x-ia5-swedish',
		'x-ia5-norwegian',
		'us-ascii',
		'x-cp20261',
		'x-cp20269',
		'ibm273',
		'ibm277',
		'ibm278',
		'ibm280',
		'ibm284',
		'ibm285',
		'ibm290',
		'ibm420',
		'ibm423',
		'ibm424',
		'x-ebcdic-koreanextended',
		'ibm-thai',
		'koi8-r',
		'ibm871',
		'ibm880',
		'ibm905',
		'ibm00924',
		'x-cp20936',
		'x-cp20949',
		'cp1025',
		'koi8-u',
		'iso-8859-1',
		'iso-8859-2',
		'iso-8859-3',
		'iso-8859-4',
		'iso-8859-5',
		'iso-8859-6',
		'iso-8859-7',
		'iso-8859-8',
		'iso-8859-9',
		'iso-8859-13',
		'iso-8859-15',
		'x-europa',
		'iso-8859-8-i',
		'iso-2022-jp',
		'csiso2022jp',
		'iso-2022-jp',
		'iso-2022-kr',
		'x-cp50227',
		'euc-jp',
		'euc-cn',
		'euc-kr',
		'hz-gb-2312',
		'gb18030',
		'x-iscii-de',
		'x-iscii-be',
		'x-iscii-ta',
		'x-iscii-te',
		'x-iscii-as',
		'x-iscii-or',
		'x-iscii-ka',
		'x-iscii-ma',
		'x-iscii-gu',
		'x-iscii-pa',
		'utf-7',
		'utf-32',
		'utf-32be'
	);

	/**
	 * @param string $url URL of the page to inspect.
	 */
	public function __construct($url)
	{
		$this->url=$url;
	}

	/**
	 * Fetch the page on first use and normalise its body.
	 *
	 * Later calls do not fetch again; they only report whether the stored
	 * response has status 200.
	 *
	 * @param string $url URL to fetch (only used by the first call).
	 * @return bool true when the response status is 200.
	 */
	private function open($url)
	{
		if($this->request!==null)
		{
			return $this->request->status==200;
		}

		$this->request=new Snoopy();
		$this->request->fetch($url);
		// Loose comparison on purpose: the status may be a numeric string.
		if($this->request->status!=200)
		{
			return false;
		}

		$this->request->results=strtolower($this->request->results);

		// Detect on the raw bytes and remember the answer: once the body has
		// been converted to UTF-8 the byte-level probes would give a different result.
		$this->charset=$this->detectCharset();
		$charset=$this->charset;

		if($charset===false || $charset=="utf-8")
		{
			return true;
		}
		if($charset=="windows-1252")
		{
			$this->request->results=$this->uni_decode($this->request->results);
		}
		else
		{
			$this->request->results=$this->convertToUtf8($this->request->results,$charset);
		}
		return true;
	}

	/**
	 * Convert text to UTF-8, returning it unchanged when mbstring does not
	 * know the source encoding.
	 *
	 * @param string $text    Bytes to convert.
	 * @param string $charset Source encoding name.
	 * @return string
	 */
	private function convertToUtf8($text,$charset)
	{
		try
		{
			// PHP 7 warns and returns false for an unknown encoding; PHP 8 throws
			// ValueError. Both outcomes are handled here, hence the suppression.
			$converted=@mb_convert_encoding($text,"UTF-8",$charset);
		}
		catch(\ValueError $e)
		{
			return $text;
		}
		return is_string($converted) ? $converted : $text;
	}

	/**
	 * Get the page's title, meta keywords, meta description and host IP.
	 *
	 * @return array Keys 'title', 'keywords', 'desc' and 'ip'; every value is
	 *               an empty string when it could not be determined.
	 */
	public function getWebinfo()
	{
		$info=array(
			'title'=>'',
			'keywords'=>'',
			'desc'=>'',
			'ip'=>''
		);
		if(!$this->open($this->url))
		{
			return $info;
		}

		$html=$this->request->results;

		if(preg_match('/<title>([^>]*)<\/title>/si', $html, $titlematch))
		{
			$info['title']=strip_tags($titlematch[1]);
		}

		$metaTags=$this->collectMetaTags($html);
		if(isset($metaTags['keywords']))
		{
			$info['keywords']=$metaTags['keywords'];
		}
		if(isset($metaTags['description']))
		{
			$info['desc']=$metaTags['description'];
		}

		$info['ip']=$this->resolveIp();
		return $info;
	}

	/**
	 * Map meta tag names to their content values.
	 *
	 * Tags written as name-then-content are tried first; when none of them is
	 * "keywords" or "description", tags written as content-then-name are used
	 * instead. A name that occurs more than once keeps its last value.
	 *
	 * @param string $html Page body.
	 * @return array name => content
	 */
	private function collectMetaTags($html)
	{
		preg_match_all('/<[\s]*meta[\s]*name="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $match);
		$names=$match[1];
		$values=$match[2];

		if(!in_array("keywords",$names,true) && !in_array("description",$names,true))
		{
			preg_match_all('/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?' . '([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $match);
			$names=$match[2];
			$values=$match[1];
		}

		$tags=array();
		foreach($names as $i=>$name)
		{
			$tags[$name]=$values[$i];
		}
		return $tags;
	}

	/**
	 * Resolve the host of the configured URL to an IPv4 address.
	 *
	 * @return string Dotted IPv4 address, or '' when the host cannot be
	 *                extracted or resolved.
	 */
	private function resolveIp()
	{
		$host=parse_url($this->url,PHP_URL_HOST);
		if(!is_string($host) || $host==='')
		{
			// A URL given without a scheme has no host component; retry with one.
			$host=parse_url('http://'.$this->url,PHP_URL_HOST);
		}
		if(!is_string($host) || $host==='' || strlen($host)>255)
		{
			return '';
		}

		// gethostbyname() hands back its argument unchanged on failure, so the
		// result has to be checked for actually being an address.
		$ip=gethostbyname($host);
		return filter_var($ip,FILTER_VALIDATE_IP,FILTER_FLAG_IPV4)!==false ? $ip : '';
	}

	/**
	 * Guess from the first multi-byte sequence whether text is UTF-8 or GB2312.
	 *
	 * Scanning stops at the first byte >= 0x80 that matches either pattern:
	 * a byte with its top three bits set followed by two high-bit bytes is
	 * reported as UTF-8; otherwise a byte with its top two bits set followed
	 * by one high-bit byte is reported as GB2312. Both patterns are tested at
	 * the same position, and look-ahead never reads past the end of the string.
	 *
	 * @param string $string Bytes to inspect.
	 * @param mixed  $o      Unused; kept so existing callers keep working.
	 * @return string "utf-8", "gb2312", or "" when no pattern matched.
	 */
	public function t($string,$o)
	{
		$length=strlen($string);
		for($i=0;$i<$length;$i++)
		{
			$byte=ord($string[$i]);
			if($byte<128)
			{
				continue;
			}

			$second=($i+1<$length) ? ord($string[$i+1]) : 0;
			$third=($i+2<$length) ? ord($string[$i+2]) : 0;

			if(($byte&224)==224 && ($second&128)==128 && ($third&128)==128)
			{
				return "utf-8";
			}
			if(($byte&192)==192 && ($second&128)==128)
			{
				return "gb2312";
			}
		}
		return "";
	}

	/**
	 * Replace five-digit decimal character references (&#NNNNN;) with the
	 * characters they denote, leaving all other text untouched.
	 *
	 * @param string $str  Text containing the references.
	 * @param string $code Encoding of the returned string; the decoded UTF-8
	 *                     text is passed through iconv when this is not 'utf-8'.
	 * @return string
	 */
	public function uni_decode ($str, $code = 'utf-8'){
		$str = preg_replace_callback(
			'/&#(\d{5});/',
			function ($dec) {
				return html_entity_decode('&#'.$dec[1].';', ENT_QUOTES, 'UTF-8');
			},
			$str
		);
		if($code != 'utf-8'){ $str = iconv('utf-8', $code, $str); }
		return $str;
	}

	/**
	 * Get the page's character set.
	 *
	 * The value is the one detected on the raw response during the fetch, so
	 * repeated calls give the same answer.
	 *
	 * @return string|false Lower-case charset name, or false when the page
	 *                      could not be fetched or no charset was detected.
	 */
	public function getCharset()
	{
		if(!$this->open($this->url))
		{
			return false;
		}
		return $this->charset;
	}

	/**
	 * Detect the charset of the stored response body.
	 *
	 * Sources are tried in order: the <meta> charset declaration (accepted
	 * only when listed in $charset_arr, and for "gb2312" only when the byte
	 * probe agrees), then a charset in the response headers, then
	 * mb_detect_encoding().
	 *
	 * @return string|false Lower-case charset name, or false when nothing matched.
	 */
	private function detectCharset()
	{
		$body=$this->request->results;

		// First, the declaration in the HTML itself.
		if(preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i",$body,$temp))
		{
			$declared=strtolower($temp[1]);
			if(in_array($declared,$this->charset_arr,true))
			{
				if($declared!="gb2312" || $this->t($body,$declared)==$declared)
				{
					return $declared;
				}
			}
		}

		// Next, the response headers.
		if(!empty($this->request->headers))
		{
			$hstr=strtolower(implode("|||",$this->request->headers));
			if(preg_match("/charset=[^\w]?([-\w]+)/is",$hstr,$lang) && $lang[1]!="")
			{
				return $lang[1];
			}
		}

		// Last resort: let mbstring guess.
		$encode_arr=array("UTF-8","GB2312","GBK","BIG5","ASCII","EUC-JP","Shift_JIS","CP936","ISO-8859-1","JIS","eucjp-win","sjis-win");
		$encoded=mb_detect_encoding($body,$encode_arr);
		if($encoded)
		{
			return strtolower($encoded);
		}
		return false;
	}

}
?>
精彩图集

赞助商链接