龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

最近采集写的一个超简单实用的HTML解析类

时间:2014-07-22 14:50来源: 作者: 点击:
分享到:
$xp = new xf_HtmlDom(); $xp->loadHtml('http://dealer.bitauto.com/100040078/cars.html'); $rows = $xp->find('dl/dd/a', 0)->innertext; print_r($rows);
// Usage example: load a remote page, then print the text content of the
// first <a> element matched by the relative path dl/dd/a.
$xp = new xf_HtmlDom();
// loadHtml() returns the parser itself, so the lookup can be chained onto it.
$rows = $xp
	->loadHtml('http://dealer.bitauto.com/100040078/cars.html')
	->find('dl/dd/a', 0)
	->innertext;
print_r($rows);

<?php
// Make libxml collect parse errors internally instead of raising PHP warnings
// (scraped HTML is rarely well-formed). libxml_use_internal_errors() returns the
// previous setting, which is kept in $oldSetting.
$oldSetting = libxml_use_internal_errors( true ); 
// Discard any errors already sitting in libxml's internal error buffer.
libxml_clear_errors();
/**
 * 
 * -+-----------------------------------
 * |PHP5 Framework - 2011
 * |Web Site: www.iblue.cc
 * |E-mail: mejinke@gmail.com
 * |Date: 2012-10-12
 * -+-----------------------------------
 * 
 * @desc Minimal HTML parser built on DOMDocument / DOMXPath.
 *
 * One instance represents either a whole loaded document (empty node path)
 * or a single node inside it (node path = the XPath of that node). Nodes are
 * looked up with find(); their content is read with text(), getAttribute()
 * or the magic properties ("innertext" or any attribute name).
 *
 * @author jingke
 */ 
class XF_HtmlDom
{
	/** @var DOMXPath|null XPath evaluator over the loaded document, null until loaded. */
	private $_xpath = null;

	/** @var string Absolute XPath of the node this object stands for; '' means the document. */
	private $_nodePath = '';

	/**
	 * @param DOMXPath|null $xpath    evaluator shared with the parent object
	 * @param string        $nodePath absolute XPath of the represented node
	 */
	public function __construct($xpath = null, $nodePath = '')
	{
		$this->_xpath = $xpath;
		$this->_nodePath = $nodePath;
	}

	/**
	 * Load an HTML document from a remote URL or a local path.
	 *
	 * Sources starting with http:// or https:// are downloaded with cURL;
	 * everything else is read with file_get_contents(). If nothing could be
	 * read, an empty document is used, so later find() calls simply match
	 * nothing instead of failing.
	 *
	 * @param string $url http(s) URL or local file path
	 * @return XF_HtmlDom $this, to allow chaining
	 */
	public function loadHtml($url)
	{
		// Process-wide ini setting kept from the original implementation; it is
		// used by PHP's own HTTP stream wrapper, not by the cURL request.
		ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

		// Only a real http(s) scheme prefix marks a remote source. A plain
		// substring test would misroute local paths that merely contain "http".
		if (preg_match('#^https?://#i', $url))
			$content = $this->_fetchRemote($url);
		else
			$content = file_get_contents($url);

		$html = new DOMDocument();
		// A failed read yields false and an empty body yields ''; neither may be
		// passed to loadHTML(), so the document is left empty in those cases.
		if (is_string($content) && trim($content) !== '')
		{
			// Real-world markup is rarely valid: collect parser errors internally
			// rather than emitting one PHP warning per problem.
			$previous = libxml_use_internal_errors(true);
			$html->loadHTML($content);
			// Only drop the collected errors if this method enabled collection;
			// otherwise the caller may still want to inspect them.
			if (!$previous)
				libxml_clear_errors();
			libxml_use_internal_errors($previous);
		}

		$this->_xpath = new DOMXPath($html);
		return $this;
	}

	/**
	 * Find nodes below the current node (or anywhere in the document when
	 * this object represents the document itself).
	 *
	 * @param string   $query relative XPath, e.g. 'dl/dd/a'
	 * @param int|null $index null: return every match; number: return only
	 *                        the match at that zero-based position
	 * @return XF_HtmlDom[]|XF_HtmlDom list of matches when $index is null,
	 *         otherwise a single object. If the requested position does not
	 *         exist, an empty object is returned whose text() and
	 *         getAttribute() yield false.
	 */
	public function find($query, $index = null)
	{
		// Build the search prefix locally. The node path itself must not be
		// modified, or later calls on this object would use a broken path.
		$base = ($this->_nodePath == '') ? '//' : $this->_nodePath.'/';
		$nodes = $this->_query($base.$query);

		if ($index === null || !is_numeric($index))
		{
			$found = array();
			if ($nodes !== null)
			{
				foreach ($nodes as $node)
				{
					$found[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
				}
			}
			return $found;
		}

		$node = ($nodes !== null) ? $nodes->item((int) $index) : null;
		if ($node === null)
			return new XF_HtmlDom();

		return new XF_HtmlDom($this->_xpath, $node->getNodePath());
	}

	/**
	 * Get the text content of the current node.
	 *
	 * @return string|false the text content, or false when this object does
	 *         not point at an existing node
	 */
	public function text()
	{
		$node = $this->_currentNode();
		if ($node === null)
			return false;

		return $node->textContent;
	}

	/**
	 * Get the value of an attribute of the current node.
	 *
	 * @param string $name attribute name
	 * @return string|false the attribute value, or false when this object
	 *         does not point at an existing element
	 */
	public function getAttribute($name)
	{
		$node = $this->_currentNode();
		// Only element nodes carry attributes (a query may also match text or
		// attribute nodes, which have no getAttribute() method).
		if (!($node instanceof DOMElement))
			return false;

		return $node->getAttribute($name);
	}
	
	/**
	 * Property-style access: "innertext" maps to text(), any other name is
	 * read as an attribute of the current node.
	 *
	 * @param string $name
	 * @return string|false
	 */
	public function __get($name)
	{
		if ($name === 'innertext')
			return $this->text();

		return $this->getAttribute($name);
	}

	/**
	 * Download a URL with cURL, following redirects.
	 *
	 * @param string $url
	 * @return string|false response body, or false on failure
	 */
	private function _fetchRemote($url)
	{
		$browserAgent = 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0';

		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_HEADER, false);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_REFERER, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $browserAgent);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		$body = curl_exec($ch);
		curl_close($ch);

		return $body;
	}

	/**
	 * Evaluate an XPath expression against the loaded document.
	 *
	 * @param string $expression
	 * @return DOMNodeList|null null when no document is loaded or the
	 *         expression could not be evaluated
	 */
	private function _query($expression)
	{
		if ($this->_xpath == null)
			return null;

		$result = $this->_xpath->query($expression);
		return ($result === false) ? null : $result;
	}

	/**
	 * Resolve the node this object represents.
	 *
	 * @return DOMNode|null null when there is no node path, no document, or
	 *         the path no longer matches anything
	 */
	private function _currentNode()
	{
		if ($this->_nodePath == '')
			return null;

		$nodes = $this->_query($this->_nodePath);
		return ($nodes === null) ? null : $nodes->item(0);
	}

}
精彩图集

赞助商链接