利用curl，正则表达式做的一个php蜘蛛抓取器

时间:2014-07-22 14:51来源: 作者: 点击: 次

分享到：

凤网fcms内容管理系统；get.php 抓取框架，对网页内容的分析处理并进行相关替换；std.php 通用正则；news_67_com.php 对 http://news.67.com 的抓取分析器；先抓列表，再抓内容页。还欠缺监控，统计，错误处理功能。

凤网fcms内容管理系统
get.php 抓取框架，对网页内容的分析处理并进行相关替换
std.php 通用正则
news_67_com.php 对http://news.67.com 的抓取分析器
先抓列表，再抓内容页。
还欠缺监控，统计，错误处理功能。个人觉得还是比较好玩。

<?php
include_once dirname(__FILE__) . '/std.php';

/**
 * Scrape rules for article (content) pages of news.67.com.
 *
 * This is a plain configuration array of PCRE patterns and replacement
 * tables. The code that consumes it is not part of this file (test.php hands
 * an array of this shape to FcHtmlParse), so the per-key notes below only
 * describe what the values themselves contain; how each key is interpreted
 * should be confirmed against get.php.
 *
 * In the *_brp / *_arp / *_ap tables the first list holds patterns and the
 * second list holds the replacements at the same positions (presumably
 * passed to preg_replace / str_replace by the framework — confirm).
 */
$site = array(
    'aname'      => '中国娱乐网',
    'domain'     => 'news.67.com',
    // Appears to be descriptive placeholder text rather than a real value:
    // "directory name, used to match body text that differs by directory".
    'dirname'    => '目录名称，用于匹配基于目录不同的正文',
    'gettype'    => 'default',
    // Main region: everything between the "文章 begin" and "文章 end" HTML comments.
    'creg'       => '/(?si)<!--文章 begin-->(.*?)\<\!--文章 end-->/',
    'code'       => 'utf-8',
    // Appears to be placeholder text as well: "regex for fetching sub-directories".
    'sub'        => '获取子目录正则',
    'content'    => 'tag1',
    'img_upload' => array('tag1' => ''),
    // Next page: captures the href of the "下一页>>" anchor.
    'reg_next'   => '/(?is)<a target=\'_self\' href=\'([^\']*?)\'>下一页\&gt;\&gt;<\/a>/',
    // Keywords: content attribute of the keywords <meta> tag.
    'key0'       => '/(?is)<meta name="keywords" content="([^"]*?)".*?\/>/',
    // Replaces ',' and '|' in the keywords with a single space.
    'key0_ap'    => array(array(',', '|'), ' '),
    // Title: text of the <h1>. The negated class is [^<>]; a stray "^" inside
    // the class would additionally reject titles that contain a caret.
    'tag0'       => '/(?is)<h1>([^<>]*?)<\/h1>/',
    // Title clean-up: every replacement is '', so each match is removed.
    // NOTE(review): '/\./' removes every "." in the title, not just the
    // trailing ".." of a truncated "(图.." marker — confirm this is intended.
    'tag0_arp'   => array(
        array(
            '/(?is)\(组图\)/',
            '/(?is)\(图\)/',
            '/(?is)\(图\.\./',
            '/(?is)\(组图\.\./',
            '/(?is)\./',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '', '',
        ),
    ),
    // Body: from the opening of div#divContent up to the first <img> carrying
    // class/style/src/alt/border attributes in exactly that order.
    'tag1'       => '/(?is)<div class="article" id="divContent">(.*?)<img class="[^"]*?" style="[^"]*?" src="[^"]*?" alt="[^"]*?" border="\d*" \/>/',
    // Body normalisation table (12 patterns, 12 replacements, same order).
    'tag1_brp'   => array(
        array(
            '/(?is)（.*?）/',
            '/(?is)\(.*?\)/',
            '/(?is)\s*<p align="center">.*?<img.*?src="([^"]*?)".*?>(.*?)<\/p>\s*/',
            '/(?is)\s*<p>\s*/',
            '/(?is)\s*<p align="center">\s*/',
            '/(?is)　/',
            '/(?is)<br \/>/',
            '/(?is)\s*<p align="left">\s*/',
            '/(?is)\s*<p class="f_center" align="center">\s*/',
            '/(?is)\s*<center>\s*/',
            '/(?is)\s*<\/center>\s*/',
            '/(?is)\s*<p class="f_center">\s*/',
        ),
        array(
            '',                                                      // full-width parenthesised text
            '',                                                      // ASCII parenthesised text
            '<p style="text-align: center;"><img src="$1" /></p>', // centred image paragraph
            '<p style="text-indent: 24px;">',                       // <p>
            '<p style="text-align: center;">',                      // <p align="center">
            '',                                                      // full-width space
            '',                                                      // <br />
            '<p style="text-indent: 24px;">',                       // <p align="left">
            '<p style="text-align: center;">',                      // <p class="f_center" align="center">
            '<p style="text-align: center;">',                      // <center>
            '</p>',                                                  // </center>
            '<p style="text-indent: 24px;">',                       // <p class="f_center">
        ),
    ),
    // Second clean-up table for the body: drops centred paragraphs that only
    // hold &nbsp; and empty <strong></strong> pairs.
    'tag1_arp'   => array(
        array(
            '/(?is)<p style="text-align: center;">&nbsp;<\/p>/',
            '/(?is)<strong><\/strong>/',
        ),
        array(
            '', '',
        ),
    ),
    'strip'      => array('tag1' => ''),
    // Summary: text following "导读：" inside div.daodu.
    'tag2'       => '/(?is)<div class="daodu">导读：\s*(.*?)\s*<\/div>/',
    // Removes full-width spaces from the summary.
    'tag2_arp'   => array(
        array(
            '/(?is)　/',
        ),
        array(
            '',
        ),
    ),
    // Source: matches the literal site name anywhere in the input.
    'tag3'       => '/(?is)(中国娱乐网)/',
    // Publication date: "Y-m-d H:i:s"-shaped digits after "日期：" in div.artInfo.
    'tag4'       => '/(?is)<div class="artInfo"><span>日期：(\d+-\d+-\d+ \d+:\d+:\d+).*?<\/div>/',
);

// Associates each output field name with the key of the $site rule whose
// capture supplies it (consumer not shown here — presumably get.php).
$map = array();
$map['tag']      = 'key0';
$map['title']    = 'tag0';
$map['content']  = 'tag1';
$map['summary']  = 'tag2';
$map['source']   = 'tag3';
$map['pub_date'] = 'tag4';

/**
 * Scrape rules for the article list pages (domain www.67.com, charset gbk).
 *
 * Same array shape as the article rules; the consumer is not part of this
 * file, so key semantics beyond what the patterns show should be confirmed
 * against get.php.
 */
$site_list = array(
    'aname'     => '中国娱乐网',
    'domain'    => 'www.67.com',
    'gettype'   => 'default',
    // List region: from div.gallery_list up to div.nt_cl.
    'creg'      => '/(?si)<div class="gallery_list">(.*?)<div class="nt_cl">/',
    'code'      => 'gbk',
    // Next page: href of the anchor inside li.next.
    'reg_next'  => '/(?si)<li class="next"><a href="([^"]+?)" target="_self">下一页<\/a><\/li>/',
    // Link: captures the news.67.com article URL of each list entry. The
    // anchor text class is [^<>]; a stray "^" inside the class would make
    // entries whose title contains a caret fail to match.
    'tag0'      => '/(?is)<div style="height: 30px;">.*?<a target="_blank"\s*href=\'(\w+:\/\/news\.67\.com\/\w+\/\d+\/\d+\/\d+\/\d+\.\w+)\s*\' style="font-size: 14px;">[^<>]*?<\/a>.*?<\/div>/',
    // Title: same anchor as the link rule, capturing the anchor text instead.
    'tag1'      => '/(?is)<div style="height: 30px;">.*?<a target="_blank"\s*href=\'\w+:\/\/news\.67\.com\/\w+\/\d+\/\d+\/\d+\/\d+\.\w+\s*\' style="font-size: 14px;">([^<>]*?)<\/a>.*?<\/div>/',
    // Title clean-up: every replacement is '', so each match is removed.
    // NOTE(review): '/\./' removes every "." in the title — confirm intended.
    'tag1_arp'  => array(
        array(
            '/(?is)\(组图\)/',
            '/(?is)\(图\)/',
            '/(?is)\(图\.\./',
            '/(?is)\(组图\.\./',
            '/(?is)\./',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '', '',
        ),
    ),
);

// Associates each list-entry field name with the key of the $site_list rule
// whose capture supplies it (consumer not shown here).
$list_map = array();
$list_map['url']   = 'tag0';
$list_map['title'] = 'tag1';

// No sub-list rules are defined for this site.
$site_list_sub = array();

2. [文件] get.php ~ 22KB 下载(78) [全屏预览]

3. [文件] std.php ~ 172B 下载(61) 跳至 [1] [3] [4] [5] [全屏预览]

<?php
// Shared regex building blocks.
// `global` has no effect when this file is included at file scope; it makes
// $std land in the global symbol table when the include happens inside a
// function or method.
global $std;
$std = array(
    // Character-class fragment for a URL (no delimiters; meant to be embedded
    // in a larger pattern). "?" and "=" are accepted alongside the already
    // permitted "&" and ";" so that a URL carrying a query string is matched
    // in full rather than being cut off at the "?".
    'url' => '[0-9a-zA-Z\.\:\-\/%_#;&?=]+',
    // Captures the src value of an <img> tag, with or without quotes around it.
    'img' => '/(?is)<img.*?src=(?:[\'"]{0,1})([0-9a-zA-Z\.\:\-\/%_#;&?=]+)(?:[\'"]{0,1}).*?>/',
);

4. [文件] test.php ~ 2KB 下载(62) 跳至 [1] [3] [4] [5] [全屏预览]

<?php
/**      
 * test.php
 *
 * @author     xzfred <xzfred@gmail.com>
 * @copyright  2009 fengone.com
 * @created    2010-12-07 .
 * @version    $Id: php.php 3 2008-10-10 07:49:21Z fred $
 * SVNPath     $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $         
 */
/*
include_once "std.php";
include_once "lady_163_com.php";
 */
// Load the scraping framework. FcHtmlParse, used below, is not defined in
// this file — presumably it comes from get.php (confirm).
include_once $GLOBALS['g_dir_core'] . "get.php";

//================================================================================
// Content-page test: load the tuku.ent.china.com rule file and parse one
// content document with its $site rules.
include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';
$obj = new FcHtmlParse($site);
$c = $obj->parse(file_get_contents("http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml"));
echo "\n\n\n ===================\n";
// First 'tag1' entry of the parse result's 'field' section.
echo $c['field']['tag1'][0];
echo "\n\n\n ===================\n";
var_dump($c);

// The script stops here; everything below is unreachable until this exit()
// is removed.
exit();
// List-page test: parse one list page with the $site_list rules.
$obj = new FcHtmlParse($site_list);
$c = $obj->parse(file_get_contents("http://tuku.ent.china.com/fun/html/3569_1.html"));
var_dump($c);
exit();
/*

$obj = new FcHtmlGet($site);
$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
var_dump($c);

$obj = new FcHtmlGet($site);
$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
var_dump($c);

$obj = new FcHtmlParse($site);
$img_obj = new FcHtmlImgUpload($site);

$data = file_get_contents("e:/b.html");
$c = $obj->parse($data);
$ic = $img_obj->upload($c['tag']['tag1'][0]);
var_dump($ic);

$data = file_get_contents("e:/a.html");
$c = $obj->parse($data);
$ic = $img_obj->upload($c['tag']['tag1'][0]);
var_dump($ic);
 */

//var_dump($c['tag']['tag1']);

5. [文件] tuku_ent_china_com.php ~ 3KB 下载(14) 跳至 [1] [3] [4] [5] [全屏预览]

<?php
include_once $GLOBALS['g_dir_core'] . 'host/std.php';

/**
 * Scrape rules for picture-gallery content documents of tuku.ent.china.com.
 *
 * The patterns target an XML-like document (<list> with <Image .../>
 * children). The consumer is not part of this file (test.php hands this
 * array to FcHtmlParse), so key semantics beyond what the patterns show
 * should be confirmed against get.php.
 *
 * In the *_brp / *_arp / *_ap tables the first list holds patterns and the
 * second list holds the replacements at the same positions.
 */
$site = array(
    'aname'      => '中华网娱乐图库',
    'domain'     => 'tuku.ent.china.com',
    // Appears to be descriptive placeholder text rather than a real value:
    // "directory name, used to match body text that differs by directory".
    'dirname'    => '目录名称，用于匹配基于目录不同的正文',
    'gettype'    => 'default',
    // Main region: the whole <list>...</list> element, tags included.
    'creg'       => '/(?si)(<list>.*?<\/list>)/',
    'code'       => 'utf-8',
    // Appears to be placeholder text as well: "regex for fetching sub-directories".
    'sub'        => '获取子目录正则',
    'content'    => 'tag1',
    'img_upload' => array('tag1' => ''),
    // Next page: href of a.next. The dots of the host name are escaped so
    // they match only a literal ".".
    // NOTE(review): the host is qq.com although this rule set is for
    // tuku.ent.china.com — looks carried over from another site; confirm.
    'reg_next'   => '/(?is)<a class="next" href="(\w+:\/\/\w+\.qq\.com\/\w+\/\d+\/.*?\.\w+)">下一页<\/a>/',
    // Keywords: content attribute of the keywords <meta> tag.
    'key0'       => '/(?is)<meta name="keywords" content="([^"]*?)".*?\/>/',
    // Replaces ',' and '|' in the keywords with a single space.
    'key0_ap'    => array(array(',', '|'), ' '),
    // Title: value of a title="..." attribute.
    'tag0'       => '/(?is)title="([^"]*?)"/',
    // Title clean-up: every replacement is '', so each match is removed.
    'tag0_arp'   => array(
        array(
            '/(?is)\(图\)/',
            '/(?is)\&quot;/',
            '/(?is)独家：/',
            '/(?is)独家:/',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '',
        ),
    ),
    // Body: inner content of <list>.
    'tag1'       => '/(?is)<list>(.*?)<\/list>/',
    // Turns each <Image .../> element into three paragraphs: the picture
    // (httpurl as src, title as alt), the title, and the intro text. The
    // replacement is one multi-line string; its embedded line breaks and
    // leading whitespace are part of the value.
    'tag1_brp'   => array(
        array(
            '/(?is)\s*<Image title="([^"]*?)" name="" imgurl="[^"]*?" httpurl="([^"]*?)" id="\d+" time="[^"]*?" intro="([^"]*?)" commentsuburl="[^"]*?" commentshowurl="[^"]*?" link="[^"]*?"\/>\s*/',
        ),
        array(
            '<p style="text-align: center;"><img src="$2" alt="$1" /></p>
			 <p style="text-align: center;">$1</p>
			 <p style="text-indent: 24px;">$3</p>',
        ),
    ),
    // Appends a page-break marker after every indented (intro) paragraph and
    // removes entity-encoded "<br/>". The paragraph text class is [^<>]; a
    // stray "^" inside the class would skip paragraphs containing a caret.
    'tag1_arp'   => array(
        array(
            '/(?is)<p style="text-indent: 24px;">([^<>]*?)<\/p>/',
            '/(?is)\&lt;br\/\&gt;/',
        ),
        array(
            '<p style="text-indent: 24px;">$1</p><div style="page-break-after: always;"><span style="display: none;">&nbsp;</span></div>',
            '',
        ),
    ),
    'strip'      => array('tag1' => ''),
    // Author: text of span.auth ([^<>] so that a caret in the name still matches).
    'tag3'       => '/(?is)<span class="auth">([^<>]*?)<\/span>/',
    // Source: matches the literal site name anywhere in the input.
    'tag4'       => '/(?is)(中华网)/',
);

// Associates each output field name with the key of the $site rule whose
// capture supplies it (consumer not shown here — presumably get.php).
$map = array();
$map['tag']     = 'key0';
$map['title']   = 'tag0';
$map['content'] = 'tag1';
$map['author']  = 'tag3';
$map['source']  = 'tag4';

/**
 * Scrape rules for the gallery list pages of tuku.ent.china.com.
 *
 * Same array shape as the content rules; the consumer is not part of this
 * file, so key semantics beyond what the patterns show should be confirmed
 * against get.php.
 */
$site_list = array(
    'aname'     => '中华网娱乐图库',
    'domain'    => 'tuku.ent.china.com',
    'gettype'   => 'default',
    // List region: from div.PicGroupListIndex up to the empty div.MainBtm.
    'creg'      => '/(?si)<div class="PicGroupListIndex">(.*?)<div class="MainBtm"><\/div>/',
    'code'      => 'utf-8',
    // Next page: href of the anchor inside li.next.
    'reg_next'  => '/(?si)<li class="next"><a href="([^"]+?)" target="_self">下一页<\/a><\/li>/',
    // Link: href of the anchor inside each div.ImgTitle.
    'tag0'      => '/(?is)<div class="ImgTitle"><a href="([^"]*?)" target="_blank"><span>.*?<\/span>[^<>]*?<\/a><\/div>/',
    // Rewrites the page extension to ".xml". The pattern takes an optional
    // trailing "l" so that ".html" becomes ".xml" rather than ".xmll";
    // ".htm" is rewritten exactly as before.
    'tag0_brp'  => array(
        array(
            '/(?is)\.html?/',
        ),
        array(
            '.xml',
        ),
    ),
    // Title: anchor text that follows the <span> inside each div.ImgTitle
    // ([^<>] so that a caret in the title still matches).
    'tag1'      => '/(?is)<div class="ImgTitle"><a href="[^"]*?" target="_blank"><span>.*?<\/span>([^<>]*?)<\/a><\/div>/',
    // Title clean-up: every replacement is '', so each match is removed.
    'tag1_arp'  => array(
        array(
            '/(?is)\(图\)/',
            '/(?is)\&quot;/',
            '/(?is)独家：/',
            '/(?is)独家:/',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '',
        ),
    ),
);

// Associates each list-entry field name with the key of the $site_list rule
// whose capture supplies it (consumer not shown here).
$list_map = array();
$list_map['url']   = 'tag0';
$list_map['title'] = 'tag1';

// No sub-list rules are defined for this site.
$site_list_sub = array();