php 利用curl,正则表达式做的一个php蜘蛛抓取器(2)
std.php <?php global $std; $std = array( 'url' => '[0-9a-zA-Z\.\:\-\/%_#;&]+', 'img' => '/(?is)<img.*?src=(?:[\'"]{0,1})([0-9a-zA-Z\.\:\-\/%_#;&]+)(?:[\'"]{0,1}).*?>/', ); test.php <?php /** * test.php * * @author
std.php
<?php
// Shared regular-expression building blocks for the per-host rule files.
// The `global` declaration only has an effect when this file is included
// from inside a function or method; it then binds $std to the global scope.
global $std;

$std = array();

// Undelimited fragment: one or more characters accepted inside a URL.
$std['url'] = '[0-9a-zA-Z\.\:\-\/%_#;&]+';

// Complete pattern for an <img> tag; capture group 1 is the src value,
// with or without surrounding single/double quotes.
$std['img'] = '/(?is)<img.*?src=(?:[\'"]{0,1})([0-9a-zA-Z\.\:\-\/%_#;&]+)(?:[\'"]{0,1}).*?>/';
test.php
<?php
/**
* test.php
*
* @author xzfred <xzfred@gmail.com>
* @copyright 2009 fengone.com
* @created 2010-12-07 .
* @version $Id: php.php 3 2008-10-10 07:49:21Z fred $
* SVNPath $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $
*/
/*
include_once "std.php";
include_once "lady_163_com.php";
*/
include_once $GLOBALS['g_dir_core'] . "get.php";
//================================================================================
include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';
// Detail-page test: parse one gallery XML document with the $site rules
// and dump both the first 'tag1' field and the complete result.
$obj = new FcHtmlParse($site);
$detail_url = "http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml";
$detail_src = file_get_contents($detail_url);
if ($detail_src === false) {
    // file_get_contents() returns false when the download fails; stop with
    // a clear message instead of handing `false` to the parser.
    echo "\n\n\n download failed: " . $detail_url . "\n";
    exit(1);
}
$c = $obj->parse($detail_src);
echo "\n\n\n ===================\n";
echo $c['field']['tag1'][0];
echo "\n\n\n ===================\n";
var_dump($c);
// Everything below this exit() is unreachable as long as it stays in place.
exit();

// List-page test: parse one index page with the $site_list rules.
$obj = new FcHtmlParse($site_list);
$list_url = "http://tuku.ent.china.com/fun/html/3569_1.html";
$list_src = file_get_contents($list_url);
if ($list_src === false) {
    // Same guard as above: never parse a failed download.
    echo "\n\n\n download failed: " . $list_url . "\n";
    exit(1);
}
$c = $obj->parse($list_src);
var_dump($c);
exit();
/*
$obj = new FcHtmlGet($site);
$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
var_dump($c);
$obj = new FcHtmlGet($site);
$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
var_dump($c);
$obj = new FcHtmlParse($site);
$img_obj = new FcHtmlImgUpload($site);
$data = file_get_contents("e:/b.html");
$c = $obj->parse($data);
$ic = $img_obj->upload($c['tag']['tag1'][0]);
var_dump($ic);
$data = file_get_contents("e:/a.html");
$c = $obj->parse($data);
$ic = $img_obj->upload($c['tag']['tag1'][0]);
var_dump($ic);
*/
//var_dump($c['tag']['tag1']);
tuku_ent_china_com.php
<?php
include_once $GLOBALS['g_dir_core'] . 'host/std.php';
/**
 * Extraction rules for gallery detail pages of tuku.ent.china.com.
 *
 * Each 'keyN' / 'tagN' entry is a PCRE pattern. The companion entries
 * 'tagN_brp' and 'tagN_arp' are pairs of (patterns, replacements) arrays,
 * and 'key0_ap' is a pair of (search strings, replacement).
 * NOTE(review): the suffixes look like "replace before" / "replace after"
 * post-processing steps applied by the parser - confirm against FcHtmlParse.
 */
$site = array(
    'aname'      => '中华网娱乐图库',
    'domain'     => 'tuku.ent.china.com',
    'dirname'    => '目录名称,用于匹配基于目录不同的正文',
    'gettype'    => 'default',
    // Region of the document that holds the main content.
    'creg'       => '/(?si)(<list>.*?<\/list>)/',
    'code'       => 'utf-8',
    'sub'        => '获取子目录正则',
    'content'    => 'tag1',
    'img_upload' => array('tag1' => ''),
    // Next page.
    // NOTE(review): this pattern targets a *.qq.com host although 'domain'
    // is tuku.ent.china.com, and the dot before "qq" is unescaped - it looks
    // copied from another site's rules; confirm whether it is used here.
    'reg_next'   => '/(?is)<a class="next" href="(\w+:\/\/\w+.qq.com\/\w+\/\d+\/.*?\.\w+)">下一页<\/a>/',
    'key0'       => '/(?is)<meta name="keywords" content="([^"]*?)".*?\/>/',
    'key0_ap'    => array(array(',', '|'), ' '),
    'tag0'       => '/(?is)title="([^"]*?)"/',
    'tag0_arp'   => array(
        array(
            '/(?is)\(图\)/',
            '/(?is)\"/',
            '/(?is)独家:/',
            // NOTE(review): identical to the previous pattern; possibly one of
            // the two was meant to use a full-width colon - confirm.
            '/(?is)独家:/',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '',
        )
    ),
    'tag1'       => '/(?is)<list>(.*?)<\/list>/',
    // Rewrites every <Image .../> element into three paragraphs: the picture
    // ($2 = httpurl), its title ($1) and its intro text ($3).
    'tag1_brp'   => array(
        array(
            '/(?is)\s*<Image title="([^"]*?)" name="" imgurl="[^"]*?" httpurl="([^"]*?)" id="\d+" time="[^"]*?" intro="([^"]*?)" commentsuburl="[^"]*?" commentshowurl="[^"]*?" link="[^"]*?"\/>\s*/'
        ),
        array(
            '<p style="text-align: center;"><img src="$2" alt="$1" /></p>
<p style="text-align: center;">$1</p>
<p style="text-indent: 24px;">$3</p>'
        )
    ),
    // Appends a page-break marker after each intro paragraph and removes <br/>.
    // The class is [^<>] rather than [^<^>]: a second "^" inside a character
    // class is a literal caret, so intro text such as "^_^" used to prevent
    // the match and the page break was silently skipped.
    'tag1_arp'   => array(
        array(
            '/(?is)<p style="text-indent: 24px;">([^<>]*?)<\/p>/',
            '/(?is)\<br\/\>/',
        ),
        array(
            '<p style="text-indent: 24px;">$1</p><div style="page-break-after: always;"><span style="display: none;"> </span></div>',
            '',
        )
    ),
    'strip'      => array('tag1' => ''),
    // Author / screen name ([^<>] for the same reason as in 'tag1_arp').
    'tag3'       => '/(?is)<span class="auth">([^<>]*?)<\/span>/',
    'tag4'       => '/(?is)(中华网)/'
);
// Pairs each output field name (key) with the $site rule that supplies it.
// NOTE(review): the keys are presumably the article fields filled by the
// crawler - confirm against the code that consumes $map.
$map = array();
$map['tag']     = 'key0';
$map['title']   = 'tag0';
$map['content'] = 'tag1';
$map['author']  = 'tag3';
$map['source']  = 'tag4';
/**
 * Extraction rules for gallery index (list) pages of tuku.ent.china.com.
 *
 * 'tag0' captures each entry's link and 'tag1' its title; the '_brp' and
 * '_arp' entries are pairs of (patterns, replacements) arrays.
 * NOTE(review): presumably applied before / after extraction by the
 * parser - confirm against FcHtmlParse.
 */
$site_list = array(
    'aname'    => '中华网娱乐图库',
    'domain'   => 'tuku.ent.china.com',
    'gettype'  => 'default',
    'creg'     => '/(?si)<div class="PicGroupListIndex">(.*?)<div class="MainBtm"><\/div>/',
    'code'     => 'utf-8',
    'reg_next' => '/(?si)<li class="next"><a href="([^"]+?)" target="_self">下一页<\/a><\/li>/',
    // Link. The link text is matched with [^<>] instead of [^<^>]: the extra
    // "^" was a literal caret, so titles containing "^" (e.g. "^_^") made the
    // whole entry fail to match.
    'tag0'     => '/(?is)<div class="ImgTitle"><a href="([^"]*?)" target="_blank"><span>.*?<\/span>[^<>]*?<\/a><\/div>/',
    // Point the link at the XML version of the page. "l?" makes ".html"
    // become ".xml" too, instead of the broken ".xmll" that a plain "\.htm"
    // replacement produced.
    'tag0_brp' => array(
        array(
            '/(?is)\.html?/',
        ),
        array(
            '.xml'
        )
    ),
    // Title (same [^<>] correction as in 'tag0').
    'tag1'     => '/(?is)<div class="ImgTitle"><a href="[^"]*?" target="_blank"><span>.*?<\/span>([^<>]*?)<\/a><\/div>/',
    'tag1_arp' => array(
        array(
            '/(?is)\(图\)/',
            '/(?is)\"/',
            '/(?is)独家:/',
            // NOTE(review): identical to the previous pattern; possibly one of
            // the two was meant to use a full-width colon - confirm.
            '/(?is)独家:/',
            '/(?is)(《|》)/',
        ),
        array(
            '', '', '', '', '',
        )
    ),
);
// Pairs each list-entry field name (key) with the $site_list rule that
// supplies it: 'tag0' captures the link, 'tag1' the title.
$list_map = array();
$list_map['url']   = 'tag0';
$list_map['title'] = 'tag1';
// Empty rule set; nothing in the code shown here reads it.
// NOTE(review): presumably a placeholder for sub-list page rules - confirm.
$site_list_sub = array();
精彩图集
精彩文章






