Blankspider PHP 爬虫
目录: robots.php、iptable.conf、./robots。iptable.conf 内容: http://www.baidu.com、http://www.sogou.com、http://www.iapall.com、http://www.oschina.net
目录:
robots.php
iptable.conf
./robots
iptable.conf
<<------------ 内容 ------
http://www.baidu.com
http://www.sogou.com
http://www.iapall.com
http://www.oschina.net
--------- 内容 -------->>
robots.php
iptable.conf
./robots
iptable.conf
<<------------ 内容 ------
http://www.baidu.com
http://www.sogou.com
http://www.iapall.com
http://www.oschina.net
--------- 内容 -------->>
/**
 * Decode an HTTP "chunked" transfer-encoded body.
 *
 * Private helper for spider(). If the data does not look like valid chunked
 * framing, the raw input is returned unchanged so nothing is lost.
 *
 * @param string $raw Body bytes exactly as read from the socket.
 * @return string The de-chunked body.
 */
function spider_decode_chunked($raw)
{
    $decoded = '';
    $offset  = 0;
    $length  = strlen($raw);

    while ($offset < $length) {
        $eol = strpos($raw, "\r\n", $offset);
        if ($eol === false) {
            break;
        }

        // The size line may carry chunk extensions after a ';'.
        $sizeLine = substr($raw, $offset, $eol - $offset);
        $semi = strpos($sizeLine, ';');
        if ($semi !== false) {
            $sizeLine = substr($sizeLine, 0, $semi);
        }
        $sizeLine = trim($sizeLine);

        if ($sizeLine === '' || !ctype_xdigit($sizeLine)) {
            // Not chunked framing after all: hand back the untouched body.
            return $raw;
        }

        $size = (int) hexdec($sizeLine);
        if ($size === 0) {
            // Terminating zero-length chunk; anything after it is trailers.
            break;
        }

        $decoded .= (string) substr($raw, $eol + 2, $size);
        // Skip the chunk data plus the CRLF that follows it.
        $offset = $eol + 2 + $size + 2;
    }

    return $decoded;
}

/**
 * Fetch the response body of $url with a plain HTTP GET over a raw socket.
 *
 * @param string $url     URL to fetch; must contain a host.
 * @param string $spider  Value sent in the custom "Spider" request header.
 * @param int    $port    TCP port to connect to.
 * @param int    $timeout Connect and read timeout, in seconds.
 * @return string The response body, or an error message when the host is
 *                missing or the connection cannot be opened.
 */
function spider($url, $spider = 'Blankspider', $port = 80, $timeout = 15)
{
    $resolve = parse_url($url);
    // parse_url() returns false for seriously malformed URLs.
    $host = (is_array($resolve) && isset($resolve['host'])) ? $resolve['host'] : '';
    if (empty($host)) {
        return 'Requested host name can\'t be empty';
    }

    // Keep the query string even when the URL has no explicit path.
    $path = empty($resolve['path']) ? '/' : $resolve['path'];
    if (!empty($resolve['query'])) {
        $path .= '?' . $resolve['query'];
    }

    $fp = fsockopen($host, $port, $errno, $errstr, $timeout);
    if (!$fp) {
        return $errstr;
    }

    $request  = "GET $path HTTP/1.1\r\n";
    $request .= "Accept: */*\r\n";
    $request .= "Host: $host\r\n";
    $request .= "Spider: $spider\r\n";
    $request .= "Connection: Close\r\n\r\n";

    stream_set_blocking($fp, 1);
    stream_set_timeout($fp, $timeout);
    fputs($fp, $request);

    // Consume the response headers up to the blank separator line.
    $chunked = false;
    while (!feof($fp)) {
        $line = fgets($fp);
        if ($line === false) {
            // Read timeout or error: feof() stays false in that case, so
            // bail out instead of looping forever.
            break;
        }
        if ($line === "\r\n" || $line === "\n") {
            break;
        }
        if (stripos($line, 'Transfer-Encoding:') === 0 && stripos($line, 'chunked') !== false) {
            $chunked = true;
        }
    }

    // Read the body until the server closes the connection.
    $content = '';
    while (!feof($fp)) {
        $piece = fgets($fp, 8192);
        if ($piece === false) {
            break;
        }
        $content .= $piece;
    }
    fclose($fp);

    // An HTTP/1.1 server may answer with chunked framing; strip it so the
    // caller gets the actual document rather than chunk-size lines.
    return $chunked ? spider_decode_chunked($content) : $content;
}

/**
 * Fetch every URL listed in $iptable (one per line) and store each response
 * body under robots/<site>/<timestamp>.txt.
 *
 * @param string $iptable Path to the URL list file.
 * @param int    $sleep   Seconds to pause between two consecutive fetches.
 * @return string JSON object with "status" ("ok" or "error") and "description".
 */
function cron2spider($iptable, $sleep = 5)
{
    set_time_limit(0);
    date_default_timezone_set('PRC');

    if (!file_exists($iptable)) {
        return json_encode(array(
            'status'      => 'error',
            'description' => 'iptable.conf file not exists'));
    }

    $file = file($iptable);
    if (empty($file)) {
        return json_encode(array(
            'status'      => 'error',
            'description' => 'iptable.conf can\'t be empty'));
    }

    // The output directory does not depend on the entry, so check it once.
    if (!file_exists('robots') || !is_writable('robots')) {
        return json_encode(array(
            'status'      => 'error',
            'description' => 'directory doesn\'t exist or don\'t have write permissions'));
    }

    $fetched = false;
    foreach ($file as $entry) {
        $url = preg_replace('/\s/', '', $entry);
        if ($url === '' || $url === null) {
            // Blank line in the list: nothing to fetch.
            continue;
        }

        $dir = 'robots/' . preg_replace('/(http\:\/\/)|(\s)|(www\.)/', '', $entry);
        // Recursive so that URLs containing a path get their nested directory.
        if (!file_exists($dir) && !mkdir($dir, 0777, true)) {
            continue;
        }

        // Pause between fetches only, not after the last one.
        if ($fetched) {
            sleep($sleep);
        }
        $fetched = true;

        file_put_contents($dir . '/' . date('Y.m.d.H.i.s') . '.txt', spider($url));
    }

    return json_encode(array(
        'status'      => 'ok',
        'description' => 'robots program execution success'));
}

echo cron2spider('iptable.conf');
- 上一篇:模仿新浪微博换一换
- 下一篇:php版 筛选需要的文件,生成站点同样形式的目录
精彩图集
精彩文章