龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

php Blankspider PHP 爬虫

时间:2014-07-20 15:55来源:网络整理 作者:网络 点击:
分享到:
Blankspider PHP 爬虫 [代码片段(64行)]
/**
 * Fetch a URL with a hand-written HTTP/1.1 GET over a plain TCP socket and
 * return the response body.
 *
 * The request carries a custom "Spider: <name>" header identifying the crawler.
 * Response headers are discarded; everything after the blank line that ends
 * the headers is returned verbatim.
 * NOTE(review): the body is not post-processed, so a server that answers with
 * "Transfer-Encoding: chunked" will have its chunk framing left in the result.
 *
 * @param string $url     URL to fetch; only its host, path and query are used.
 * @param string $spider  Value sent in the "Spider" request header.
 * @param int    $port    TCP port to connect to.
 * @param int    $timeout Connect and read timeout, in seconds.
 *
 * @return string The response body on success. On failure an error message is
 *                returned instead (missing host, or the fsockopen() error
 *                string), so callers cannot distinguish the two by type.
 */
function spider($url, $spider = 'Blankspider', $port = 80, $timeout = 15) {
    $content = '';
    $resolve = parse_url($url);

    // parse_url() returns false on badly malformed input and omits any
    // component it cannot find, so never index the result blindly.
    $host = (is_array($resolve) && isset($resolve['host'])) ? $resolve['host'] : '';
    if (empty($host)) {
        return 'Requested host name can\'t be empty';
    }

    // Build the request target; the query string must survive even when the
    // URL has no explicit path (e.g. "http://host?x=1").
    $path = empty($resolve['path']) ? '/' : $resolve['path'];
    if (isset($resolve['query']) && $resolve['query'] !== '') {
        $path .= '?' . $resolve['query'];
    }

    $fp = fsockopen($host, $port, $errno, $errstr, $timeout);
    if (!$fp) {
        return $errstr;
    }

    // HTTP requires CRLF line endings and a blank line after the last header.
    $request  = "GET $path HTTP/1.1\r\n";
    $request .= "Accept: */*\r\n";
    $request .= "Host: $host\r\n";
    $request .= "Spider: $spider\r\n";
    $request .= "Connection: Close\r\n\r\n";

    stream_set_blocking($fp, true);
    stream_set_timeout($fp, $timeout);
    fputs($fp, $request);

    // Skip the response headers: they end at the first empty line.
    // fgets() returns false on a read timeout/error while feof() stays false,
    // so bail out explicitly instead of spinning forever.
    while (!feof($fp)) {
        $line = fgets($fp);
        if ($line === false || $line === "\r\n" || $line === "\n") {
            break;
        }
    }

    // Everything that remains is the body.
    while (!feof($fp)) {
        $chunk = fgets($fp, 8192);
        if ($chunk === false) {
            break;
        }
        $content .= $chunk;
    }

    fclose($fp);
    return $content;
}

/**
 * Crawl every URL listed in a text file and store each response under
 * robots/<url-without-scheme-and-www>/<timestamp>.txt.
 *
 * The list file holds one URL per line; blank lines are ignored. The
 * "robots" directory is resolved relative to the current working directory
 * and must already exist and be writable. Each URL is fetched with spider().
 *
 * Side effects: removes the script time limit and sets the default timezone
 * to 'PRC' (used for the timestamp in the output file name).
 *
 * @param string $iptable Path to the URL list file.
 * @param int    $sleep   Seconds to pause between two consecutive fetches.
 *
 * @return string JSON object with "status" ("ok" or "error") and "description".
 */
function cron2spider($iptable, $sleep = 5) {
    set_time_limit(0);
    date_default_timezone_set('PRC');

    if (!file_exists($iptable)) {
        return json_encode(array(
        'status'=> 'error',
        'description'=> 'iptable.conf file not exists'));
    }

    // Strip all whitespace (including the trailing newline) from every line
    // and drop lines that end up empty, so they are never treated as URLs.
    $urls = array();
    $file = file($iptable);
    if (!empty($file)) {
        foreach ($file as $line) {
            $url = preg_replace('/\s/', '', $line);
            if ($url !== '' && $url !== null) {
                $urls[] = $url;
            }
        }
    }
    if (empty($urls)) {
        return json_encode(array(
        'status'=> 'error',
        'description'=> 'iptable.conf can\'t be empty'));
    }

    // The output root does not change between iterations: check it once.
    if (!file_exists('robots') || !is_writable('robots')) {
        return json_encode(array(
        'status'=> 'error',
        'description'=> 'directory doesn\'t exist or don\'t have write permissions'));
    }

    $last = count($urls) - 1;
    foreach ($urls as $i => $url) {
        // Directory name is the URL minus scheme and "www." prefix.
        $dir = 'robots/'.preg_replace('/(https?\:\/\/)|(\s)|(www\.)/', '', $url);
        // Recursive, because a URL with a path yields nested directories.
        if (!file_exists($dir)) { mkdir($dir, 0777, true); }
        file_put_contents($dir.'/'.date('Y.m.d.H.i.s', time()).'.txt', spider($url));

        // Throttle between requests; no need to wait after the final one.
        if ($i < $last) {
            sleep($sleep);
        }
    }

    return json_encode(array(
    'status'=> 'ok',
    'description'=> 'robots program execution success'));
}

// Script entry point: crawl the URLs listed in iptable.conf (resolved against
// the current working directory) and print the JSON status report.
echo cron2spider('iptable.conf');
// This snippet comes from http://outofmemory.cn
精彩图集

赞助商链接