龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

URL抓取工具

时间:2014-07-22 14:48来源: 作者: 点击:
分享到:
有需要csdn免积分下载、pudn免积分下载、51cto免积分,请到http://www.itziy.com/ 。在命令行下执行,直接用php调用将显示使用方式。功能说明:1.支持代理;2.支持设置递归检查次数;3.支持输出类型控制、检查内容控制。
有需要csdn免积分下载、pudn免积分下载、51cto免积分,请到http://www.itziy.com/
请在命令行下执行;不带参数直接用 php 调用脚本时,将显示使用方式。
功能说明
1.支持代理
2.支持设置递归检查次数
3.支持输出类型控制、检查内容控制

作用:
主要用来代替肉眼,尽量多地抓取页面中可能存在的请求包及 url 地址等,方便渗透测试。
<?php
// ---------------------------------------------------------------------------
// Runtime configuration
// ---------------------------------------------------------------------------
// Report errors, warnings and parse errors only; notices are not reported.
error_reporting(E_ERROR | E_WARNING | E_PARSE);
ini_set('memory_limit','1024M');
set_time_limit(0);

// Feature switches read by work() and check_valid_url().
define('CHECK_A_TAG', false);  // probe and save the targets of <a href="..."> links
define('CHECK_JS_TAG', true);  // fetch <script src="..."> files and look for $checkArr entries
define('CHECK_URL', true);     // enable the absolute-URL scan of the page text
define('SAVE_ERROR', true);    // append failing URLs to error.txt next to this script

// Substrings searched for (case-sensitively) in fetched script files; a hit
// marks the script as containing an ajax request.
$checkArr = array(
    '$.load',
    '.ajax',
    '$.post',
    '$.get',
    '.getJSON'
);

// ---------------------------------------------------------------------------
// Command line handling: php debug.php url [num] [filename] [header] [proxy]
// ---------------------------------------------------------------------------
if ($argc < 2)
{
    showerror('sorry, parameter error', array('example: php debug.php url num filename header proxy', 'detail information:', 'url: target url address which you want to check it', 'num: The number of pages of recursive,default 3', 'filename: output filename default name ret.txt', 'header: The request header file default null', 'proxy: if you want to use proxy set it here default no use proxy'));
    // showerror() returns nothing, so exit explicitly with a failure status
    // instead of passing its null result to die().
    exit(1);
}

if (!check_extension())
{
    showerror('extension curl not support', 'please open php curl extension support');
    exit(1);
}

// Global variables shared with work() and check_valid_url().
$url = trim($argv[1]);
// Only prepend a scheme when the argument does not already start with one.
// A plain substring search for "http" would be fooled by URLs such as
// "example.com/http-test".
if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url))
    $url = 'http://'.$url;
$num = isset($argv[2]) ? intval($argv[2]) : 3;
$output = isset($argv[3]) ? trim(str_replace("\\", '/', $argv[3])) : str_replace("\\", '/', dirname(__FILE__)).'/ret.txt';
$header = null;
$proxy = null;
$host = null;

if (isset($argv[4]))
{
    // The header file may be given as-is or relative to this script's directory.
    $headerFile = trim(str_replace("\\", '/', $argv[4]));
    $candidates = array(
        $headerFile,
        str_replace("\\", '/', dirname(__FILE__)).'/'.$headerFile
    );
    foreach ($candidates as $candidate)
    {
        // is_file() rather than file_exists(): a directory is not a header file.
        if (!is_file($candidate))
            continue;
        $content = file_get_contents($candidate);
        if ($content !== false)
        {
            // One header per line; blank lines are dropped and the list is
            // re-indexed so that cURL receives a plain list.
            $header = array_values(array_filter(array_map('trim', explode("\n", str_replace("\r", '', $content)))));
        }
        break;
    }
}

if (isset($argv[5]))
    $proxy = trim($argv[5]);

// Normalise "no usable headers" to null.
if (!is_array($header) || empty($header))
    $header = null;
$result = check_valid_url($url);
$outputArr = array();

if (!empty($result))
{
    $tmpArr = parse_url($url);
    if (!isset($tmpArr['host']))
    {
        showerror('parse url error', 'can not get host form url: '.$url);
        exit(1);
    }
    // Base used to resolve relative links. Keep the scheme and port of the
    // start URL so https sites and non-default ports are crawled correctly.
    $host = (isset($tmpArr['scheme']) ? $tmpArr['scheme'] : 'http').'://'.$tmpArr['host'];
    if (isset($tmpArr['port']))
        $host .= ':'.$tmpArr['port'];
    unset($tmpArr);

    // The start page itself is always recorded.
    if (!isset($outputArr[md5($url)]))
    {
        $outputArr[md5($url)] = $url;
        file_put_contents($output, $url."\n", FILE_APPEND);
        echo 'url: ',$url,' find ajax require so save it',PHP_EOL;
    }
    // work() strips line breaks from the page itself.
    work($result);
}
echo 'run finish',PHP_EOL;

/**
 * Scan one fetched page and, unless $reverse is set, crawl outward from it.
 *
 * The page is scanned for anchor targets, absolute URLs and script files
 * (each gated by CHECK_A_TAG / CHECK_URL / CHECK_JS_TAG). When $reverse is
 * false the anchors of the page are then followed breadth-first; the global
 * $num is the number of link levels still to follow and is decremented once
 * per level.
 *
 * Fixes compared with the previous implementation:
 *  - the absolute-URL scan used capture group 2 (a sub-domain fragment such
 *    as "www.") instead of the whole match, so it never produced a real URL;
 *  - relative links were glued to the host without a "/" separator;
 *  - every pass of the $num loop re-scanned the same page, so the recursion
 *    count only repeated identical requests and never reached new pages;
 *  - URLs are now fetched at most once per check instead of once per pass.
 *
 * @param string|false $result  Raw response text as returned by check_valid_url().
 * @param bool         $reverse When true only this page is scanned; its links are not followed.
 * @return void
 */
function work($result, $reverse = false)
{
    global $num;

    if (!$result || $num <= 0)
        return;

    // Tags may span several lines; the patterns are applied to one long line.
    $page = str_replace(array("\r", "\n"), '', $result);
    work_scan_page($page);

    if ($reverse)
        return;

    $visited = array();
    $frontier = work_page_links($page);
    while ($num > 0 && !empty($frontier))
    {
        echo 'check for next page, remain page counts: ',$num,PHP_EOL;
        $next = array();
        foreach ($frontier as $link)
        {
            if (isset($visited[$link]))
                continue;
            $visited[$link] = true;

            echo 'check for next page: ',$link,PHP_EOL;
            $body = check_valid_url($link);
            if (!$body)
                continue;
            $body = str_replace(array("\r", "\n"), '', $body);
            work_scan_page($body);

            // Only collect the next level when there is one left to crawl.
            if ($num > 1)
            {
                foreach (work_page_links($body) as $child)
                {
                    if (!isset($visited[$child]))
                        $next[$child] = $child;
                }
            }
        }
        $frontier = array_values($next);
        $num--;
        // Pause between levels, as the original loop did between passes.
        if ($num > 0 && !empty($frontier))
            sleep(3);
    }
}

/**
 * Run the three configured checks (anchors, absolute URLs, scripts) on one page.
 *
 * @param string $page Page text with line breaks already removed.
 * @return void
 */
function work_scan_page($page)
{
    global $num;

    echo 'remain: ',$num,' now start to check for url address',PHP_EOL,PHP_EOL;
    if (CHECK_A_TAG)
    {
        foreach (work_page_links($page) as $link)
            work_probe_and_save($link);
    }

    //check for page url
    echo 'remain: ',$num,' now start to check for page url',PHP_EOL,PHP_EOL;
    if (CHECK_URL)
    {
        // Whole match (group 0): scheme, then everything up to whitespace,
        // a quote, an angle bracket, a parenthesis or a backslash.
        $urlPattern = '/(?:https?|ftp|mms):\/\/[^\s"\'<>()\\\\]+/i';
        foreach (work_collect($urlPattern, 0, $page) as $link)
            work_probe_and_save($link);
    }

    //check for javascript ajax require
    echo 'remain: ',$num,' now start to check for javascript ajax require',PHP_EOL,PHP_EOL;
    if (CHECK_JS_TAG)
    {
        $scriptPattern = '/<script.*?src\s*=\s*("|\'|\s)+(.*?)("|\'|\s)+.*?>/i';
        foreach (work_collect($scriptPattern, 2, $page) as $link)
            work_check_script($link);
    }
}

/**
 * Return the resolved, de-duplicated targets of the anchors found in a page.
 *
 * @param string $page Page text with line breaks already removed.
 * @return array List of absolute URLs.
 */
function work_page_links($page)
{
    return work_collect('/<a.*?href\s*=\s*("|\'|\s)+(.*?)("|\'|\s)+.*?>/i', 2, $page);
}

/**
 * Apply a pattern to a page and return the chosen capture group as absolute URLs.
 *
 * @param string $pattern PCRE pattern.
 * @param int    $group   Index of the capture group holding the link.
 * @param string $page    Page text.
 * @return array De-duplicated list of absolute URLs, in order of first appearance.
 */
function work_collect($pattern, $group, $page)
{
    $links = array();
    if (!preg_match_all($pattern, $page, $match) || empty($match[$group]))
        return $links;

    foreach ($match[$group] as $raw)
    {
        $link = work_resolve_url($raw);
        if ($link !== null)
            $links[$link] = $link;
    }
    return array_values($links);
}

/**
 * Turn a link as written in the page into an absolute URL.
 *
 * Relative links are resolved against the global $host, i.e. against the
 * site root. work() is not told the URL of the page it scans, so
 * document-relative paths ("../x", "x.html" inside a sub-directory) cannot
 * be resolved more precisely here.
 *
 * @param string $link Raw attribute value or matched URL.
 * @return string|null Absolute URL, or null when the link cannot be fetched.
 */
function work_resolve_url($link)
{
    global $host;

    $link = trim($link);
    // Empty values and in-page fragments do not point to another resource.
    if ($link === '' || $link[0] === '#')
        return null;
    // Pseudo-schemes that are not fetchable.
    if (preg_match('/^(?:javascript|mailto|tel|data):/i', $link))
        return null;
    // Already absolute.
    if (preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $link))
        return $link;
    // Protocol-relative: reuse the scheme of the start URL.
    if (strpos($link, '//') === 0)
    {
        $scheme = parse_url((string)$host, PHP_URL_SCHEME);
        return ($scheme ? $scheme : 'http').':'.$link;
    }
    return rtrim((string)$host, '/').'/'.ltrim($link, '/');
}

/**
 * Record a URL in $outputArr and in the output file, once.
 *
 * @param string $link Absolute URL.
 * @return void
 */
function work_save_url($link)
{
    global $outputArr, $output;

    $key = md5($link);
    if (isset($outputArr[$key]))
        return;
    $outputArr[$key] = $link;
    file_put_contents($output, $link."\n", FILE_APPEND);
    echo 'url: ',$link,' find ajax require so save it',PHP_EOL;
}

/**
 * Save a URL when it can be fetched; every URL is probed at most once.
 *
 * @param string $link Absolute URL.
 * @return void
 */
function work_probe_and_save($link)
{
    global $outputArr;
    static $probed = array();

    $key = md5($link);
    if (isset($outputArr[$key]) || isset($probed[$key]))
        return;
    $probed[$key] = true;
    if (check_valid_url($link))
        work_save_url($link);
}

/**
 * Fetch a script file and save its URL when it contains one of $checkArr.
 * Every script is fetched at most once.
 *
 * @param string $link Absolute URL of the script.
 * @return void
 */
function work_check_script($link)
{
    global $outputArr, $checkArr;
    static $checked = array();

    $key = md5($link);
    if (isset($outputArr[$key]) || isset($checked[$key]))
        return;
    $checked[$key] = true;

    $body = check_valid_url($link);
    if (!$body)
        return;
    foreach ($checkArr as $needle)
    {
        if (strpos($body, $needle) !== false)
        {
            work_save_url($link);
            break;
        }
    }
}

/**
 * Fetch a URL with cURL and return the raw response when it looks usable.
 *
 * The request follows redirects, sends a Googlebot user agent and, when the
 * globals $header / $proxy are set, the configured request headers and proxy.
 * Because CURLOPT_HEADER is enabled the returned string contains the
 * response headers followed by the body.
 *
 * Fixes compared with the previous implementation:
 *  - $header and $proxy were never imported with "global", so the header
 *    file and proxy given on the command line were silently ignored;
 *  - success was detected by searching the response for the text "200 OK",
 *    which rejects HTTP/2 responses (status line "HTTP/2 200") and accepts
 *    error pages that merely contain that text; the numeric status code
 *    reported by cURL is used instead;
 *  - the scheme test matched "http" anywhere in the URL.
 *
 * @param string $url URL to fetch; "http://" is prepended when no scheme is present.
 * @return string|false Headers and body on success, false on any failure.
 */
function check_valid_url($url)
{
    global $header, $proxy;

    if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url))
        $url = 'http://'.$url;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    if (is_array($header) && !empty($header))
        curl_setopt($ch, CURLOPT_HTTPHEADER, array_values($header));
    if (is_string($proxy) && $proxy !== '')
        curl_setopt($ch, CURLOPT_PROXY, $proxy);

    $ret = curl_exec($ch);
    $errinfo = curl_error($ch);
    // Status of the last response after redirects have been followed.
    $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    unset($ch);

    // 302 is still accepted for the case where a redirect was not followed.
    $statusOk = ($status === 200 || $status === 302);
    // Responses mentioning 114so.cn are treated as failures, as before.
    $hijacked = is_string($ret) && strpos($ret, '114so.cn') !== false;

    if ($ret === false || !empty($errinfo) || !$statusOk || $hijacked)
    {
        showerror('check url: '.$url. ' find some errors', array($errinfo, $ret));
        if (SAVE_ERROR)
            file_put_contents(dirname(__FILE__).'/error.txt', $url."\n", FILE_APPEND);
        return false;
    }
    return $ret;
}

/**
 * Tell whether the cURL extension can be used.
 *
 * @return bool True only when curl_init() exists and the "curl" extension is loaded.
 */
function check_extension()
{
    return function_exists('curl_init') && extension_loaded('curl');
}

/**
 * Print a message framed by two lines of "#" characters.
 *
 * @param string       $t Title, printed on the first line inside the frame.
 * @param string|array $c Details: a string is printed as one line, a non-empty
 *                        array as one line per element; anything else is ignored.
 * @return void Nothing is returned; the text is echoed.
 */
function showerror($t, $c)
{
    $bar = "#########################################################################\n";
    $prefix = "#  ";

    $pieces = array($bar, $prefix.$t."\n");
    if (is_string($c))
    {
        // A string detail carries no line break of its own; the closing
        // piece below starts with one.
        $pieces[] = $prefix.$c;
    }
    elseif (is_array($c) && !empty($c))
    {
        foreach ($c as $detail)
            $pieces[] = $prefix.$detail."\n";
    }
    $pieces[] = "\n".$bar;

    echo implode('', $pieces);
}
精彩图集

赞助商链接