龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

多线程 QQ 号码爬虫

时间:2014-07-22 14:48来源: 作者: 点击:
分享到:
通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后在爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。这里仅仅给出怕中代码片段,你可以进一
通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后在爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。
这里仅仅给出怕中代码片段,你可以进一步优化,将QQ分类存储。通过QQ相互浏览关系,可以通过绘图工具绘制好友网络。等等
欢迎跟过讨论,请加QQ群:128659835 注明“读者”

代码涉及pthreads 如果不清楚请阅读:《PHP 高级编程之多线程》
http://netkiller.github.io/journal/thread.php.html
<?php
/*
Homepage: http://netkiller.github.io
Author: Neo <netkiller@msn.com>
*/
if(!extension_loaded('pthreads')) die ('Please install pthreads');

include_once('Snoopy.class.php');

class CrawlerWorker extends Worker {

	protected  static $dbh;
	public function __construct() {

	}
	public function run(){
	/*
		$dbhost = 'db.example.com';			// 数据库服务器
	    $dbuser = 'example.com';        	// 数据库用户名
        $dbpw = 'password';             	// 数据库密码
		$dbname = 'example';				// 数据库名

		self::$dbh  = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
			PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
			PDO::MYSQL_ATTR_COMPRESS => true,
			PDO::ATTR_PERSISTENT => true
			)
		);
	*/
	}
	protected function getInstance(){
        return self::$dbh;
    }

}

/* the collectable class implements machinery for Pool::collect */
class Crawler extends Stackable {
	public $depth = 3;
	private static $level = 0;
	public function __construct($qq) {
		$this->qq = $qq;
	}
	public function run() {

		try {
			$dbh  = $this->worker->getInstance();
			$this->recursion(array($this->qq));
		}
		catch(PDOException $e) {
			$error = sprintf("%s,%s\n", $mobile, $id );
			file_put_contents("mobile_error.log", $error, FILE_APPEND);
		}
		//printf("runtime: %s, %s\n", date('Y-m-d H:i:s'), $this->worker->getThreadId());
		//$lst = $this->qzone($this->qq);
		//print_r($lst);
	}
	public function recursion($qqs){
		
		if( self::$level <= $this->depth){
			self::$level++;
		}else if(self::$level > 0){
			self::$level--;
		}
		printf("Level: %s\n", self::$level);
		//sleep(1);
		usleep(mt_rand(10000,1000000));
		if(self::$level >= $this->depth){
			return;
		}
		
		foreach($qqs as $uin) {
			$lst = $this->qzone($uin);
			print_r($lst);
			$this->recursion($lst);
		}
	}

	public function qzone($qq){
		$url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin='.$qq.'&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';
		$snoopy = new Snoopy;
		 
		// need an proxy?
		//$snoopy->proxy_host = "my.proxy.host";
		//$snoopy->proxy_port = "8080";
		 
		// set browser and referer:
		$snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
		$snoopy->referer = "http://m.qzone.com/";
		 
		// set some cookies:
		//$snoopy->cookies["SessionID"] = '238472834723489';
		//$snoopy->cookies["favoriteColor"] = "blue";
		 
		// set an raw-header:
		$snoopy->rawheaders["Pragma"] = "no-cache";
		 
		// set some internal variables:
		$snoopy->maxredirs = 2;
		$snoopy->offsiteok = false;
		$snoopy->expandlinks = false;
		 
		// set username and password (optional)
		//$snoopy->user = "joe";
		//$snoopy->pass = "bloe";
		 
		// fetch the text of the website www.google.com:
		if($snoopy->fetchtext($url)){ 
			// other methods: fetch, fetchform, fetchlinks, submittext and submitlinks

			// response code:
			//print "response code: ".$snoopy->response_code."<br/>\n";
		 
			// print the headers:
			//print "<b>Headers:</b><br/>";
			//while(list($key,$val) = each($snoopy->headers)){
			//	print $key.": ".$val."<br/>\n";
			/
      
精彩图集

赞助商链接