从请求的页面提取关键词

时间:2014-07-22 14:48来源: 作者: 点击: 次

分享到：

它可以从给定的 URL 检索网页，并从中提取出一些关键词。例如，从代码珠玑的首页可以提取出类似下图中的关键词。

它可以从给定的 URL 检索网页，并从中提取出一些关键词。

例如，从代码珠玑的首页可以提取出类似下图中的关键词：

<?php 
 
// Demo page: when a URL has been submitted, fetch that page, extract its
// keywords with the keywordsugest class and print them as a numbered list.
// The submission form is always rendered below the results.
if (!empty($_REQUEST["url"]) && is_string($_REQUEST["url"])) {

        $url = trim($_REQUEST['url']);

        // The URL comes straight from the client and ends up in
        // file_get_contents(); restrict it to http(s) so the form cannot be
        // used to read local files or php:// / file:// stream wrappers.
        // NOTE(review): this does not stop requests to internal hosts (SSRF);
        // add a host allow-list before exposing this page publicly.
        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

        if ($scheme === 'http' || $scheme === 'https') {

                include 'class.keywords.php';

                $keywords = new keywordsugest();
                $keywords->_lang = 'es';
                $keywords->_encoding = 'iso-8859-1';
                $keywords->_catego = 'telecom';
                $keywords->_keyCount = 100; // behaves like a percentage in the cut-off calculation
                $keywords->file($url);

                #$keywords->readMetaKeyWords();

                #$keywords->readHtmlKeyWords();

                $keywords->readAll();

                echo 'Keywords found :<br><br>';

                $i = 1;

                // Keywords originate from a remote page, so escape them before
                // writing them into our own HTML.
                foreach ($keywords->get() as $word) {
                        echo $i++ . '. ' . htmlspecialchars((string) $word, ENT_QUOTES | ENT_SUBSTITUTE, $keywords->_encoding) . '<br>';
                }
        } else {
                echo 'Please enter a valid http(s) URL.<br><br>';
        }
}
// Example URL: http://www.codepearl.com
echo "<form method='post'><input type='text' name='url'><input type='submit'></form>";
         
 
?>

3. [代码][PHP]代码跳至 [2] [3] [全屏预览]

<?php
   
  /**
   * Extracts a ranked keyword list from an HTML document.
   *
   * Typical use: configure the public properties, load a document with
   * file() or html(), call readAll() (or one of the read*KeyWords()
   * methods) and fetch the result with get().
   *
   * readHtmlKeyWords() includes two dictionary files resolved through the
   * normal include lookup:
   *   - stopwords_<lang>.php            expected to define $stopWords
   *   - glodic_<catego>_<lang>.php      expected to define $glodic
   * NOTE(review): both are presumably flat arrays of lower-case strings;
   * the files are not part of this listing, so confirm before relying on it.
   */
  class keywordsugest{

        /** @var string|false Document text; FALSE until file()/html() supplies one. Rewritten in place by readHtmlKeyWords(). */
        public $_html        =    FALSE;
        /** @var int Weight used in the cut-off calculation of readHtmlKeyWords() (acts like a percentage). */
        public $_keyCount    =    5;
        /** @var array Keywords collected so far. */
        public $_keyWords    =    array();
        /** @var string Character encoding passed to the mb_* and entity functions. */
        public $_encoding    =    'UTF-8';
        /** @var string Language code used to build the dictionary file names. */
        public $_lang        =    'es';
        /** @var string Category used to build the glossary file name. */
        public $_catego      =    'telecom';
        /** @var string Last location passed to file(). */
        public $_url         =    '';

        /**
        * # read meta keywords
        *
        * Looks for a <meta name="keywords" content="..."> tag in the loaded
        * document and stores its comma-separated values, lower-cased, trimmed
        * and de-duplicated, as the current keyword list. Does nothing when no
        * document is loaded or no such tag is found.
        */
        public function readMetaKeyWords()    {

            if (! $this->_html) return;

            $found = preg_match('/<[\s]*meta[\s]*name[\s]*=[\s]*\"[\s]*keywords[\s]*\"[\s]*content[\s]*=[\s]*\"?([^>"]*)\"?[\s]*[\/]?[\s]*>/is', $this->_html, $match);

            if ($found !== 1) return;

            // Lower-case first, then turn every whitespace character into a
            // plain space so multi-line content attributes stay one keyword.
            $content = preg_replace('/\s/i', ' ', mb_strtolower($match[1], $this->_encoding));

            $keywords = array();

            foreach (explode(',', (string) $content) as $keyword)    {
                // "a, b" must yield "b", not " b"; empty entries are dropped.
                $keyword = trim($keyword);
                if ($keyword !== '') $keywords[] = $keyword;
            }

            $this->_keyWords = array_values(array_unique($keywords));
        }

        /**
        * strip tags
        *
        * Replaces every HTML tag with a space, collapses runs of spaces and
        * trims the result. Line breaks and tabs are left untouched.
        *
        * @param mixed $string text that may contain HTML tags
        * @return string text without tags
        */
        private function rip_tags($string) {

            // ----- remove HTML TAGs -----
            $string = preg_replace ('/<[^>]*>/', ' ', $string);

            // ----- remove multiple spaces -----
            $string = trim(preg_replace('/ {2,}/', ' ', $string));

            return $string;
        }

        /**
        * Replaces every match of $pattern in $subject with a single space.
        *
        * preg_replace() returns NULL on a PCRE error (e.g. backtrack limit
        * on a very large page); in that case the subject is returned
        * unchanged instead of silently losing the whole document.
        *
        * @param string $pattern PCRE pattern
        * @param string $subject text to clean
        * @return string cleaned text, or $subject when the replacement failed
        */
        private function replaceWithSpace($pattern, $subject)    {

            $result = preg_replace($pattern, ' ', $subject);

            return ($result === null) ? $subject : $result;
        }

        /**
        * # read keywords from page body or string
        *
        * Strips markup from the loaded document, counts word frequencies,
        * drops stop words and pure punctuation, boosts words found in the
        * category glossary and stores the most frequent non-numeric words as
        * the keyword list. Keywords collected earlier (e.g. by
        * readMetaKeyWords()) are appended to the text first so they take part
        * in the count. The document held in $_html is replaced by its
        * cleaned, lower-cased text.
        */
        public function readHtmlKeyWords()    {

            if (! $this->_html) return;

            // Fold already collected keywords into the text and start over
            // with an empty list.
            if (!empty($this->_keyWords))    {
                $this->_html = $this->_html." ".implode(' ', $this->_keyWords);
                $this->_keyWords = array();
            }

            $this->_html = str_replace('&nbsp;',' ', $this->_html);

            # remove unneeded parts
            $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');

            foreach ($toRemove as $remove)    {
                $this->_html = $this->replaceWithSpace("/\<\s*$remove.*?\>.*?\<\s*\/\s*$remove\s*\>/is", $this->_html);
            }

            # remove comments
            $this->_html = $this->replaceWithSpace("/\<\s*!--.*?-->/is", $this->_html);

            # delete html tags
            $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);

            $this->_html = htmlspecialchars_decode($this->_html);

            # decode encoded hmtl entities
            $this->_html = html_entity_decode ($this->_html, ENT_COMPAT, $this->_encoding);

            # break into words
            $words = preg_split("/[\s]+|[\t]+|[\.]+|[\,]+|[\:]+|[\;]+|[\!]+|[\?]+|[\|]+/s", $this->_html, -1, PREG_SPLIT_NO_EMPTY);

            // preg_split() returns FALSE on error and an empty list for blank text.
            if (empty($words)) return;

            $frequency = array_count_values($words);
            unset($frequency['']);

            if (empty($frequency)) return;

            # delete stop words and interpunctions
            // Defaults keep the filter working (with a warning from include)
            // when a dictionary file is missing or does not define its list.
            // NOTE(review): the file names are built from public properties;
            // never assign $_lang / $_catego from request data.
            $stopWords = array();
            $glodic = array();
            include('stopwords_'.$this->_lang.'.php');
            include('glodic_'.$this->_catego.'_'.$this->_lang.'.php');

            $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';

            foreach (array_keys($frequency) as $word)    {
                // PHP turns numeric array keys into integers; compare as text.
                $text = (string) $word;

                if (in_array($text, $stopWords, true) || strspn($text, $punct) == strlen($text))    {
                    unset($frequency[$word]);
                }
            }

            // Everything was a stop word or punctuation: nothing to rank, and
            // max()/the divisions below must not run on an empty list.
            if (empty($frequency)) return;

            // Cut-off heuristic: average of a frequency-based share ($tot)
            // and a $_keyCount-based share ($tot2), each halved when it
            // exceeds the number of distinct words.
            $max = max($frequency);
            $distinct = count($frequency);
            $tot = round(($max * 100) / $distinct);
            $tot2 = round(($this->_keyCount * 100) / $distinct);
            if ($tot > $distinct)  { $tot = $tot / 2; }
            if ($tot2 > $distinct) { $tot2 = $tot2 / 2; }
            $showmax = round(($tot + $tot2) / 2);

            // Glossary words get a bonus equal to the cut-off so they rank first.
            foreach (array_keys($frequency) as $word)    {
                if (in_array((string) $word, $glodic, true))    {
                    $frequency[$word] = $frequency[$word] + $showmax;
                }
            }

            # sort by frequency
            arsort($frequency, SORT_NUMERIC);

            # add them to keyword array
            // The keyword list is empty here and array keys are unique, so no
            // duplicate check is needed while appending.
            $added = 0;

            foreach (array_keys($frequency) as $word)    {

                if (is_numeric($word) || empty($word)) continue;

                $this->_keyWords[] = (string)$word;

                $added++;

                if ($added == $showmax) break;
            }
        }


        /**
        * change the encoding from default utf-8
        *
        * @param mixed $enc encoding name; a falsy value leaves the current one
        */
        private function encoding($enc = FALSE)    {

            if ($enc) $this->_encoding = $enc;
        }


        /**
        * # reads from file or url
        *
        * Loads the document with file_get_contents() and remembers the
        * location in $_url. When the read fails, $_html becomes FALSE and the
        * read*KeyWords() methods do nothing.
        *
        * @param mixed $fileUrl path or URL to read; a falsy value is ignored
        */
        public function file($fileUrl = FALSE)    {

            if ($fileUrl)    {
                // Best effort on purpose: a failed read is reported through
                // the FALSE return value, not through a warning.
                $this->_html = @file_get_contents($fileUrl);
                $this->_url = $fileUrl;
            }
        }


        /**
        *  # define html as string
        *
        * @param mixed $page document text; a falsy value is ignored
        */
        public function html($page = FALSE)    {

            if ($page) $this->_html = $page;
        }


        /**
        * # reads both meta keywords and from body
        *
        * Runs readMetaKeyWords() and readHtmlKeyWords() when a document is
        * loaded, then leaves a de-duplicated, sequentially indexed keyword
        * list for get().
        */
        public function readAll()    {

            if ($this->_html !== FALSE)    {

                $this->readMetaKeyWords();

                $this->readHtmlKeyWords();
            }

            $this->_keyWords = array_values(array_unique($this->_keyWords));
        }


        /**
        *  # returns keywords as array
        *
        * @return array keywords collected so far
        */
        public function get()    {

            return $this->_keyWords;
        }
  }
   
?>