龙盟编程博客 | 无障碍搜索 | 云盘搜索神器
快速搜索
主页 > web编程 > php编程 >

英文分句/分段落

时间:2014-07-22 14:49来源: 作者: 点击:
分享到:
PHP环境下,指出以往分句时的一个误区:分句不用考虑小数点,也不用考虑域名,因为标准的英文句子在句号后面要加空格,唯一要考虑的就是 Mr. Li 这种称谓缩写。先采用分段落的方式,是考虑到有些引用采用冒号结尾。
PHP环境下,指出以往分句时的一个误区:分句不用考虑小数点,也不用考虑域名,因为标准的英文句子在句号后面要加空格,唯一要考虑的就是 Mr. Li 这种称谓缩写。
先采用分段落的方式是考虑到有些引用采用冒号结尾。
<?php

/*TWWY'S ART*/

/**
 * Split a text into paragraphs on line breaks.
 *
 * Any run of CR and/or LF characters ("\r\n", "\n", "\r", or several of
 * them in a row) is treated as one separator, and empty pieces are dropped.
 * Lines that contain only spaces or tabs are NOT dropped.
 *
 * @param string $text Text to split.
 * @return array List of non-empty paragraph strings, in their original order.
 */
function break_passage($text){
	// The previous "\r|\n|\r\n" alternation could never reach its "\r\n"
	// branch ("\r" always matched first). A character class yields the same
	// pieces under PREG_SPLIT_NO_EMPTY without the dead branch or the
	// unused capture group.
	return preg_split('/[\r\n]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * Split English text into sentences.
 *
 * A split happens on the whitespace that follows ".", "!" or "?" (optionally
 * followed by a closing quote). Because a sentence terminator must be followed
 * by whitespace, decimals such as "3.14" and domain names such as
 * "example.com" are never split. Whitespace after the honorifics Mr., Mrs.,
 * Ms., Jr., Dr., Prof. and Sr. (matched case-insensitively) is not a split
 * point.
 *
 * @param string $text Text to split; expected to be a single paragraph.
 * @return array List of sentences with the separating whitespace removed.
 */
function break_sentence($text){
	// Each abbreviation is anchored with \b so that only the standalone word
	// counts. Without the anchor, any word that merely ends in the same
	// letters (for example "problems." or "items." versus "Ms.") blocked the
	// split and glued two sentences together.
	$re = '/# Split sentences on whitespace between them.
    (?<=                # Begin positive lookbehind.
      [.!?]             # Either an end of sentence punct,
    | [.!?][\'"]        # or end of sentence punct and quote.
    )                   # End positive lookbehind.
    (?<!                # Begin negative lookbehind.
      \bMr\.            # Skip either "Mr."
    | \bMrs\.           # or "Mrs.",
    | \bMs\.            # or "Ms.",
    | \bJr\.            # or "Jr.",
    | \bDr\.            # or "Dr.",
    | \bProf\.          # or "Prof.",
    | \bSr\.            # or "Sr.",
                        # or... (extend the list as needed).
    )                   # End negative lookbehind.
    \s+                 # Split on whitespace between sentences.
    /ix';
	$sentences = preg_split($re, $text, -1, PREG_SPLIT_NO_EMPTY);
	return $sentences;
}

/**
 * Split text into sentences, paragraph by paragraph (recommended entry point).
 *
 * The text is first cut into paragraphs with break_passage() and each
 * paragraph is then cut into sentences with break_sentence(), so a line break
 * always ends a sentence even when the line has no closing punctuation
 * (for example a quotation introduced by a colon).
 *
 * @param string $text Text to split.
 * @return array Flat list of sentences in their original order.
 */
function get_sentence($text){
	$sentences = array();
	foreach (break_passage($text) as $paragraph) {
		// Append directly instead of calling array_merge() on every pass,
		// which recopied the whole accumulated list for each paragraph.
		foreach (break_sentence($paragraph) as $sentence) {
			$sentences[] = $sentence;
		}
	}
	return $sentences;
}

?>
精彩图集

赞助商链接