1 Star 0 Fork 2

no2key/scws4

forked from xinghuo/scws4 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
pscws4.class.php 30.67 KB
一键复制 编辑 原始数据 按行查看 历史
xinghuo 提交于 2013-12-23 17:45 . init
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174
<?php
/* ----------------------------------------------------------------------- *\
PHP版简易中文分词第四版(PSCWS v4.0) - 分词核心类库代码
-----------------------------------------------------------------------
作者: 马明练(hightman) (MSN: [email protected]) (php-QQ群: 17708754)
网站: http://www.ftphp.com/scws/
时间: 2007/05/20
修订: 2008/12/20
编辑: set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
-----------------------------------------------------------------------
核心类的功能:
这是 scws-1.0 (纯C实现) 的一个 PHP 实现方式, 算法和功能一样
针对输入的字符串文本执行分词, 根据词典N-路径最大概率法分词.
支持人名、地名、数字识别;能识别 .NET, C++, Q币 之类特殊词汇
支持 UTF-8/GBK 编码, 特别为搜索引擎考量而支持长词再细分的复方分词法
使用 UTF-8 可扩展到任何多字节语言分词(如日语,韩语等)
用法(主要类方法, 与 scws 之 PHP 扩展版兼容用法):
class PSCWS4 {
void close(void);
void set_charset(string charset);
bool set_dict(string dict_path);
void set_rule(string rule_path);
void set_ignore(bool set);
void set_multi(int level);
void set_debug(bool set);
void set_duality(bool set);
void send_text(string text);
mixed get_result(void);
mixed get_tops( [int limit [, string attr]] );
string version(void);
};
\* ----------------------------------------------------------------------- */
/** 词典读取代码 (xdb_r) */
require_once (dirname(__FILE__) . '/xdb_r.class.php');
/** Rule-set constants: rule bit flags and per-rule matching options */
define ('PSCWS4_RULE_MAX', 31); // at most 31 rule sections are read (PHP has no unsigned integer type)
define ('PSCWS4_RULE_SPECIAL', 0x80000000); // bit assigned to the [special] section
define ('PSCWS4_RULE_NOSTATS', 0x40000000); // bit assigned to the [nostats] section (words skipped by get_tops)
define ('PSCWS4_ZRULE_NONE', 0x00);
define ('PSCWS4_ZRULE_PREFIX', 0x01); // rule entry acts as a prefix
define ('PSCWS4_ZRULE_SUFFIX', 0x02); // rule entry acts as a suffix
define ('PSCWS4_ZRULE_INCLUDE', 0x04); // rule has an include list
define ('PSCWS4_ZRULE_EXCLUDE', 0x08); // rule has an exclude list
define ('PSCWS4_ZRULE_RANGE', 0x10); // rule has a znum range (min,max)
/** Mode bits stored in $_mode (values <= 0x800) */
define ('PSCWS4_IGN_SYMBOL', 0x01); // do not emit symbols
define ('PSCWS4_DEBUG', 0x02); // print candidate paths while segmenting
define ('PSCWS4_DUALITY', 0x04); // pair up loose single characters
/** Multi-segment policy bits stored in $_mode (values >= 0x1000) */
define ('PSCWS4_MULTI_NONE', 0x0000); // nothing
define ('PSCWS4_MULTI_SHORT', 0x1000); // split long words into short words from left to right
define ('PSCWS4_MULTI_DUALITY', 0x2000); // split every long word into overlapping two-character pieces
define ('PSCWS4_MULTI_ZMAIN', 0x4000); // also emit the main single characters (attr starts with j, n or v)
define ('PSCWS4_MULTI_ZALL', 0x8000); // also emit every single character
define ('PSCWS4_MULTI_MASK', 0xf000); // mask covering all multi-segment bits
define ('PSCWS4_ZIS_USED', 0x8000000); // OR'ed into $_zis once that single character has been emitted in a pair
/** Flags describing a run of text while scanning (pure single-byte vs. multibyte) */
define ('PSCWS4_PFLAG_WITH_MB', 0x01); // run contains multibyte characters
define ('PSCWS4_PFLAG_ALNUM', 0x02); // run is purely alphanumeric so far
define ('PSCWS4_PFLAG_VALID', 0x04); // short alphanumeric tail after multibyte text is acceptable
define ('PSCWS4_PFLAG_DIGIT', 0x08); // token started with a digit
define ('PSCWS4_PFLAG_ADDSYM', 0x10); // token already absorbed its one allowed '.' or "'"
/** Word and single-character flags */
define ('PSCWS4_WORD_FULL', 0x01); // multi-char: a complete word
define ('PSCWS4_WORD_PART', 0x02); // multi-char: prefix of a longer word
define ('PSCWS4_WORD_USED', 0x04); // multi-char: already used in a path
define ('PSCWS4_WORD_RULE', 0x08); // multi-char: recognised automatically by a rule
define ('PSCWS4_ZFLAG_PUT', 0x02); // single char: already emitted
define ('PSCWS4_ZFLAG_N2', 0x04); // single char: head of a two-character noun
define ('PSCWS4_ZFLAG_NR2', 0x08); // single char: word head that is also a two-character person name
define ('PSCWS4_ZFLAG_WHEAD', 0x10); // single char: head of a word
define ('PSCWS4_ZFLAG_WPART', 0x20); // single char: middle or tail of a word
define ('PSCWS4_ZFLAG_ENGLISH', 0x40); // single char: single-byte (English) run embedded in multibyte text
define ('PSCWS4_ZFLAG_SYMBOL', 0x80); // single char: symbol
define ('PSCWS4_MAX_EWLEN', 16); // maximum length in bytes of one single-byte token
define ('PSCWS4_MAX_ZLEN', 128); // maximum number of characters scanned into one segment
/** 主类库代码 */
class PSCWS4
{
var $_xd; // xdb dictionary handler (XDB_R instance), or false when no dictionary is open
var $_rs; // rule sections: section name => settings (tf, idf, attr, bit, flag, zmin, zmax, inc, exc)
var $_rd; // rule data: entry string => reference to its section settings in $_rs
var $_cs = ''; // current charset name (lower-cased)
var $_ztab; // table mapping a lead byte value (0..255) to the byte length of the character
var $_mode = 0; // bit field of PSCWS4_* mode and multi-segment flags
var $_txt; // text being segmented
var $_res; // results collected by the current get_result() call
var $_zis; // duality mode: index of the pending single character (-1 = none), possibly OR'ed with PSCWS4_ZIS_USED
var $_off = 0; // byte offset in $_txt where the next get_result() call starts
var $_len = 0; // byte length of $_txt
var $_wend = 0; // duality mode: index just past the last multi-character word that was emitted
var $_wmap; // word map: [i][j] holds the dictionary/rule record for characters i..j, or false
var $_zmap; // character map: [i] holds the 'start' and 'end' byte offsets of character i
// Legacy initializer: clears the dictionary handle and the rule tables,
// then selects the character set ($charset defaults to 'gbk').
function CPSCWS4($charset = 'gbk')
{
    $this->_rd = array();
    $this->_rs = array();
    $this->_xd = false;
    $this->set_charset($charset);
}
// PHP 5+ constructor.
// $charset: character set name handed to the legacy initializer
// (defaults to 'gbk'). Previously the argument was not accepted at all, so
// "new PSCWS4('utf8')" silently produced a GBK segmenter.
function __construct($charset = 'gbk')
{
    $this->CPSCWS4($charset);
}
// Destructor: releases the dictionary and the rule tables via close().
function __destruct()
{
    $this->close();
}
// Select the character set and rebuild the lead-byte length table ($_ztab).
// $charset: 'utf-8' / 'utf8' selects the UTF-8 table; any other name selects
// the double-byte (GBK / BIG5) table. Nothing happens when the normalized
// name equals the current one.
function set_charset($charset = 'gbk')
{
    $name = strtolower(trim($charset));
    if ($name === $this->_cs) return;

    $this->_cs = $name;
    if ($name == 'utf-8' || $name == 'utf8')
    {
        // UTF-8: 0x00-0xbf => 1, 0xc0-0xdf => 2, 0xe0-0xef => 3, 0xf0-0xf7 => 4,
        // 0xf8-0xfb => 5, 0xfc-0xfd => 6, 0xfe-0xff => 1
        $table = array_merge(
            array_fill(0, 0xc0, 1),
            array_fill(0, 0x20, 2),
            array_fill(0, 0x10, 3),
            array_fill(0, 0x08, 4),
            array_fill(0, 0x04, 5),
            array_fill(0, 0x02, 6),
            array(1, 1)
        );
    }
    else
    {
        // GBK & BIG5: 0x00-0x80 => 1, 0x81-0xfe => 2, 0xff => 1
        $table = array_merge(
            array_fill(0, 0x81, 1),
            array_fill(0, 0x7e, 2),
            array(1)
        );
    }
    $this->_ztab = $table;
}
// Open the dictionary file and make it the active dictionary.
// $fpath: path of the xdb dictionary file.
// Returns true on success, false when the file cannot be opened (the
// previously active dictionary, if any, is then left in place).
function set_dict($fpath)
{
    $xdb = new XDB_R;
    if (!$xdb->Open($fpath)) return false;
    // release the dictionary being replaced so its handle does not leak
    if ($this->_xd) $this->_xd->Close();
    $this->_xd = $xdb;
    // the method is documented as returning bool; it used to fall off the end
    return true;
}
// Load the rule-set file at $fpath (delegates to _rule_load). Returns nothing.
function set_rule($fpath)
{
    $rule_file = $fpath;
    $this->_rule_load($rule_file);
}
// Turn the PSCWS4_IGN_SYMBOL mode bit on ($yes == true) or off.
function set_ignore($yes)
{
    $this->_mode = ($yes == true)
        ? ($this->_mode | PSCWS4_IGN_SYMBOL)
        : ($this->_mode & ~PSCWS4_IGN_SYMBOL);
}
// Set the multi-segment level.
// $level: integer 0..15; each bit selects one PSCWS4_MULTI_* policy after
// being shifted into the 0xf000 range. The previous policy bits are always
// cleared first.
function set_multi($level)
{
    $level = (intval($level) << 12);
    $this->_mode &= ~PSCWS4_MULTI_MASK;
    // Keep only the policy bits: an out-of-range level (e.g. 17 or -1) used to
    // leak bits outside the mask into $_mode, where nothing ever cleared them.
    $this->_mode |= ($level & PSCWS4_MULTI_MASK);
}
// Turn the PSCWS4_DEBUG mode bit on ($yes == true) or off.
function set_debug($yes)
{
    $this->_mode = ($yes == true)
        ? ($this->_mode | PSCWS4_DEBUG)
        : ($this->_mode & ~PSCWS4_DEBUG);
}
// Turn the PSCWS4_DUALITY mode bit (pair up loose single characters) on
// ($yes == true) or off.
function set_duality($yes)
{
    $this->_mode = ($yes == true)
        ? ($this->_mode | PSCWS4_DUALITY)
        : ($this->_mode & ~PSCWS4_DUALITY);
}
// Set the text to segment: stores it as a string, records its byte length
// and rewinds the read offset to the beginning.
function send_text($text)
{
    $str = (string) $text;
    $this->_off = 0;
    $this->_txt = $str;
    $this->_len = strlen($str);
}
// Fetch the next batch of segmentation results; call repeatedly until it
// returns false.
// Returns an array of items (word, off, idf, len, attr), or false when the
// text is exhausted.
function get_result()
{
    $off = $this->_off;
    $len = $this->_len;
    $txt = $this->_txt;
    $this->_res = array();
    // skip bytes <= 0x20; a CR or LF is returned as a result of its own
    while (($off < $len) && (ord($txt[$off]) <= 0x20))
    {
        if ($txt[$off] == "\r" || $txt[$off] == "\n")
        {
            $this->_off = $off + 1;
            $this->_put_res($off, 0, 1, 'un');
            return $this->_res;
        }
        $off++;
    }
    if ($off >= $len) return false;
    // try to parse the sentence
    $this->_off = $off;
    $ch = $txt[$off];
    $cx = ord($ch);
    // bracket / colon / quote characters are returned one at a time
    if ($this->_char_token($ch))
    {
        $this->_off++;
        $this->_put_res($off, 0, 1, 'un');
        return $this->_res;
    }
    $clen = $this->_ztab[$cx];
    $zlen = 1;
    $pflag = ($clen > 1 ? PSCWS4_PFLAG_WITH_MB : ($this->_is_alnum($cx) ? PSCWS4_PFLAG_ALNUM : 0));
    // extend the run character by character until a boundary is found
    while (($off = ($off + $clen)) < $len)
    {
        $ch = $txt[$off];
        $cx = ord($ch);
        if ($cx <= 0x20 || $this->_char_token($ch)) break;
        $clen = $this->_ztab[$cx];
        if (!($pflag & PSCWS4_PFLAG_WITH_MB))
        {
            // pure single-byte -> multibyte (2bytes)
            if ($clen == 1)
            {
                if (($pflag & PSCWS4_PFLAG_ALNUM) && !$this->_is_alnum($cx))
                    $pflag ^= PSCWS4_PFLAG_ALNUM;
            }
            else
            {
                if (!($pflag & PSCWS4_PFLAG_ALNUM) || $zlen > 2) break;
                $pflag |= PSCWS4_PFLAG_WITH_MB;
            }
        }
        else if (($pflag & PSCWS4_PFLAG_WITH_MB) && $clen == 1)
        {
            // multibyte followed by single-byte: only a short alphanumeric
            // tail is allowed to stay in the multibyte run
            if (!$this->_is_alnum($cx)) break;
            $pflag &= ~PSCWS4_PFLAG_VALID;
            for ($i = $off+1; $i < ($off+3); $i++)
            {
                // Test the end of text BEFORE indexing: the original read
                // $txt[$i] first, which raises "Uninitialized string offset"
                // when the alphanumeric tail closes the text.
                if ($i >= $len)
                {
                    $pflag |= PSCWS4_PFLAG_VALID;
                    break;
                }
                $cx = ord($txt[$i]);
                if (($cx <= 0x20) || ($this->_ztab[$cx] > 1))
                {
                    $pflag |= PSCWS4_PFLAG_VALID;
                    break;
                }
                if (!$this->_is_alnum($cx)) break;
            }
            if (!($pflag & PSCWS4_PFLAG_VALID)) break;
            $clen += ($i - $off - 1);
        }
        // hightman.070813: add max zlen limit
        if (++$zlen >= PSCWS4_MAX_ZLEN) break;
    }
    // hightman.070624: handle a truncated (half) character at the end of text
    $raw_off = $off;
    if ($raw_off > $len)
        $off -= $clen;
    // do the real segment
    if ($off <= $this->_off) return false;
    else if ($pflag & PSCWS4_PFLAG_WITH_MB) $this->_msegment($off, $zlen);
    else if (!($pflag & PSCWS4_PFLAG_ALNUM) || (($off - $this->_off) >= PSCWS4_MAX_EWLEN)) $this->_ssegment($off);
    else
    {
        // short pure-alphanumeric run: emit it as one 'en' word
        $zlen = $off - $this->_off;
        $this->_put_res($this->_off, 2.5*log($zlen), $zlen, 'en');
    }
    // return the result
    $this->_off = ($raw_off > $len ? $len : $off);
    // everything in this run was filtered out: move on to the next run
    if (count($this->_res) == 0)
        return $this->get_result();
    return $this->_res;
}
// Return the top words of the current text ranked by accumulated idf weight.
// $limit: maximum number of entries returned.
// $xattr: comma separated list of attrs used as a filter, optionally
//         prefixed with '~'.
// Returns a list of arrays (word, times, weight, attr), or false when no
// text has been set. The read offset of get_result() is preserved.
function get_tops($limit = 10, $xattr = '')
{
    if (!$this->_txt) return false;
    $xmode = false;
    $attrs = array();
    if ($xattr != '')
    {
        if (substr($xattr, 0, 1) == '~')
        {
            $xattr = substr($xattr, 1);
            $xmode = true;
        }
        foreach (explode(',', $xattr) as $tmp)
        {
            $tmp = strtolower(trim($tmp));
            if (!empty($tmp)) $attrs[$tmp] = true;
        }
    }
    // save the old offset
    $off = $this->_off;
    $this->_off = 0;
    $list = array();
    while ($tmpa = $this->get_result())
    {
        foreach ($tmpa as $tmp)
        {
            // skip low-weight words and symbols (attr starting with '#')
            if ($tmp['idf'] < 0.2 || substr($tmp['attr'], 0, 1) == '#') continue;
            // check attr filter
            // NOTE(review): as written, a leading '~' KEEPS only the listed
            // attrs and the plain form DROPS them; the SCWS extension appears
            // to document the opposite - confirm before changing.
            if (count($attrs) > 0)
            {
                if ($xmode == true && !isset($attrs[$tmp['attr']])) continue;
                if ($xmode == false && isset($attrs[$tmp['attr']])) continue;
            }
            // check stopwords
            $word = strtolower($tmp['word']);
            if ($this->_rule_checkbit($word, PSCWS4_RULE_NOSTATS)) continue;
            // put to list
            if (isset($list[$word]))
            {
                $list[$word]['weight'] += $tmp['idf'];
                $list[$word]['times']++;
            }
            else
            {
                $list[$word] = array('word'=>$tmp['word'], 'times'=>1, 'weight'=>$tmp['idf'], 'attr'=>$tmp['attr']);
            }
        }
    }
    // restore the offset
    $this->_off = $off;
    // Sort by weight, heaviest first. A method callback is used because
    // create_function() is deprecated since PHP 7.2 and removed in PHP 8.0.
    usort($list, array($this, '_tops_cmp'));
    if (count($list) > $limit) $list = array_slice($list, 0, $limit);
    return $list;
}
// usort() comparator for get_tops(): orders entries by descending 'weight'.
function _tops_cmp($a, $b)
{
    return ($b['weight'] > $a['weight'] ? 1 : -1);
}
// Release resources: empties the rule tables and closes the dictionary
// handle if one is open.
function close()
{
    // free the ruleset
    $this->_rs = array();
    $this->_rd = array();
    // free the dict
    if (!$this->_xd) return;
    $this->_xd->Close();
    $this->_xd = false;
}
// Return the version banner string.
function version()
{
    $banner = 'PSCWS/4.0 - by hightman';
    return $banner;
}
////////////////////////////////////////////
// these are all private functions
////////////////////////////////////////////
// Load a rule-set file into $_rs (sections) and $_rd (entries).
// File format as parsed here: "[name]" starts a section, ":key = value" sets
// a parameter of the current section, ";" starts a comment line, every other
// non-empty line is an entry (one entry per line, or one entry per character
// when the section has "line = no").
// Returns false when the file cannot be opened, true otherwise.
function _rule_load($fpath)
{
    if (!($fd = fopen($fpath, 'r'))) return false;
    $this->_rs = array();
    // drop entries of a previously loaded file: their bits belong to the old
    // section table and would collide with the bits assigned below
    $this->_rd = array();
    // quick scan to add the section names to the list and assign their bits
    $i = $j = 0;
    while (($buf = fgets($fd, 512)) !== false)
    {
        if (substr($buf, 0, 1) != '[' || !($pos = strpos($buf, ']')))
            continue;
        if ($pos == 1 || $pos > 15) continue;
        $key = strtolower(substr($buf, 1, $pos - 1));
        if (isset($this->_rs[$key])) continue;
        $item = array('tf'=>5.0, 'idf'=>3.5, 'attr'=>'un', 'bit'=>0, 'flag'=>0, 'zmin'=>0, 'zmax'=>0, 'inc'=>0, 'exc'=>0);
        if ($key == 'special') $item['bit'] = PSCWS4_RULE_SPECIAL;
        else if ($key == 'nostats') $item['bit'] = PSCWS4_RULE_NOSTATS;
        else
        {
            $item['bit'] = (1<<$j);
            $j++;
        }
        $this->_rs[$key] = $item;
        if (++$i >= PSCWS4_RULE_MAX)
            break;
    }
    // load the ruleset
    rewind($fd);
    $rbl = false;
    unset($item);
    while (($buf = fgets($fd, 512)) !== false)
    {
        $ch = substr($buf, 0, 1);
        if ($ch == ';') continue;
        if ($ch == '[')
        {
            // new section: rebind $item to that section's settings
            unset($item);
            if (($pos = strpos($buf, ']')) > 1)
            {
                $key = strtolower(substr($buf, 1, $pos - 1));
                if (isset($this->_rs[$key]))
                {
                    $rbl = true; // default: read by line = yes
                    $item = &$this->_rs[$key];
                }
            }
            continue;
        }
        // param set: line|znum|include|exclude|type|tf|idf|attr
        if ($ch == ':')
        {
            $buf = substr($buf, 1);
            if (!($pos = strpos($buf, '='))) continue;
            list($pkey, $pval) = explode('=', $buf, 2);
            $pkey = trim($pkey);
            $pval = trim($pval);
            if ($pkey == 'line') $rbl = (strtolower(substr($pval, 0, 1)) == 'n' ? false : true);
            else if ($pkey == 'tf') $item['tf'] = floatval($pval);
            else if ($pkey == 'idf') $item['idf'] = floatval($pval);
            else if ($pkey == 'attr') $item['attr'] = $pval; // 2bytes?
            else if ($pkey == 'znum')
            {
                // "min" or "min,max"
                if ($pos = strpos($pval, ','))
                {
                    $item['zmax'] = intval(trim(substr($pval, $pos+1)));
                    $item['flag'] |= PSCWS4_ZRULE_RANGE;
                    $pval = substr($pval, 0, $pos);
                }
                $item['zmin'] = intval($pval);
            }
            else if ($pkey == 'type')
            {
                if ($pval == 'prefix') $item['flag'] |= PSCWS4_ZRULE_PREFIX;
                if ($pval == 'suffix') $item['flag'] |= PSCWS4_ZRULE_SUFFIX;
            }
            else if ($pkey == 'include' || $pkey == 'exclude')
            {
                // collect the bits of the named sections into one mask
                $clude = 0;
                foreach (explode(',', $pval) as $tmp)
                {
                    $tmp = strtolower(trim($tmp));
                    if (!isset($this->_rs[$tmp])) continue;
                    $clude |= $this->_rs[$tmp]['bit'];
                }
                if ($pkey == 'include')
                {
                    $item['inc'] |= $clude;
                    $item['flag'] |= PSCWS4_ZRULE_INCLUDE;
                }
                else
                {
                    $item['exc'] |= $clude;
                    $item['flag'] |= PSCWS4_ZRULE_EXCLUDE;
                }
            }
            continue;
        }
        // read the entries
        if (!isset($item)) continue;
        $buf = trim($buf);
        if (empty($buf)) continue;
        // save the record
        if ($rbl) $this->_rd[$buf] = &$item;
        else
        {
            // one entry per character of the line
            $len = strlen($buf);
            for ($off = 0; $off < $len; )
            {
                $ord = ord(substr($buf, $off, 1));
                $zlen = $this->_ztab[$ord];
                // Stop only on a truncated character. The original used ">=",
                // which also stopped on the last complete character of every
                // (already trimmed) line, so that character was never stored.
                if ($off + $zlen > $len) break;
                $zch = substr($buf, $off, $zlen);
                $this->_rd[$zch] = &$item;
                $off += $zlen;
            }
        }
    }
    // the handle used to be left open
    fclose($fd);
    return true;
}
// Look up the rule settings stored for the entry $str; false when absent.
function _rule_get($str)
{
    return isset($this->_rd[$str]) ? $this->_rd[$str] : false;
}
// Tell whether the entry $str exists and its section bit intersects $bit.
function _rule_checkbit($str, $bit)
{
    if (isset($this->_rd[$str]))
    {
        $own = $this->_rd[$str]['bit'];
        if ($bit & $own) return true;
    }
    return false;
}
// Check a neighbouring character against a rule's include / exclude lists.
// $rule: rule settings as returned by _rule_get().
// $str:  the character to test.
// Returns false when the rule has an include list and $str is not in one of
// the included sections, or has an exclude list and $str is in one of the
// excluded sections; true otherwise.
function _rule_check($rule, $str)
{
    // Compare against the 'inc' / 'exc' masks built by _rule_load(). The
    // original tested $rule['bit'] (the rule's own section), which left the
    // configured include/exclude lists without any effect.
    if (($rule['flag'] & PSCWS4_ZRULE_INCLUDE) && !$this->_rule_checkbit($str, $rule['inc']))
        return false;
    if (($rule['flag'] & PSCWS4_ZRULE_EXCLUDE) && $this->_rule_checkbit($str, $rule['exc']))
        return false;
    return true;
}
// Append one result item to $_res.
// $o: byte offset in the text, $i: idf value, $l: byte length, $a: attr.
function _put_res($o, $i, $l, $a)
{
    $this->_res[] = array(
        'word' => substr($this->_txt, $o, $l),
        'off'  => $o,
        'idf'  => $i,
        'len'  => $l,
        'attr' => $a,
    );
}
// True when the byte value $c is an ASCII digit or an ASCII letter.
function _is_alnum($c)
{
    if ($c >= 48 && $c <= 57) return true;   // '0'..'9'
    if ($c >= 65 && $c <= 90) return true;   // 'A'..'Z'
    return ($c >= 97 && $c <= 122);          // 'a'..'z'
}
// True when the byte value $c is an ASCII letter (either case).
function _is_alpha($c)
{
    $upper = ($c >= 65 && $c <= 90);
    $lower = ($c >= 97 && $c <= 122);
    return ($upper || $lower);
}
// True when the byte value $c is an upper-case ASCII letter.
function _is_ualpha($c)
{
    return !($c < 65 || $c > 90);
}
// True when the byte value $c is an ASCII digit.
function _is_digit($c)
{
    return !($c < 48 || $c > 57);
}
// Tell whether a character with flags $f must be skipped by rule matching:
// it is a symbol or an embedded single-byte run, or it is a word head that
// is not flagged as a two-character person name.
function _no_rule1($f)
{
    if ($f & (PSCWS4_ZFLAG_SYMBOL|PSCWS4_ZFLAG_ENGLISH)) return true;
    return (($f & (PSCWS4_ZFLAG_WHEAD|PSCWS4_ZFLAG_NR2)) == PSCWS4_ZFLAG_WHEAD);
}
// Skip test applied to the neighbours of a rule match; currently the same
// test as _no_rule1().
function _no_rule2($f)
{
    $skip = $this->_no_rule1($f);
    return $skip;
}
// True when $c is one of the characters that always form a token of their
// own: ( ) [ ] { } : "
function _char_token($c)
{
    return in_array($c, array('(', ')', '[', ']', '{', '}', ':', '"'));
}
// Look $word up in the dictionary.
// Returns the unpacked record (keys tf, idf, flag, attr), or false when no
// dictionary is open or the word has no value.
function _dict_query($word)
{
    if ($this->_xd)
    {
        $raw = $this->_xd->Get($word);
        if ($raw) return unpack('ftf/fidf/Cflag/a3attr', $raw);
    }
    return false;
}
// Segment a pure single-byte run: the bytes from $_off up to (not
// including) $end.
// Emits, in order of precedence: the whole run when it is listed in the
// [special] rule section, the whole run when it is an abbreviation of the
// form "S.H.E", otherwise individual alphanumeric tokens and symbols.
function _ssegment($end)
{
    $start = $this->_off;
    $wlen = $end - $start;
    $tlen = $this->_len;
    // check special words (need strtoupper)
    if ($wlen > 1)
    {
        $txt = strtoupper(substr($this->_txt, $start, $wlen));
        if ($this->_rule_checkbit($txt, PSCWS4_RULE_SPECIAL))
        {
            $this->_put_res($start, 9.5, $wlen, 'nz');
            return;
        }
    }
    $txt = $this->_txt;
    // check brief words such as S.H.E M.R.
    // (the look-ahead is guarded so it never indexes past the end of the text)
    if ($this->_is_ualpha(ord($txt[$start])) && ($start + 1) < $tlen && $txt[$start+1] == '.')
    {
        for ($ch = $start + 2; $ch < $end; $ch++)
        {
            if (!$this->_is_ualpha(ord($txt[$ch]))) break;
            $ch++;
            if ($ch == $end || $txt[$ch] != '.') break;
        }
        if ($ch == $end)
        {
            $this->_put_res($start, 7.5, $wlen, 'nz');
            return;
        }
    }
    // Extract words and punctuation. A number may contain one '.' that is
    // followed by a digit; a word may contain one "'" that is followed by a
    // letter.
    while ($start < $end)
    {
        $ch = $txt[$start++];
        $cx = ord($ch);
        if ($this->_is_alnum($cx))
        {
            $pflag = $this->_is_digit($cx) ? PSCWS4_PFLAG_DIGIT : 0;
            $wlen = 1;
            while ($start < $end)
            {
                $ch = $txt[$start];
                $cx = ord($ch);
                if ($pflag & PSCWS4_PFLAG_DIGIT)
                {
                    if (!$this->_is_digit($cx))
                    {
                        // accept a single '.' only when a digit follows it;
                        // at the very end of the text nothing follows
                        if (($pflag & PSCWS4_PFLAG_ADDSYM) || $cx != 0x2e
                            || ($start + 1) >= $tlen || !$this->_is_digit(ord($txt[$start+1])))
                            break;
                        $pflag |= PSCWS4_PFLAG_ADDSYM;
                    }
                }
                else
                {
                    if (!$this->_is_alpha($cx))
                    {
                        // accept a single "'" only when a letter follows it
                        if (($pflag & PSCWS4_PFLAG_ADDSYM) || $cx != 0x27
                            || ($start + 1) >= $tlen || !$this->_is_alpha(ord($txt[$start+1])))
                            break;
                        $pflag |= PSCWS4_PFLAG_ADDSYM;
                    }
                }
                $start++;
                if (++$wlen >= PSCWS4_MAX_EWLEN) break;
            }
            $this->_put_res($start - $wlen, 2.5*log($wlen), $wlen, 'en');
        }
        else if (!($this->_mode & PSCWS4_IGN_SYMBOL))
        {
            // symbols are emitted one byte at a time unless ignored
            $this->_put_res($start-1, 0, 1, 'un');
        }
    }
}
// Return the text covered by characters $i..$j of the character map
// (just character $i when $j is omitted).
function _get_zs($i, $j = -1)
{
    $last = ($j == -1) ? $i : $j;
    $from = $this->_zmap[$i]['start'];
    return substr($this->_txt, $from, $this->_zmap[$last]['end'] - $from);
}
// Find the end index of the longest full word that starts at character $i
// and ends no later than $j. Returns $i itself when $i is not a word head
// or no full word is recorded.
function _mget_word($i, $j)
{
    $row = $this->_wmap[$i];
    if (!($row[$i]['flag'] & PSCWS4_ZFLAG_WHEAD)) return $i;

    $best = $i;
    for ($k = $i + 1; $k <= $j; $k++)
    {
        $cell = $row[$k];
        if ($cell && ($cell['flag'] & PSCWS4_WORD_FULL)) $best = $k;
    }
    return $best;
}
// Emit the word made of characters $i..$j (a single character when
// $i == $j) into the result list, applying in turn:
//  - the ignore-symbol filter,
//  - duality pairing of loose single characters (PSCWS4_DUALITY),
//  - the multi-segment policies (PSCWS4_MULTI_*), which add shorter words,
//    two-character pieces and/or single characters after the main word.
// NOTE(review): $wmap below is a plain copy of $this->_wmap, so the
// PSCWS4_ZFLAG_PUT flags set near the end of this method do not appear to
// survive the call - confirm whether that is intended.
function _mset_word($i, $j)
{
$wmap = $this->_wmap;
$zmap = $this->_zmap;
$item = $wmap[$i][$j];
// hightman.070705: also bail out when there is no item, to guard against over-long words (255+ chars; unsigned char overflow in the C version)
if (($item == false) || (($this->_mode & PSCWS4_IGN_SYMBOL)
&& !($item['flag'] & PSCWS4_ZFLAG_ENGLISH) && $item['attr'] == 'un'))
{
return;
}
// hightman.070701: automatically pair up loose single characters
if ($this->_mode & PSCWS4_DUALITY)
{
// $k: the pending single character recorded by an earlier call (-1 = none)
$k = $this->_zis;
if ($i == $j && !($item['flag'] & PSCWS4_ZFLAG_ENGLISH) && $item['attr'] == 'un')
{
// an unknown single character: remember it and try to pair it with the pending one
$this->_zis = $i;
if ($k < 0) return;
$i = ($k & ~PSCWS4_ZIS_USED);
if (($i != ($j-1)) || (!($k & PSCWS4_ZIS_USED) && $this->_wend == $i))
{
$this->_put_res($zmap[$i]['start'], $wmap[$i][$i]['idf'], $zmap[$i]['end'] - $zmap[$i]['start'], $wmap[$i][$i]['attr']);
// the pending character is not adjacent: it was emitted alone, nothing to pair
if ($i != ($j-1)) return;
}
$this->_zis |= PSCWS4_ZIS_USED;
}
else
{
// a real word: flush the pending single character first if it is still unused (or a multi-char word follows)
if (($k >= 0) && (!($k & PSCWS4_ZIS_USED) || ($j > $i)))
{
$k &= ~PSCWS4_ZIS_USED;
$this->_put_res($zmap[$k]['start'], $wmap[$k][$k]['idf'], $zmap[$k]['end'] - $zmap[$k]['start'], $wmap[$k][$k]['attr']);
}
if ($j > $i) $this->_wend = $j + 1;
$this->_zis = -1;
}
}
// save the res
$this->_put_res($zmap[$i]['start'], $item['idf'], $zmap[$j]['end'] - $zmap[$i]['start'], $item['attr']);
// hightman.070902: multi segment
// step1: split to short words
if (($j-$i) > 1)
{
$m = $i;
if ($this->_mode & PSCWS4_MULTI_SHORT)
{
while ($m < $j)
{
$k = $m;
for ($n = $m + 1; $n <= $j; $n++)
{
// skip the whole word itself, it has been emitted above
if ($n == $j && $m == $i) break;
$item = $wmap[$m][$n];
if ($item && ($item['flag'] & PSCWS4_WORD_FULL))
{
$k = $n;
$this->_put_res($zmap[$m]['start'], $item['idf'], $zmap[$n]['end'] - $zmap[$m]['start'], $item['attr']);
if (!($item['flag'] & PSCWS4_WORD_PART)) break;
}
}
if ($k == $m)
{
// no shorter word starts at $m: emit the single character (except at the very start)
if ($m == $i) break;
$item = $wmap[$m][$m];
$this->_put_res($zmap[$m]['start'], $item['idf'], $zmap[$m]['end'] - $zmap[$m]['start'], $item['attr']);
}
if (($m = ($k+1)) == $j)
{
$m--;
break;
}
}
}
if ($this->_mode & PSCWS4_MULTI_DUALITY)
{
// emit the remaining part as overlapping two-character pieces
while ($m < $j)
{
$this->_put_res($zmap[$m]['start'], $wmap[$m][$m]['idf'], $zmap[$m+1]['end'] - $zmap[$m]['start'], $wmap[$m][$m]['attr']);
$m++;
}
}
}
// step2, split to single char
if (($j > $i) && ($this->_mode & (PSCWS4_MULTI_ZMAIN|PSCWS4_MULTI_ZALL)))
{
if (($j - $i) == 1 && !$wmap[$i][$j])
{
if ($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_PUT) $i++;
else $wmap[$i][$i]['flag'] |= PSCWS4_ZFLAG_PUT;
$wmap[$j][$j]['flag'] |= PSCWS4_ZFLAG_PUT;
}
do
{
// skip characters already emitted; in ZMAIN-only mode keep only attrs starting with j, n or v
if ($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_PUT) continue;
if (!($this->_mode & PSCWS4_MULTI_ZALL) && !strchr("jnv", substr($wmap[$i][$i]['attr'],0,1))) continue;
$this->_put_res($zmap[$i]['start'], $wmap[$i][$i]['idf'], $zmap[$i]['end'] - $zmap[$i]['start'], $wmap[$i][$i]['attr']);
}
while (++$i <= $j);
}
}
// Segment one zone of characters $f..$t: every candidate word in the zone
// seeds a path (the zone cut into words around that candidate), each path
// gets a weight, and the words of the heaviest path are emitted through
// _mset_word().
// A path is stored as a list of word lengths minus one ($j - $i),
// terminated by 0xff.
function _mseg_zone($f, $t)
{
$weight = $nweight = 0.0;
$wmap = &$this->_wmap;
$zmap = $this->_zmap;
$mpath = $npath = false;
// $x holds the end index of the last candidate used, so that candidates ending at or before it are skipped
for ($x = $i = $f; $i <= $t; $i++)
{
$j = $this->_mget_word($i, $t);
if ($j == $i || $j <= $x || (/* $i > $x && */($wmap[$i][$j]['flag'] & PSCWS4_WORD_USED))) continue;
// one word only
if ($i == $f && $j == $t)
{
$mpath = array($j - $i, 0xff);
break;
}
// rule-generated words are only accepted at the start of the zone
if ($i != $f && ($wmap[$i][$j]['flag'] & PSCWS4_WORD_RULE)) continue;
// create the new path
$wmap[$i][$j]['flag'] |= PSCWS4_WORD_USED;
$nweight = $wmap[$i][$j]['tf'] * ($j - $i + 1);
// words touching the zone start / end get a bonus
if ($i == $f) $nweight *= 1.2;
else if ($j == $t) $nweight *= 1.4;
// create the npath
if ($npath == false)
$npath = array_fill(0, $t-$f+2, 0xff);
// look backward: cut the part before the candidate into longest words
$x = 0;
for ($m = $f; $m < $i; $m = $n+1)
{
$n = $this->_mget_word($m, $i-1);
$nweight *= $wmap[$m][$n]['tf'] * ($n-$m+1);
$npath[$x++] = $n - $m;
if ($n > $m) $wmap[$m][$n]['flag'] |= PSCWS4_WORD_USED;
}
// my self
$npath[$x++] = $j - $i;
// look forward: cut the part after the candidate into longest words
for ($m = $j+1; $m <= $t; $m = $n+1)
{
$n = $this->_mget_word($m, $t);
$nweight *= $wmap[$m][$n]['tf'] * ($n-$m+1);
$npath[$x++] = $n - $m;
if ($n > $m) $wmap[$m][$n]['flag'] |= PSCWS4_WORD_USED;
}
$npath[$x] = 0xff;
// penalise paths made of many pieces ($x is the number of words in the path)
$nweight /= pow($x-1,4);
// draw the path for debug
if ($this->_mode & PSCWS4_DEBUG)
{
printf("PATH by keyword = %s, (weight=%.4f):\n", $this->_get_zs($i, $j), $nweight);
for ($x = 0, $m = $f; ($n = $npath[$x]) != 0xff; $x++)
{
$n += $m;
echo $this->_get_zs($m, $n) . " ";
$m = $n + 1;
}
echo "\n--\n";
}
$x = $j;
// check better path
if ($nweight > $weight)
{
// keep the heavier path in $mpath and recycle the other array as scratch space
$weight = $nweight;
$swap = $mpath;
$mpath = $npath;
$npath = $swap;
unset($swap);
}
}
// set the result, mpath != NULL
if ($mpath == false) return;
for ($x = 0, $m = $f; ($n = $mpath[$x]) != 0xff; $x++)
{
$n += $m;
$this->_mset_word($m, $n);
$m = $n + 1;
}
}
// Segment a run that contains multibyte characters (the core routine).
// $end:  byte offset just past the run (the run starts at $_off).
// $zlen: number of characters counted by get_result(); only used to size
//        the maps, the real count is recomputed here.
// Steps: build the character map, query the dictionary for every candidate
// word, apply the rule set (names, places, numerals), then cut the run into
// zones at the easy break points and segment each zone.
function _msegment($end, $zlen)
{
    $this->_wmap = array_fill(0, $zlen, array_fill(0, $zlen, false));
    $this->_zmap = array_fill(0, $zlen, false);
    $wmap = &$this->_wmap;
    $zmap = &$this->_zmap;
    $txt = $this->_txt;
    $start = $this->_off;
    $this->_zis = -1;
    // load the zmap
    for ($i = 0; $start < $end; $i++)
    {
        $ch = $txt[$start];
        $cx = ord($ch);
        $clen = $this->_ztab[$cx];
        if ($clen == 1)
        {
            // a run of single-byte characters is treated as one "character"
            while ($start++ < $end)
            {
                // Stop at the end of the run. Without this test the loop
                // read $txt[$end] (possibly past the end of the text) and,
                // when that byte was single-byte, folded it into the token,
                // e.g. producing "ab " instead of "ab".
                if ($start >= $end) break;
                $cx = ord($txt[$start]);
                if ($this->_ztab[$cx] > 1) break;
                $clen++;
            }
            $wmap[$i][$i] = array('tf'=>0.5, 'idf'=>0, 'flag'=>PSCWS4_ZFLAG_ENGLISH, 'attr'=>'un');
        }
        else
        {
            $query = $this->_dict_query(substr($txt, $start, $clen));
            if (!$query) $wmap[$i][$i] = array('tf'=>0.5, 'idf'=>0, 'flag'=>0, 'attr'=>'un');
            else
            {
                if (substr($query['attr'],0,1) == '#') $query['flag'] |= PSCWS4_ZFLAG_SYMBOL;
                $wmap[$i][$i] = $query;
            }
            $start += $clen;
        }
        $zmap[$i] = array('start'=>$start-$clen, 'end'=>$start);
    }
    // fixed real zlength
    $zlen = $i;
    // create word query table
    for ($i = 0; $i < $zlen; $i++)
    {
        $k = 0;
        for ($j = $i+1; $j < $zlen; $j++)
        {
            $query = $this->_dict_query($this->_get_zs($i, $j));
            if (!$query) break;
            $ch = $query['flag'];
            if ($ch & PSCWS4_WORD_FULL)
            {
                $wmap[$i][$j] = $query;
                $wmap[$i][$i]['flag'] |= PSCWS4_ZFLAG_WHEAD;
                for ($k = $i+1; $k <= $j; $k++) $wmap[$k][$k]['flag'] |= PSCWS4_ZFLAG_WPART;
            }
            // not a prefix of any longer word: stop extending
            if (!($ch & PSCWS4_WORD_PART)) break;
        }
        // $k is one past the end of the longest full word found (0 = none)
        if ($k--)
        {
            // set nr2 to some short name
            if ($k == ($i+1))
            {
                if ($wmap[$i][$k]['attr'] == 'nr') $wmap[$i][$i]['flag'] |= PSCWS4_ZFLAG_NR2;
                //if (substr($wmap[$i][$k]['attr'], 0, 1) == 'n') $wmap[$i][$i]['flag'] |= PSCWS4_ZFLAG_N2;
            }
            // clean the PART flag for the last word
            if ($k < $j) $wmap[$i][$k]['flag'] ^= PSCWS4_WORD_PART;
        }
    }
    // try to do the ruleset match
    // for name & zone & chinese numeric
    if (count($this->_rd) > 0)
    {
        // check for 'one word'
        for ($i = 0; $i < $zlen; $i++)
        {
            if ($this->_no_rule1($wmap[$i][$i]['flag'])) continue;
            $r1 = $this->_rule_get($this->_get_zs($i));
            if (!$r1) continue;
            $clen = ($r1['zmin'] > 0 ? $r1['zmin'] : 1);
            if (($r1['flag'] & PSCWS4_ZRULE_PREFIX) && ($i < ($zlen - $clen)))
            {
                // prefix, check after (zmin~zmax):
                // first make sure the next zmin characters all qualify, then
                // take further qualifying characters up to zmax
                for ($ch = 1; $ch <= $clen; $ch++)
                {
                    $j = $i + $ch;
                    if ($j >= $zlen || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                }
                if ($ch <= $clen) continue;
                // no limit znum or limit to a range
                $j = $i + $ch;
                while (true)
                {
                    if ((!$r1['zmax'] && $r1['zmin']) || ($r1['zmax'] && ($clen >= $r1['zmax']))) break;
                    if ($j >= $zlen || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                    $clen++;
                    $j++;
                }
                // a two-character name that is still two characters after
                // the rule match needs no new word
                if ($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_NR2)
                {
                    if ($clen == 1) continue;
                    $wmap[$i][$i+1]['flag'] |= PSCWS4_WORD_PART;
                }
                // ok, got: i & clen
                $k = $i + $clen;
                $wmap[$i][$k] = array('tf'=>$r1['tf'], 'idf'=>$r1['idf'], 'flag'=>(PSCWS4_WORD_RULE|PSCWS4_WORD_FULL), 'attr'=>$r1['attr']);
                $wmap[$i][$i]['flag'] |= PSCWS4_ZFLAG_WHEAD;
                for ($j = $i+1; $j <= $k; $j++) $wmap[$j][$j]['flag'] |= PSCWS4_ZFLAG_WPART;
                if (!($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_WPART)) $i = $k;
                continue;
            }
            if (($r1['flag'] & PSCWS4_ZRULE_SUFFIX) && ($i >= $clen))
            {
                // suffix, check before
                for ($ch = 1; $ch <= $clen; $ch++)
                {
                    $j = $i - $ch;
                    if ($j < 0 || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                }
                if ($ch <= $clen) continue;
                // no limit znum or limit to a range
                $j = $i - $ch;
                while (true)
                {
                    if ((!$r1['zmax'] && $r1['zmin']) || ($r1['zmax'] && ($clen >= $r1['zmax']))) break;
                    if ($j < 0 || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                    $clen++;
                    $j--;
                }
                // ok, got: i & clen (maybe clen=1 & [k][i] isset)
                $k = $i - $clen;
                if ($wmap[$k][$i] != false) continue;
                $wmap[$k][$i] = array('tf'=>$r1['tf'], 'idf'=>$r1['idf'], 'flag'=>PSCWS4_WORD_FULL, 'attr'=>$r1['attr']);
                $wmap[$k][$k]['flag'] |= PSCWS4_ZFLAG_WHEAD;
                for ($j = $k+1; $j <= $i; $j++)
                {
                    $wmap[$j][$j]['flag'] |= PSCWS4_ZFLAG_WPART;
                    if (($j != $i) && ($wmap[$k][$j] != false)) $wmap[$k][$j]['flag'] |= PSCWS4_WORD_PART;
                }
                continue;
            }
        }
        // check for 'two words' (two-character prefixes / suffixes such as
        // compound surnames or road names)
        for ($i = $zlen - 2; $i >= 0; $i--)
        {
            // with value ==> must be have SCWS_WORD_FULL, so needn't check it ag.
            if (($wmap[$i][$i+1] == false) || ($wmap[$i][$i+1]['flag'] & PSCWS4_WORD_PART)) continue;
            $k = $i+1;
            $r1 = $this->_rule_get($this->_get_zs($i, $k));
            if (!$r1) continue;
            $clen = $r1['zmin'] > 0 ? $r1['zmin'] : 1;
            if (($r1['flag'] & PSCWS4_ZRULE_PREFIX) && ($k < ($zlen - $clen)))
            {
                for ($ch = 1; $ch <= $clen; $ch++)
                {
                    $j = $k + $ch;
                    if ($j >= $zlen || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                }
                if ($ch <= $clen) continue;
                // no limit znum or limit to a range
                $j = $k + $ch;
                while (true)
                {
                    if ((!$r1['zmax'] && $r1['zmin']) || ($r1['zmax'] && ($clen >= $r1['zmax']))) break;
                    if ($j >= $zlen || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                    $clen++;
                    $j++;
                }
                // ok, got: i & clen
                $k = $k + $clen;
                $wmap[$i][$k] = array('tf'=>$r1['tf'], 'idf'=>$r1['idf'], 'flag'=>PSCWS4_WORD_FULL, 'attr'=>$r1['attr']);
                $wmap[$i][$i+1]['flag'] |= PSCWS4_WORD_PART;
                for ($j = $i+2; $j <= $k; $j++) $wmap[$j][$j]['flag'] |= PSCWS4_ZFLAG_WPART;
                $i--;
                continue;
            }
            if (($r1['flag'] & PSCWS4_ZRULE_SUFFIX) && ($i >= $clen))
            {
                // suffix, check before
                for ($ch = 1; $ch <= $clen; $ch++)
                {
                    $j = $i - $ch;
                    if ($j < 0 || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                }
                if ($ch <= $clen) continue;
                // no limit znum or limit to a range
                $j = $i - $ch;
                while (true)
                {
                    if ((!$r1['zmax'] && $r1['zmin']) || ($r1['zmax'] && ($clen >= $r1['zmax']))) break;
                    if ($j < 0 || $this->_no_rule2($wmap[$j][$j]['flag'])) break;
                    if (!$this->_rule_check($r1, $this->_get_zs($j))) break;
                    $clen++;
                    $j--;
                }
                // ok, got: i & clen (maybe clen=1 & [k][i] isset)
                $k = $i - $clen;
                $i = $i + 1;
                $wmap[$k][$i] = array('tf'=>$r1['tf'], 'idf'=>$r1['idf'], 'flag'=>PSCWS4_WORD_FULL, 'attr'=>$r1['attr']);
                $wmap[$k][$k]['flag'] |= PSCWS4_ZFLAG_WHEAD;
                for ($j = $k+1; $j <= $i; $j++)
                {
                    $wmap[$j][$j]['flag'] |= PSCWS4_ZFLAG_WPART;
                    if ($wmap[$k][$j] != false) $wmap[$k][$j]['flag'] |= PSCWS4_WORD_PART;
                }
                $i -= ($clen+1);
                continue;
            }
        }
    }
    // do the segment really
    // find the easy break point: a character that is not inside any word
    for ($i = 0, $j = 0; $i < $zlen; $i++)
    {
        if ($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_WPART) continue;
        if ($i > $j) $this->_mseg_zone($j, $i-1);
        $j = $i;
        if (!($wmap[$i][$i]['flag'] & PSCWS4_ZFLAG_WHEAD))
        {
            // a character that starts no word is emitted on its own
            $this->_mset_word($i, $i);
            $j++;
        }
    }
    // the last zone
    if ($i > $j) $this->_mseg_zone($j, $i-1);
    // the last single for duality
    if (($this->_mode & PSCWS4_DUALITY) && ($this->_zis >= 0) && !($this->_zis & PSCWS4_ZIS_USED))
    {
        $i = $this->_zis;
        $this->_put_res($zmap[$i]['start'], $wmap[$i][$i]['idf'], $zmap[$i]['end'] - $zmap[$i]['start'], $wmap[$i][$i]['attr']);
    }
}
}
?>
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/no2key/scws4.git
[email protected]:no2key/scws4.git
no2key
scws4
scws4
master

搜索帮助