以前写的一个拼音串切分小程序:先用正向最大匹配进行切分,当正向最大匹配无法切分开时,就改用反向最大匹配来切分。程序运行时需要读入一个拼音文件 pinyin.txt(每行一个音节,内容见文末)。
程序:
# !/Perl/bin -w
use strict;
# the spliter for pinyin string
# Syllable dictionary: key = pinyin syllable, value = its length.
my %dict;
# Length of the longest syllable seen; bounds the match window in the subs.
my $maxlength=0;
# Current line typed by the user (filled in by the input loop).
my $py;
# Load the syllable list, one syllable per line, from pinyin.txt in the
# current directory. Three-argument open with a lexical handle avoids mode
# injection and handle-name clashes; the die message reports the OS error.
open(my $dict_fh, '<', "pinyin.txt") or die "cannot open pinyin.txt: $!";
while (my $line = <$dict_fh>) {
    # Remove the newline plus any stray CR or surrounding blanks, so that a
    # file saved with CRLF line endings still yields clean keys.
    $line =~ s/\s+\z//;
    $line =~ s/\A\s+//;
    # A blank line must not become an empty-string dictionary entry.
    next if $line eq '';
    my $syllable_length = length($line);
    $dict{$line} = $syllable_length;
    $maxlength = $syllable_length if $syllable_length > $maxlength;
}
close($dict_fh);
# Interactive loop: read one pinyin string per line from STDIN, print its
# segmentation, and stop on the word 'quit' or at end of input.
print "please input the Pin Yin string to be segmented(quit to quit)\n";
# Testing definedness of the read ends the loop at EOF (Ctrl-D, closed pipe);
# an unconditional loop would spin forever on the undef that <STDIN> keeps
# returning after end of input.
while (defined($py = <STDIN>)) {
    chomp $py;
    exit if ($py eq 'quit');
    my $result = spliter($py);
    # spliter may hand back undef for an empty line; print a blank line then.
    $result = '' unless defined $result;
    print "$result\n";
}
# Segment a pinyin string with forward maximum matching.
#
# Input  : $Input - the raw pinyin string. Whitespace between syllables is
#          accepted and treated purely as a separator.
# Returns: the syllables joined by single spaces, with no trailing space.
#          An empty or all-whitespace input yields ''.
# Uses   : the file-level %dict (known syllables) and $maxlength (length of
#          the longest syllable), and reversesplit() as the fallback.
#
# At each position the longest dictionary entry that prefixes the remaining
# text is taken. If no entry fits at some position, the partial result is
# discarded and the WHOLE original input is re-segmented by reversesplit().
sub spliter{
    my ($Input)=@_;#caution the()
    my @segments;
    my $remained = defined $Input ? $Input : '';
    while (1) {
        # Drop separators once per position, before matching, so leading or
        # trailing blanks can never be mistaken for an unmatchable syllable.
        $remained =~ s/\A\s+//;
        last if length($remained) == 0;
        # Never look past the end of the remaining text: a window longer than
        # the text would make the later substr() start outside the string.
        my $window = length($remained) < $maxlength ? length($remained) : $maxlength;
        my $matched = 0;
        for (my $len = $window; $len >= 1; $len--) {
            my $candidate = substr($remained, 0, $len);
            next unless exists $dict{$candidate};
            push @segments, $candidate;
            $remained = substr($remained, $len);
            $matched = 1;
            last;
        }
        # Forward matching is stuck: let reverse matching redo the whole input.
        return reversesplit($Input) unless $matched;
    }
    return join(' ', @segments);
}
#reverse maxmatch
# Segment a pinyin string with reverse (right-to-left) maximum matching.
#
# Input  : $Input - the raw pinyin string. Whitespace between syllables is
#          accepted and treated purely as a separator.
# Returns: the segments in their original left-to-right order, joined by
#          single spaces. An empty or all-whitespace input yields ''.
# Uses   : the file-level %dict (known syllables) and $maxlength (length of
#          the longest syllable).
#
# At each step the longest dictionary entry that ends the remaining text is
# cut off. When nothing matches, the last single character is emitted as its
# own segment so the scan always makes progress; without that, any input
# containing a non-pinyin tail would loop forever.
sub reversesplit{
    my ($Input)=@_;#caution the()
    my @segments;    # built back-to-front via unshift, so no final reverse
    my $remained = defined $Input ? $Input : '';
    while (1) {
        # Strip ALL trailing separators, even when other blanks occur earlier
        # in the string (checking only the first blank would miss them).
        $remained =~ s/\s+\z//;
        last if length($remained) == 0;
        # Cap the window at the remaining length so the negative substr
        # offsets below always stay inside the string.
        my $window = length($remained) < $maxlength ? length($remained) : $maxlength;
        my $taken = 1;    # fallback: one unmatchable character
        for (my $len = $window; $len >= 1; $len--) {
            if (exists $dict{ substr($remained, -$len) }) {
                $taken = $len;
                last;
            }
        }
        unshift @segments, substr($remained, -$taken);
        $remained = substr($remained, 0, length($remained) - $taken);
    }
    return join(' ', @segments);
}
拼音文件:
a
ai
an
ang
ao
ba
bai
ban
bang
bao
bei
ben
beng
bi
bian
biao
bie
bin
bing
bo
bu
ca
cai
can
cang
cao
ce
cen
ceng
cha
chai
chan
chang
chao
che
chen
cheng
chi
chong
chou
chu
chuai
chuan
chuang
chui
chun
chuo
ci
cong
cou
cu
cuan
cui
cun
cuo
da
dai
dan
dang
dao
de
dei
deng
di
dia
dian
diao
die
ding
diu
dong
dou
du
duan
dui
dun
duo
e
ei
en
er
fa
fan
fang
fei
fen
feng
fo
fou
fu
ga
gai
gan
gang
gao
ge
gei
gen
geng
gong
gou
gu
gua
guai
guan
guang
gui
gun
guo
ha
hai
han
hang
hao
he
hei
hen
heng
hng
hong
hou
hu
hua
huai
huan
huang
hui
hun
huo
ji
jia
jian
jiang
jiao
jie
jin
jing
jiong
jiu
ju
juan
jue
jun
ka
kai
kan
kang
kao
ke
ken
keng
kong
kou
ku
kua
kuai
kuan
kuang
kui
kun
kuo
la
lai
lan
lang
lao
le
lei
leng
li
lia
lian
liang
liao
lie
lin
ling
liu
lo
long
lou
lu
luan
lun
luo
lv
lve
m
ma
mai
man
mang
mao
me
mei
men
meng
mi
mian
miao
mie
min
ming
miu
mo
mou
mu
n
na
nai
nan
nang
nao
ne
nei
nen
neng
ng
ni
nian
niang
niao
nie
nin
ning
niu
nong
nou
nu
nuan
nuo
nv
nve
o
ou
pa
pai
pan
pang
pao
pei
pen
peng
pi
pian
piao
pie
pin
ping
po
pou
pu
qi
qia
qian
qiang
qiao
qie
qin
qing
qiong
qiu
qu
quan
que
qun
ran
rang
rao
re
ren
reng
ri
rong
rou
ru
ruan
rui
run
ruo
sa
sai
san
sang
sao
se
sen
seng
sha
shai
shan
shang
shao
she
shei
shen
sheng
shi
shou
shu
shua
shuai
shuan
shuang
shui
shun
shuo
si
song
sou
su
suan
sui
sun
suo
ta
tai
tan
tang
tao
te
tei
teng
ti
tian
tiao
tie
ting
tong
tou
tu
tuan
tui
tun
tuo
wa
wai
wan
wang
wei
wen
weng
wo
wu
xi
xia
xian
xiang
xiao
xie
xin
xing
xiong
xiu
xu
xuan
xue
xun
ya
yan
yang
yao
ye
yi
yin
ying
yo
yong
you
yu
yuan
yue
yun
za
zai
zan
zang
zao
ze
zei
zen
zeng
zha
zhai
zhan
zhang
zhao
zhe
zhei
zhen
zheng
zhi
zhong
zhou
zhu
zhua
zhuai
zhuan
zhuang
zhui
zhun
zhuo
zi
zong
zou
zu
zuan
zui
zun
zuo
posted on 2006-04-19 09:11
Thunder 阅读(2040)
评论(0) 编辑 收藏 引用