Python 正向最大匹配分词和逆向最大匹配分词的实例
作者:yan456jie 时间:2021-11-24 22:39:58
正向最大匹配
# -*- coding:utf-8 -*-
# Name of the text encoding. The code visible in this listing does not
# reference this constant; main() spells out 'utf-8' directly.
CODEC='utf-8'
def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string.
        encoding: Name of the codec used to decode *s* when it is not
            already text (for example ``'utf-8'``).

    Returns:
        *s* itself when it is already text, otherwise *s* decoded with
        *encoding*.
    """
    # Python 2 calls its text type ``unicode``; on Python 3 that name does
    # not exist and ``str`` is the text type.
    try:
        text_type = unicode  # noqa: F821 - only defined on Python 2
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
def fwd_mm_seg(wordDict, maxLen, str):
    """Segment text with forward maximum matching.

    The text is scanned left to right. At each position the longest prefix
    of at most *maxLen* characters that is contained in *wordDict* is taken
    as the next segment; when no candidate of two or more characters
    matches, a single character is emitted.

    Args:
        wordDict: Container of known words; only ``in`` lookups are used.
        maxLen: Maximum candidate length to try. Must be at least 1.
        str: The text to segment. The name shadows the builtin and is kept
            only so existing keyword callers keep working.

    Returns:
        The list of segments in text order; joining them reproduces the
        input text.

    Raises:
        ValueError: If *maxLen* is smaller than 1. Such a value can never
            advance the scan, which previously looped forever.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    wordList = []
    total = len(str)
    pos = 0
    while pos < total:
        # Start from the longest allowed candidate at this position.
        wordLen = min(maxLen, total - pos)
        # Shrink from the right until a dictionary word is found or only a
        # single character is left (single characters are always accepted).
        while wordLen > 1 and str[pos:pos + wordLen] not in wordDict:
            wordLen -= 1
        wordList.append(str[pos:pos + wordLen])
        pos += wordLen
    return wordList
def main():
    """Load ``words.dic`` and print a forward-maximum-matching demo.

    The dictionary file is read as one word per line. The sample sentence
    is printed first, followed by its segments joined with ``==``.
    """
    wordDict = {}
    # Read raw bytes and decode explicitly so the result does not depend on
    # the interpreter version or the platform's default encoding; ``with``
    # guarantees the file handle is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            word = u(eachWord.strip(), 'utf-8')
            if word:  # skip blank lines instead of storing an empty word
                wordDict[word] = 1
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string.
        encoding: Name of the codec used to decode *s* when it is not
            already text (for example ``'utf-8'``).

    Returns:
        *s* itself when it is already text, otherwise *s* decoded with
        *encoding*.
    """
    # Python 2 calls its text type ``unicode``; on Python 3 that name does
    # not exist and ``str`` is the text type.
    try:
        text_type = unicode  # noqa: F821 - only defined on Python 2
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
# Name of the text encoding. The code visible in this listing does not
# reference this constant; main() spells out 'utf-8' directly.
CODEC='utf-8'
def bwd_mm_seg(wordDict, maxLen, str):
    """Segment text with backward (reverse) maximum matching.

    The text is scanned right to left. At each position the longest suffix
    of at most *maxLen* characters that is contained in *wordDict* is taken
    as the next segment; when no candidate of two or more characters
    matches, a single character is emitted.

    Args:
        wordDict: Container of known words; only ``in`` lookups are used.
        maxLen: Maximum candidate length to try. Must be at least 1.
        str: The text to segment. The name shadows the builtin and is kept
            only so existing keyword callers keep working.

    Returns:
        The list of segments in text order (not scan order); joining them
        reproduces the input text.

    Raises:
        ValueError: If *maxLen* is smaller than 1. Such a value can never
            advance the scan, which previously looped forever.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    wordList = []
    end = len(str)
    while end > 0:
        # Start from the longest allowed candidate ending at ``end``.
        wordLen = min(maxLen, end)
        # Shrink from the left until a dictionary word is found or only a
        # single character is left (single characters are always accepted).
        while wordLen > 1 and str[end - wordLen:end] not in wordDict:
            wordLen -= 1
        wordList.append(str[end - wordLen:end])
        end -= wordLen
    # Segments were collected from the end of the text towards the start.
    wordList.reverse()
    return wordList
def main():
    """Load ``words.dic`` and print a backward-maximum-matching demo.

    The dictionary file is read as one word per line. The sample sentence
    is printed first, followed by its segments joined with ``==``.
    """
    wordDict = {}
    # Read raw bytes and decode explicitly so the result does not depend on
    # the interpreter version or the platform's default encoding; ``with``
    # guarantees the file handle is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            word = u(eachWord.strip(), 'utf-8')
            if word:  # skip blank lines instead of storing an empty word
                wordDict[word] = 1
    # Plain u'' literal: the ur'' prefix is a syntax error on Python 3 and
    # the text contains no backslashes, so the value is unchanged.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
来源:https://blog.csdn.net/Yan456jie/article/details/78788783
标签:python,正向,逆向,分词
0
投稿
猜你喜欢
Pytorch 数据加载与数据预处理方式
2021-06-12 11:07:05
ASP和SQL结合处理时间应用
2008-06-09 15:15:00
下拉列表两级连动的新方法(二)
2009-06-04 18:22:00
django实现登录时候输入密码错误5次锁定用户十分钟
2023-04-17 14:48:57
python微信跳一跳系列之自动计算跳一跳距离
2021-08-01 14:18:36
Python WSGI的深入理解
2021-04-20 21:48:25
在线HTML编辑器原理(eweb原理)
2009-01-08 12:25:00
Python连接字符串过程详解
2022-12-09 23:49:45
py中的目录与文件判别代码
2023-06-01 03:32:06
Python调用API接口实现人脸识别
2022-09-10 22:42:20
Golang使用lua脚本实现redis原子操作
2023-09-03 05:55:20
详解Python中的Descriptor描述符类
2021-10-16 10:10:35
python入门学习笔记分享
2023-01-29 17:46:16
python super的使用方法及实例详解
2023-08-19 11:04:49
如何用python给数据加上高斯噪声
2023-06-13 18:18:04
5个css布局的常见问题及解决方法
2009-11-19 13:21:00
Python中全局变量和局部变量的理解与区别
2022-11-13 15:35:14
详解python读写json文件
2022-11-01 16:18:53
Pycharm使用时会出现的问题之cv2无法安装解决
2022-12-26 06:24:49
如何编写一个高效的国税系统通讯录数据库?
2009-11-07 18:53:00