Python正向最大匹配分词和逆向最大匹配分词的实例

作者:yan456jie 时间:2021-11-24 22:39:58 

正向最大匹配


# -*- coding:utf-8 -*-

# Name of a text encoding, kept as a module-level constant.
# NOTE(review): nothing in the code shown here references CODEC; the
# functions below pass the literal 'utf-8' instead.
CODEC='utf-8'

def u(s, encoding):
    """Return *s* as a text (unicode) string.

    If *s* is already text it is returned unchanged; otherwise it is decoded
    with *encoding*.

    Args:
        s: A text string or an encoded byte string.
        encoding: Codec name used to decode *s* when it is not text yet.

    Returns:
        The text form of *s* (``unicode`` on Python 2, ``str`` on Python 3).
    """
    # ``unicode`` only exists on Python 2. On Python 3 ``str`` is the text
    # type and ``str(bytes, encoding)`` performs the same decoding, so the
    # helper behaves identically on both interpreters.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)

def fwd_mm_seg(wordDict, maxLen, str, verbose=True):
    """Segment text with the forward maximum-matching algorithm.

    Starting at the left end of the text, take the longest candidate (at most
    ``maxLen`` characters) and shorten it from the right until it is found in
    ``wordDict``. A candidate that shrinks to a single character is emitted
    as-is without a dictionary lookup. Repeat on the remaining text.

    Args:
        wordDict: Container of known words; only iteration and ``in`` are used.
        maxLen: Maximum number of characters in one candidate word (>= 1).
        str: The text to segment.
        verbose: When true (the default), print the dictionary and every
            matching step to stdout, as the function has always done.

    Returns:
        The list of segments in left-to-right order.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1. Such a value would never
            consume any input and the loop would not terminate.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    text = str  # local alias; the parameter name shadows the builtin
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    if verbose:
        for word in wordDict:
            print("word:  %s" % (word,))
        print("\n")

    wordList = []
    total = len(text)
    pos = 0  # index of the first character not yet segmented
    while pos < total:
        wordLen = min(maxLen, total - pos)
        subStr = text[pos:pos + wordLen]
        if verbose:
            print("subStr:  %s" % (subStr,))
        # Drop characters from the right until the candidate is a known word
        # or only one character is left.
        while wordLen > 1:
            if subStr in wordDict:
                if verbose:
                    print("subStr1: %r" % (subStr,))
                break
            if verbose:
                print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[:wordLen]
        wordList.append(subStr)
        pos += wordLen

    if verbose:
        for wordstr in wordList:
            print("wordstr:  %s" % (wordstr,))
    return wordList

def main():
    """Load the word list from ``words.dic`` and segment a sample sentence.

    Reads one word per line from ``words.dic`` (UTF-8), runs forward
    maximum-matching with a maximum word length of 10 on a fixed sample
    string, and prints the segments joined by ``==``.
    """
    import io  # local import: this script has no top-level import section

    wordDict = {}
    # io.open decodes while reading on both Python 2 and 3, and the
    # with-block closes the file handle that was previously left open.
    with io.open('words.dic', encoding='utf-8') as fp_dict:
        for eachWord in fp_dict:
            wordDict[eachWord.strip()] = 1
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()

逆向最大匹配


# -*- coding:utf-8 -*-

def u(s, encoding):
    """Return *s* as a text (unicode) string.

    If *s* is already text it is returned unchanged; otherwise it is decoded
    with *encoding*.

    Args:
        s: A text string or an encoded byte string.
        encoding: Codec name used to decode *s* when it is not text yet.

    Returns:
        The text form of *s* (``unicode`` on Python 2, ``str`` on Python 3).
    """
    # ``unicode`` only exists on Python 2. On Python 3 ``str`` is the text
    # type and ``str(bytes, encoding)`` performs the same decoding, so the
    # helper behaves identically on both interpreters.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)

# Name of a text encoding, kept as a module-level constant.
# NOTE(review): nothing in the code shown here references CODEC; the
# functions below pass the literal 'utf-8' instead.
CODEC='utf-8'

def bwd_mm_seg(wordDict, maxLen, str, verbose=True):
    """Segment text with the backward (reverse) maximum-matching algorithm.

    Starting at the right end of the text, take the longest candidate (at most
    ``maxLen`` characters) and shorten it from the left until it is found in
    ``wordDict``. A candidate that shrinks to a single character is emitted
    as-is without a dictionary lookup. Repeat on the text to the left, then
    reverse the collected segments so they read left to right.

    Args:
        wordDict: Container of known words; only iteration and ``in`` are used.
        maxLen: Maximum number of characters in one candidate word (>= 1).
        str: The text to segment.
        verbose: When true (the default), print the dictionary and every
            matching step to stdout, as the function has always done.

    Returns:
        The list of segments in left-to-right order.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1. Such a value would never
            consume any input and the loop would not terminate.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    text = str  # local alias; the parameter name shadows the builtin
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    if verbose:
        for word in wordDict:
            print("word:  %s" % (word,))
        print("\n")

    wordList = []
    end = len(text)  # exclusive end index of the text not yet segmented
    while end > 0:
        wordLen = min(maxLen, end)
        subStr = text[end - wordLen:end]
        if verbose:
            print("subStr:  %s" % (subStr,))
        # Drop characters from the left until the candidate is a known word
        # or only one character is left.
        while wordLen > 1:
            if subStr in wordDict:
                if verbose:
                    print("subStr1: %r" % (subStr,))
                break
            if verbose:
                print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[-wordLen:]
        wordList.append(subStr)
        end -= wordLen

    # Segments were collected right to left; restore reading order.
    wordList.reverse()
    if verbose:
        for wordstr in wordList:
            print("wordstr:  %s" % (wordstr,))
    return wordList

def main():
    """Load the word list from ``words.dic`` and segment a sample sentence.

    Reads one word per line from ``words.dic`` (UTF-8), runs backward
    maximum-matching with a maximum word length of 10 on a fixed sample
    string, and prints the segments joined by ``==``.
    """
    import io  # local import: this script has no top-level import section

    wordDict = {}
    # io.open decodes while reading on both Python 2 and 3, and the
    # with-block closes the file handle that was previously left open.
    with io.open('words.dic', encoding='utf-8') as fp_dict:
        for eachWord in fp_dict:
            wordDict[eachWord.strip()] = 1
    # The ``ur`` prefix is a syntax error on Python 3; the literal contains no
    # backslashes, so a plain ``u`` literal has exactly the same value.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()

来源:https://blog.csdn.net/Yan456jie/article/details/78788783

标签:python,正向,逆向,分词
0
投稿

猜你喜欢

  • Pytorch 数据加载与数据预处理方式

    2021-06-12 11:07:05
  • ASP和SQL结合处理时间应用

    2008-06-09 15:15:00
  • 下拉列表两级连动的新方法(二)

    2009-06-04 18:22:00
  • django实现登录时候输入密码错误5次锁定用户十分钟

    2023-04-17 14:48:57
  • python微信跳一跳系列之自动计算跳一跳距离

    2021-08-01 14:18:36
  • Python WSGI的深入理解

    2021-04-20 21:48:25
  • 在线HTML编辑器原理(eweb原理)

    2009-01-08 12:25:00
  • Python连接字符串过程详解

    2022-12-09 23:49:45
  • py中的目录与文件判别代码

    2023-06-01 03:32:06
  • Python调用API接口实现人脸识别

    2022-09-10 22:42:20
  • Golang使用lua脚本实现redis原子操作

    2023-09-03 05:55:20
  • 详解Python中的Descriptor描述符类

    2021-10-16 10:10:35
  • python入门学习笔记分享

    2023-01-29 17:46:16
  • python super的使用方法及实例详解

    2023-08-19 11:04:49
  • 如何用python给数据加上高斯噪声

    2023-06-13 18:18:04
  • 5个css布局的常见问题及解决方法

    2009-11-19 13:21:00
  • Python中全局变量和局部变量的理解与区别

    2022-11-13 15:35:14
  • 详解python读写json文件

    2022-11-01 16:18:53
  • Pycharm使用时会出现的问题之cv2无法安装解决

    2022-12-26 06:24:49
  • 如何编写一个高效的国税系统通讯录数据库?

    2009-11-07 18:53:00
  • asp之家 网络编程 m.aspxhome.com