from collections import defaultdict, Counter
d_a = defaultdict(list)  # transition counts: previous tag -> tags that followed it
d_b = defaultdict(list)  # emission counts: word -> tags it was seen with
cnt = 0  # number of tokens whose word/tag split failed
with open("../pos.txt", "r") as f:
    for line in f:
        line = line.strip()
        pre = "#"  # sentinel tag for the sentence start, so the first word's tag is conditioned on something reliable
        for word in line.split():
            try:
                a, b = word.split("/")
                d_a[pre].append(b)
                d_b[a].append(b)  # record the emission inside the try, so a failed split adds nothing stale
                pre = b
            except ValueError:
                cnt += 1
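# A minimal sketch of what the counting loop produces on one hand-written
# line, assuming pos.txt holds Penn-Treebank-style "word/TAG" tokens, one
# sentence per line (the file itself is not shown, so this format is an
# assumption; the demo_* names are only for illustration):
demo_a, demo_b, demo_pre = defaultdict(list), defaultdict(list), "#"
for token in "The/DT dog/NN barks/VBZ".split():
    w, t = token.split("/")
    demo_a[demo_pre].append(t)  # tag bigram: previous tag -> current tag
    demo_b[w].append(t)         # emission: word -> tag
    demo_pre = t
print(dict(demo_a))  # {'#': ['DT'], 'DT': ['NN'], 'NN': ['VBZ']}
print(dict(demo_b))  # {'The': ['DT'], 'dog': ['NN'], 'barks': ['VBZ']}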
# A: the state-transition matrix, estimated from the bigram counts
pro_a = {}
for pre, cx in d_a.items():
    a = set(cx)
    if len(a) == 1:
        pro_a[pre] = {cx[0]: 1}
    else:
        tmp = Counter(cx)
        pro_a[pre] = {}
        for x in a:
            pro_a[pre][x] = tmp[x] / len(cx)
# print(pro_a)
# for pre, cx in pro_a.items():
#     print(sum(cx.values()))  # each row should sum to 1
# B: a per-word tag distribution (strictly speaking this is P(tag | word),
# not the textbook HMM emission probability P(word | tag), but it is what
# the decoder below consumes)
pro_b = {}
for word, cx in d_b.items():
    a = set(cx)
    if len(a) == 1:
        pro_b[word] = {cx[0]: 1}
    else:
        tmp = Counter(cx)
        pro_b[word] = {}
        for x in a:
            pro_b[word][x] = tmp[x] / len(cx)
# print(pro_b)
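# The commented-out check above can be made explicit: every row of the
# transition table A and every per-word distribution in B should sum to 1.
for table in (pro_a, pro_b):
    for key, dist in table.items():
        assert abs(sum(dist.values()) - 1) < 1e-9, key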
test_data = "The time that my journey takes is long and the way of it long"
# Worked example, tagging the first word:
#   The sentence-start sentinel `#` has initial probability 1.
#   Its transition table is pro_a["#"]; enumerate the candidate tags x and take the max of
#   [pro_a["#"][x] * pro_b[word][x] for x in pro_b[word].keys()]
#   Append the most likely tag to res.
# Worked example, tagging the second word:
#   pre = {...} holds the tag-probability table of the first word.
#   Using the transition table pro_a and the current word's tag table pro_b[word], enumerate x and take the max of
#   [sum(pre[pre_x] * pro_a[pre_x][x] for pre_x in ...) * pro_b[word][x] for x in pro_b[word].keys()]
#   Then store the resulting tag probabilities for use on the next word.
def viterbi(data):
    # NB: this decoder sums over previous tags (forward-algorithm style) and
    # greedily commits to the best tag at each position; exact Viterbi with
    # backpointers is sketched after the output below.
    res = []
    data = data.split()
    first = data[0]
    # Handle the first word: condition on the sentence-start sentinel "#"
    pre = {}
    max_p, cur = float("-inf"), None
    for x in pro_b[first].keys():
        p = pro_a["#"].get(x, 0) * pro_b[first][x]
        pre[x] = p
        if p > max_p:
            max_p = p
            cur = x
    res.append(cur)
    # Handle the remaining words
    for word in data[1:]:
        # print(pro_b[word])
        new_pre = {}
        if len(pro_b[word]) == 1:
            # Unambiguous word: its only tag is certain
            only_tag = next(iter(pro_b[word]))
            res.append(only_tag)
            new_pre[only_tag] = 1
            pre = new_pre
            continue
        max_p, cur = float("-inf"), None
        for x in pro_b[word].keys():
            # sum over all previous tags, then weight by the word's tag probability
            p = sum(pre[pre_x] * pro_a[pre_x].get(x, 0) for pre_x in pre.keys()) * pro_b[word][x]
            new_pre[x] = p
            if p > max_p:
                max_p = p
                cur = x
        res.append(cur)
        pre = new_pre
    print("Output: ")
    print(*zip(data, res))
viterbi(test_data)
Output: ('The', 'DT') ('time', 'NN') ('that', 'IN') ('my', 'PRP$') ('journey', 'NN') ('takes', 'VBZ') ('is', 'VBZ') ('long', 'RB') ('and', 'CC') ('the', 'DT') ('way', 'NN') ('of', 'IN') ('it', 'PRP') ('long', 'RB')
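# The decoder above approximates Viterbi: it sums over previous tags and then
# greedily picks a tag per position, so it cannot revise an early choice.
# Exact Viterbi maximizes over the previous tag and keeps backpointers so the
# single best tag *sequence* is recovered. A minimal sketch reusing the same
# pro_a/pro_b tables (viterbi_exact is a hypothetical name, not part of the
# code above):
def viterbi_exact(sentence):
    words = sentence.split()
    # score[x]: best probability of any tag path ending in tag x at the current word
    score = {x: pro_a["#"].get(x, 0) * pro_b[words[0]][x] for x in pro_b[words[0]]}
    back = []  # back[i][x]: best previous tag given tag x at word i+1
    for word in words[1:]:
        new_score, pointers = {}, {}
        for x in pro_b[word]:
            # maximize (not sum) over the previous tag
            best_prev = max(score, key=lambda p: score[p] * pro_a[p].get(x, 0))
            new_score[x] = score[best_prev] * pro_a[best_prev].get(x, 0) * pro_b[word][x]
            pointers[x] = best_prev
        score = new_score
        back.append(pointers)
    # follow the backpointers from the best final tag
    tag = max(score, key=score.get)
    path = [tag]
    for pointers in reversed(back):
        tag = pointers[tag]
        path.append(tag)
    return list(zip(words, reversed(path)))

print(*viterbi_exact(test_data))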
from stanfordcorenlp import StanfordCoreNLP
# Point the wrapper at the directory where Stanford CoreNLP was unpacked
nlp = StanfordCoreNLP(r'C:\Users\User\Downloads\stanford-corenlp-latest\stanford-corenlp-4.3.2')
# Input sentence
sentence = test_data
print('Part of Speech:', nlp.pos_tag(sentence))
nlp.close()  # shut down the CoreNLP server the wrapper launched
Part of Speech: [('The', 'DT'), ('time', 'NN'), ('that', 'WDT'), ('my', 'PRP$'), ('journey', 'NN'), ('takes', 'VBZ'), ('is', 'VBZ'), ('long', 'JJ'), ('and', 'CC'), ('the', 'DT'), ('way', 'NN'), ('of', 'IN'), ('it', 'PRP'), ('long', 'RB')]
# The two taggers disagree in two places: CoreNLP tags 'that' as WDT (ours: IN) and the first 'long' as JJ (ours: RB).
!pip show stanfordcorenlp
Name: stanfordcorenlp
Version: 3.9.1.1
Summary: Python wrapper for Stanford CoreNLP.
Home-page: https://github.com/Lynten/stanford-corenlp
Author: Lynten Guo
Author-email: 1216920263@qq.com
License: MIT License
Location: c:\programdata\anaconda3\lib\site-packages
Requires: requests, psutil
Required-by: