如何解决计算 n-grammes 困惑度(perplexity)后结果很奇怪的问题
希望您能帮助我解决这个问题;
所以我训练了3个n_grammes
模型(n=1,2,3)
,但在计算困惑度时得到了与预期不符的奇怪结果(unigram 模型的困惑度反而低于 bigram 和 trigram 模型……)。
这是代码
'''
# Paths to the training corpus and the two test files.
proverbs_fn = "./data/proverbes.txt"
test1_fn = "./data/test_proverbes1.txt"
test_reponses_fn = "./data/reponses_du_test.txt"

# Accumulators for the n-grams of each order (filled by train_models).
n_grammes1, n_grammes2, n_grammes3 = [], [], []

# One Laplace-smoothed language model per n-gram order.
model1, model2, model3 = Laplace(1), Laplace(2), Laplace(3)
def load_proverbs(filename):
    """Read the proverb corpus and return one whitespace-stripped proverb per line."""
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def load_tests(filename):
    """Parse the JSON test file and return the deserialized content."""
    with open(filename, encoding='utf-8') as fp:
        return json.load(fp)
def load_tests_reponses(filename):
    """Read the expected-answer file and return one stripped answer per line."""
    with open(filename, encoding='utf-8') as f:
        return [answer.strip() for answer in f.readlines()]
def train_models(filename):
    """Train the three Laplace n-gram models (n = 1, 2, 3) on the proverb corpus.

    Each proverb is tokenized, padded with <s>/</s> markers for the model
    order, and split into n-grams; the accumulated n-grams of each order
    are then used to fit the corresponding model.

    The original version repeated the same tokenize/pad/collect loop three
    times, once per order — the loop is factored into a single helper here;
    the resulting n-gram counts are identical.
    """
    proverbs = load_proverbs(filename)

    def corpus_ngrams(order):
        # All n-grams of the given order over every padded proverb.
        grams = []
        for proverb in proverbs:
            tokens = word_tokenize(proverb)
            grams.extend(ngrams(pad_both_ends(tokens, n=order), n=order))
        return grams

    n_grammes1.extend(corpus_ngrams(1))
    n_grammes2.extend(corpus_ngrams(2))
    n_grammes3.extend(corpus_ngrams(3))

    vocabulary = build_vocabulary(proverbs)
    # NOTE(review): fit() is given the whole corpus as a single "sentence";
    # the n-gram counts are unchanged by this flattening, but a list of
    # per-sentence n-gram lists is the more conventional nltk.lm input.
    model1.fit([n_grammes1], vocabulary_text=vocabulary)
    model2.fit([n_grammes2], vocabulary_text=vocabulary)
    model3.fit([n_grammes3], vocabulary_text=vocabulary)
print("\n probabilites des n-grammes (n= 1,3) :\n")
print("\t", model1.score("mentir", context=None))
print("\t", model2.score("mentir", ["beau"]))
print("\t", model3.score("mentir", ("a", "beau")))

print("\n perplexites des n-grammes (n= 1,3) : \n")
# Perplexity must be evaluated on CONTIGUOUS n-grams of the same order as
# the model.  The original lines were mangled tuple literals (a syntax
# error), and the trigram call used a single non-contiguous word triple —
# which is one reason the reported perplexities look inconsistent.
test_tokens = ["aide-toi", ",", "le", "ciel", "t"]
print("\t", model1.perplexity(list(ngrams(test_tokens, 1))))
print("\t", model2.perplexity(list(ngrams(test_tokens, 2))))
print("\t", model3.perplexity(list(ngrams(test_tokens, 3))))
'''
结果是:
n 元语法 (n-gram) 的概率 (n = 1..3):
0.00015090922809929828
0.00044375416019525185
0.00044672771945499217
n 元语法 (n-gram) 的困惑度 (n = 1..3):
600.345333247329
551.5106526623035
2237.4997765363005
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。