基本信息
源码名称:LDA一致性与困惑度分析
源码大小:3.29KB
文件格式:.py
开发语言:Python
更新时间:2024-08-20
友情提示:无需注册或充值,赞助后即可获取资源下载链接。

     嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300

本次赞助数额为: 2 元 
   源码介绍

LDA一致性与困惑度分析


import pandas as pd
import jieba
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor

# Load stopwords
def stopwordslist(filepath):
    """Load a stopword list from a UTF-8 text file.

    Args:
        filepath: Path to a file containing one stopword per line.

    Returns:
        list[str]: The stopwords in file order, stripped of surrounding
        whitespace (blank lines yield empty strings, as before).
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        # Iterate the file object directly; readlines() would needlessly
        # materialize the whole file as an intermediate list.
        return [line.strip() for line in file]

# Stopword list loaded once at import time from a hard-coded local path;
# data_cleaning() below reads this module-level name.
stopwords = stopwordslist('C:/Users/lenovo/Desktop/lda_results/停用词.txt')

# Define data cleaning function
def data_cleaning(content_list):
    content_seg = []
    symbols = set('-\\n~%≥℃|/​``​↓#~_「♂!?\',、:;。《》()()·—.…,0123456789abcdefghijklnmopqrstuvwxyz')

    for content in content_list:
        content = ''.join([' ' if con in symbols else con for con in content])
        con_list = jieba.cut(content, cut_all=False)
        result_list = [con for con in con_list if con not in stopwords and con.strip()]
        content_seg.append(' '.join(result_list))

    return content_seg

# Read text data line by line (one document per line, hard-coded local path).
with open('C:/Users/lenovo/Desktop/lda_results/待分析数据.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Segment text and remove stopwords; participle[i] is the cleaned,
# space-joined token string for lines[i].
participle = data_cleaning(lines)
# Keep raw text and its cleaned form side by side for inspection.
df = pd.DataFrame({'文章内容': lines, '文章内容去停用词分词结果': participle})

# Build dictionary and bag-of-words model.
# Splitting on whitespace recovers the token lists produced by data_cleaning.
train_set = df['文章内容去停用词分词结果'].apply(lambda x: x.split())
dictionary = corpora.Dictionary(train_set)
corpus = [dictionary.doc2bow(text) for text in train_set]

# Define function to compute perplexity and coherence
def compute_metrics(num_topics):
    """Train one LDA model and score it for a given topic count.

    Uses the module-level ``corpus``, ``dictionary`` and ``train_set``.
    ``random_state=1`` keeps each fit reproducible across runs/workers.

    Args:
        num_topics: Number of LDA topics to fit.

    Returns:
        tuple: ``(num_topics, log_perplexity, c_v_coherence)``.
        NOTE(review): ``log_perplexity`` is gensim's per-word likelihood
        bound (typically negative), not an exponentiated perplexity.
    """
    model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                     passes=5, random_state=1)
    perplexity = model.log_perplexity(corpus)
    coherence = CoherenceModel(model=model, texts=train_set,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
    return num_topics, perplexity, coherence

if __name__ == "__main__":
    # Compute perplexity and coherence in parallel
    num_topics_range = range(2, 11)
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(compute_metrics, num_topics_range))

    # Extract results
    num_topics_list, perplexity_scores, coherence_scores = zip(*results)

    # Plot perplexity and coherence curves on one graph with two y-axes
    fig, ax1 = plt.subplots(figsize=(12, 6))  # Adjust the figure size for a suitable aspect ratio

    color = 'tab:blue'
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Perplexity', color=color)
    ax1.plot(num_topics_list, perplexity_scores, marker='o', color=color, linestyle='-')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  
    color = 'tab:orange'
    ax2.set_ylabel('Coherence', color=color)
    ax2.plot(num_topics_list, coherence_scores, marker='o', color=color, linestyle='--')
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()  
    plt.title('Perplexity and Coherence vs. Number of Topics')

    # Save the plot as a PDF
    plt.savefig('C:/Users/lenovo/Desktop/lda_results/perplexity_coherence3.pdf', format='pdf')
    plt.show()