# LDA coherence and perplexity analysis
# NOTE(review): removed scraped paywall/boilerplate prose that preceded the
# code — it was plain text (not Python) and made the file syntactically invalid.
import pandas as pd
import jieba
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
# Load stopwords
def stopwordslist(filepath):
    """Load a stopword list from *filepath* (UTF-8, one word per line).

    Blank lines are skipped: previously a trailing newline produced an
    empty string '' in the list, which silently bloated every membership
    test downstream.

    Parameters
    ----------
    filepath : str
        Path to the stopword file.

    Returns
    -------
    list of str
        Stripped, non-empty stopwords in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        # Iterate the file object directly instead of materializing
        # readlines(); strip whitespace and drop empty entries.
        return [line.strip() for line in file if line.strip()]
# Module-level stopword list consumed by data_cleaning(); loaded once at
# import time from a hard-coded local path (machine-specific).
stopwords = stopwordslist('C:/Users/lenovo/Desktop/lda_results/停用词.txt')
# Define data cleaning function
def data_cleaning(content_list):
    """Clean and tokenize raw Chinese documents.

    For each document: blank out punctuation/digit/Latin characters, run
    jieba segmentation, drop stopwords and whitespace-only tokens, and
    join the surviving tokens with single spaces.

    Parameters
    ----------
    content_list : iterable of str
        Raw documents, one string each.

    Returns
    -------
    list of str
        One space-joined token string per input document.
    """
    # Characters replaced by a space before segmentation (punctuation,
    # digits, Latin letters, plus some zero-width/invisible characters).
    symbols = set('-\\n~%≥℃|/​``​↓#~_「♂!?\',、:;。《》()()·—.…,0123456789abcdefghijklnmopqrstuvwxyz')
    # Single C-level translate pass instead of a per-character comprehension.
    blank_table = str.maketrans({sym: ' ' for sym in symbols})
    # The module-level `stopwords` is a list; converting it to a set once
    # turns each per-token membership test from O(len(stopwords)) into O(1).
    stopword_set = set(stopwords)
    content_seg = []
    for content in content_list:
        cleaned = content.translate(blank_table)
        tokens = jieba.cut(cleaned, cut_all=False)
        kept = [tok for tok in tokens if tok not in stopword_set and tok.strip()]
        content_seg.append(' '.join(kept))
    return content_seg
# Read text data line by line
# Read text data line by line (one document per line).
with open('C:/Users/lenovo/Desktop/lda_results/待分析数据.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
# Segment text and remove stopwords.
participle = data_cleaning(lines)
# Keep raw and cleaned text side by side for later inspection/export.
df = pd.DataFrame({'文章内容': lines, '文章内容去停用词分词结果': participle})
# Build dictionary and bag-of-words corpus for gensim.
# NOTE(review): this block deliberately runs at import time (outside the
# __main__ guard) so that each ProcessPoolExecutor worker rebuilds the
# `train_set`/`dictionary`/`corpus` globals used by compute_metrics when it
# re-imports the module — required under Windows "spawn", at the cost of
# redoing the jieba pass in every worker.
train_set = df['文章内容去停用词分词结果'].apply(lambda x: x.split())
dictionary = corpora.Dictionary(train_set)
corpus = [dictionary.doc2bow(text) for text in train_set]
# Define function to compute perplexity and coherence
def compute_metrics(num_topics):
    """Fit one LDA model with *num_topics* topics and score it.

    Relies on the module-level `corpus`, `dictionary`, and `train_set`
    globals (rebuilt in each worker process on module import).

    Returns
    -------
    tuple
        (num_topics, log_perplexity, c_v coherence). Note gensim's
        log_perplexity is a per-word log-likelihood bound, not the
        exponentiated perplexity.
    """
    model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=5,
        random_state=1,
    )
    log_perp = model.log_perplexity(corpus)
    c_v = CoherenceModel(
        model=model, texts=train_set, dictionary=dictionary, coherence='c_v'
    ).get_coherence()
    return num_topics, log_perp, c_v
if __name__ == "__main__":
    # Evaluate topic counts 2..10 in parallel, one LDA fit per process.
    topic_range = range(2, 11)
    with ProcessPoolExecutor() as pool:
        metrics = list(pool.map(compute_metrics, topic_range))

    # Unzip (num_topics, log_perplexity, coherence) triples into columns.
    topic_counts, perplexities, coherences = zip(*metrics)

    # One figure with twin y-axes: perplexity on the left (blue, solid),
    # coherence on the right (orange, dashed).
    fig, perp_ax = plt.subplots(figsize=(12, 6))
    perp_ax.set_xlabel('Number of Topics')
    perp_ax.set_ylabel('Perplexity', color='tab:blue')
    perp_ax.plot(topic_counts, perplexities, marker='o', color='tab:blue', linestyle='-')
    perp_ax.tick_params(axis='y', labelcolor='tab:blue')

    coh_ax = perp_ax.twinx()
    coh_ax.set_ylabel('Coherence', color='tab:orange')
    coh_ax.plot(topic_counts, coherences, marker='o', color='tab:orange', linestyle='--')
    coh_ax.tick_params(axis='y', labelcolor='tab:orange')

    fig.tight_layout()
    plt.title('Perplexity and Coherence vs. Number of Topics')
    # Save a vector (PDF) copy before showing the interactive window.
    plt.savefig('C:/Users/lenovo/Desktop/lda_results/perplexity_coherence3.pdf', format='pdf')
    plt.show()