基本信息
源码名称:LDA一致性与困惑度分析
源码大小:3.29KB
文件格式:.py
开发语言:Python
更新时间:2024-08-20
友情提示:无需注册或充值,赞助后即可获取资源下载链接。

     嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300

本次赞助数额为: 2 元 
   源码介绍

LDA一致性与困惑度分析


import pandas as pd
import jieba
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor

# Load stopwords
def stopwordslist(filepath):
    """Load a stopword list from a UTF-8 text file.

    Args:
        filepath: Path to a file containing one stopword per line.

    Returns:
        list[str]: The stopwords in file order, stripped of surrounding
        whitespace (blank lines yield empty strings, as before).
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        # Iterate the file object directly; readlines() would needlessly
        # materialize the whole file as an intermediate list.
        return [line.strip() for line in file]

# Stopword list loaded once at import time from a hard-coded local path;
# data_cleaning() below reads this module-level name.
stopwords = stopwordslist('C:/Users/lenovo/Desktop/lda_results/停用词.txt')

# Define data cleaning function
def data_cleaning(content_list):
    content_seg = []
    symbols = set('-\\n~%≥℃|/​``​↓#~_「♂!?\',、:;。《》()()·—.…,0123456789abcdefghijklnmopqrstuvwxyz')

    for content in content_list:
        content = ''.join([' ' if con in symbols else con for con in content])
        con_list = jieba.cut(content, cut_all=False)
        result_list = [con for con in con_list if con not in stopwords and con.strip()]
        content_seg.append(' '.join(result_list))

    return content_seg

# Read text data line by line (one document per line, hard-coded local path).
with open('C:/Users/lenovo/Desktop/lda_results/待分析数据.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Segment text and remove stopwords; participle[i] is the cleaned,
# space-joined token string for lines[i].
participle = data_cleaning(lines)
# Keep raw text and its cleaned form side by side for inspection.
df = pd.DataFrame({'文章内容': lines, '文章内容去停用词分词结果': participle})

# Build dictionary and bag-of-words model.
# Splitting on whitespace recovers the token lists produced by data_cleaning.
train_set = df['文章内容去停用词分词结果'].apply(lambda x: x.split())
dictionary = corpora.Dictionary(train_set)
corpus = [dictionary.doc2bow(text) for text in train_set]

# Define function to compute perplexity and coherence
def compute_metrics(num_topics):
    """Train one LDA model and score it for a given topic count.

    Uses the module-level ``corpus``, ``dictionary`` and ``train_set``.
    ``random_state=1`` keeps each fit reproducible across runs/workers.

    Args:
        num_topics: Number of LDA topics to fit.

    Returns:
        tuple: ``(num_topics, log_perplexity, c_v_coherence)``.
        NOTE(review): ``log_perplexity`` is gensim's per-word likelihood
        bound (typically negative), not an exponentiated perplexity.
    """
    model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                     passes=5, random_state=1)
    perplexity = model.log_perplexity(corpus)
    coherence = CoherenceModel(model=model, texts=train_set,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
    return num_topics, perplexity, coherence

if __name__ == "__main__":
    # Compute perplexity and coherence in parallel
    num_topics_range = range(2, 11)
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(compute_metrics, num_topics_range))

    # Extract results
    num_topics_list, perplexity_scores, coherence_scores = zip(*results)

    # Plot perplexity and coherence curves on one graph with two y-axes
    fig, ax1 = plt.subplots(figsize=(12, 6))  # Adjust the figure size for a suitable aspect ratio

    color = 'tab:blue'
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Perplexity', color=color)
    ax1.plot(num_topics_list, perplexity_scores, marker='o', color=color, linestyle='-')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  
    color = 'tab:orange'
    ax2.set_ylabel('Coherence', color=color)
    ax2.plot(num_topics_list, coherence_scores, marker='o', color=color, linestyle='--')
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()  
    plt.title('Perplexity and Coherence vs. Number of Topics')

    # Save the plot as a PDF
    plt.savefig('C:/Users/lenovo/Desktop/lda_results/perplexity_coherence3.pdf', format='pdf')
    plt.show()