英文文本构建语义网络图

提出问题

最近想对一些文档做可视化展示，奈何不少软件需要付费，且展示效果不尽如人意。因此，结合之前做过的一些可视化研究，用 Python 重新实现了语义网络的可视化。

解决问题

原理：去除停用词后，在大小为 3 的滑动窗口内统计词对的共现次数，只保留共现次数较高的词对（这里是 >=4 次）作为语义网络图中的边。

import docx
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import networkx as nx
import matplotlib.pyplot as plt

def read_word_file(file_path):
    """Return the text of a Word document as a single string.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The text of every entry in ``doc.paragraphs``, each followed by one
        space (so a non-empty document yields a trailing space).
    """
    document = docx.Document(file_path)
    # Build the result in one pass instead of growing a string step by step.
    return "".join(paragraph.text + " " for paragraph in document.paragraphs)

def preprocess_text(text):
    """Tokenize *text* and keep only the content-bearing words.

    The text is lower-cased before tokenizing. A token survives only when it
    is purely alphabetic (this drops punctuation and numbers) and is not in
    NLTK's English stopword list.

    Args:
        text: Raw text to process.

    Returns:
        List of kept tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())

    # A set makes the per-token stopword lookup cheap.
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        if not token.isalpha():
            continue
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def calculate_co_occurrence(words, window_size=3):
    """Count ordered word pairs that appear close to each other.

    For every position ``i`` the word at ``i`` is paired with each of the
    next ``window_size`` words. Pairs are ordered ``(earlier, later)``, so
    ``("a", "b")`` and ``("b", "a")`` are counted separately. A word is never
    paired with itself.

    Args:
        words: Sequence of tokens (must support slicing, e.g. a list).
        window_size: How many following words each word is paired with.
            Defaults to 3; a value below 1 yields an empty result.

    Returns:
        Dict mapping ``(word, next_word)`` tuples to their co-occurrence
        count.
    """
    co_occurrence = {}

    for i, word in enumerate(words):
        # The slice stops at the end of the sequence on its own, so no
        # explicit bounds check is needed for the last few words.
        for next_word in words[i + 1:i + 1 + window_size]:
            if word != next_word:
                key = (word, next_word)
                co_occurrence[key] = co_occurrence.get(key, 0) + 1

    return co_occurrence

def create_semantic_network(co_occurrence, min_count=4):
    """Build a directed, weighted graph from frequent co-occurrence pairs.

    Args:
        co_occurrence: Mapping of ``(word1, word2)`` tuples to counts.
        min_count: Smallest count a pair must reach to become an edge. The
            default of 4 is the same cut-off as the previous hard-coded
            ``> 3`` test for integer counts.

    Returns:
        An ``nx.DiGraph`` with one edge ``word1 -> word2`` per kept pair; the
        count is stored in the edge's ``weight`` attribute. Words that take
        part in no kept pair do not appear in the graph.
    """
    graph = nx.DiGraph()

    for (word1, word2), count in co_occurrence.items():
        # Rare pairs are skipped so that only strong relations are drawn.
        if count >= min_count:
            graph.add_edge(word1, word2, weight=count)

    return graph

def cluster_graph(graph):
    """Map every node to the index of the community it was placed in.

    Communities come from networkx's ``greedy_modularity_communities``
    (greedy modularity maximisation); the index is the community's position
    in the sequence that function returns.

    Args:
        graph: The networkx graph to partition.

    Returns:
        Dict mapping node -> integer community index.
    """
    communities = nx.algorithms.community.modularity_max.greedy_modularity_communities(graph)

    membership = {}
    for community_index, members in enumerate(communities):
        for node in members:
            membership[node] = community_index
    return membership

def visualize_semantic_network(graph, central_words, clusters, output_path='semantic_network.png'):
    """Draw the semantic network, save it as an image and show it.

    Nodes are coloured by cluster index, edges are labelled with their
    ``weight`` attribute, and every central word that is present in the graph
    is overdrawn as a large red square.

    Args:
        graph: networkx graph whose edges carry a ``weight`` attribute.
        central_words: Words to highlight; those not in the graph are ignored.
        clusters: Mapping node -> integer cluster index; must contain every
            node of ``graph``.
        output_path: File the figure is saved to (defaults to the previously
            hard-coded ``'semantic_network.png'``).
    """
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(graph, k=0.7, seed=42)  # Adjust 'k' for node dispersion

    # Edge labels show the co-occurrence count stored on each edge.
    edge_labels = {(word1, word2): graph[word1][word2]['weight'] for word1, word2 in graph.edges()}

    # One colour per cluster. ``default=0`` keeps an empty graph (no pair
    # reached the threshold) from raising ValueError in max().
    node_colors = [clusters[node] for node in graph.nodes()]
    cluster_count = max(node_colors, default=0) + 1
    cmap = plt.get_cmap('tab20', cluster_count)
    nx.draw(graph, pos, with_labels=True, node_size=300, node_color=node_colors, cmap=cmap,
            font_size=10, font_weight='bold', width=0.5, edge_color='gray', arrows=True, arrowstyle='->',
            connectionstyle='arc3,rad=0.2', alpha=0.7)

    # Highlight the central words. Only the node markers are drawn here, so
    # the edges and labels from the call above are not painted a second time.
    highlighted = [node for node in central_words if node in graph]
    nx.draw_networkx_nodes(graph, pos, nodelist=highlighted, node_size=1000, node_color='red',
                           node_shape='s', alpha=0.7)

    # Draw edge labels (co-occurrence counts)
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_size=8)

    # Save before show(): the figure may be cleared once the window closes.
    plt.savefig(output_path, dpi=300)
    plt.show()

# Script entry point: runs the full pipeline on one Word document

def main():
    """Run the pipeline: read, preprocess, count pairs, graph, cluster, draw."""
    file_path = "./data/xxx.docx"  # Replace with the actual path of your Word document

    # Document text -> filtered token list
    words = preprocess_text(read_word_file(file_path))

    # The three most frequent words are the ones highlighted in the plot
    central_words = [entry[0] for entry in FreqDist(words).most_common(3)]

    # Co-occurrence counts -> graph of the frequent pairs
    graph = create_semantic_network(calculate_co_occurrence(words))

    # Community index per node, used for node colours
    clusters = cluster_graph(graph)

    # Render, save and show the network
    visualize_semantic_network(graph, central_words, clusters)


if __name__ == "__main__":
    main()

结果

Post Views: 593

本作品采用知识共享署名 4.0 国际许可协议进行许可

英文文本构建语义网络图

提出问题

解决问题

结果

相关文章：

文章评论