"""Streamlit app: upload a .txt transcript, strip timestamps, and show the
top high-information words plus a simple word-relationship graph."""

import re

import nltk
import streamlit as st
from graphviz import Digraph
from nltk import FreqDist
from nltk.corpus import stopwords

# Tokenizer and stopword data needed by extract_high_information_words.
# quiet=True keeps the download progress out of the app's stdout.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# How many top-frequency words to report and graph.
TOP_N = 10


def remove_timestamps(text):
    """Remove timestamp lines (and the line that follows each one) from *text*.

    A timestamp line is a line consisting of ``H:MM`` or ``H:MM:SS``
    (1-2 digit hour). The original pattern was unanchored, so a time
    appearing at the end of a sentence (e.g. "meet at 12:30\\n...") would
    also trigger deletion of the next line; ``^`` + re.MULTILINE fixes that.
    The following line is still consumed, matching the original behavior
    of dropping the caption paired with each timestamp.
    """
    return re.sub(
        r'^\d{1,2}:\d{2}(?::\d{2})?\n.*\n',
        '',
        text,
        flags=re.MULTILINE,
    )


def process_text(text):
    """Build a markdown outline from *text*: non-empty lines alternate
    between bold headings (even index) and emoji bullet items (odd index).

    NOTE(review): currently unused by the app below — kept for callers
    elsewhere / future use.
    """
    processed_lines = [line for line in text.split("\n") if line]
    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} 😄\n"
    return outline


def extract_high_information_words(text, top_n=10):
    """Return the *top_n* most frequent alphabetic, non-stopword tokens.

    Tokens are lowercased; English stopwords are filtered out before
    counting with an NLTK FreqDist.
    """
    words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in words if w not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]


def create_relationship_graph(words):
    """Build a Graphviz Digraph chaining *words* in order.

    Node ids are list indices; each consecutive pair gets an edge labeled
    with the target index.
    """
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph


def display_relationship_graph(words):
    """Render the word-chain graph in the Streamlit page."""
    st.graphviz_chart(create_relationship_graph(words))


uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])
if uploaded_file:
    # errors="replace" keeps the app usable on files that are not valid
    # UTF-8 instead of crashing with UnicodeDecodeError.
    file_text = uploaded_file.read().decode("utf-8", errors="replace")
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, TOP_N)
    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)
    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)