import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph

# Fetch the NLTK resources needed for tokenization and stop-word filtering.
nltk.download('punkt')
nltk.download('stopwords')
def remove_timestamps(text):
    # Remove timestamp markers such as "12:34" along with the line that follows them.
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
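
# Illustrative sketch (assumed transcript shape, not taken from the original file): given
# "0:01\nwelcome everyone\nso today we cover parsing\n", the substitution above drops the
# "0:01" line and the caption line right after it, leaving "so today we cover parsing\n".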

def process_text(text):
    # Turn non-empty lines into a markdown outline, alternating bold headings
    # with bulleted sub-points.
    lines = text.split("\n")
    processed_lines = []
    for line in lines:
        if line:
            processed_lines.append(line)
    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} 😄\n"
    return outline

def extract_high_information_words(text, top_n=10):
    # Tokenize, keep alphabetic tokens, drop English stop words, and return the
    # top_n most frequent remaining words.
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    high_information_words = [word for word, _ in freq_dist.most_common(top_n)]
    return high_information_words
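
# Illustrative sketch (hypothetical input): for "the cat sat on the mat and the cat slept",
# the filtered tokens are ['cat', 'sat', 'mat', 'cat', 'slept'], so a call with top_n=2
# returns ['cat', 'sat'] (ties after 'cat' keep first-seen order).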

def create_relationship_graph(words):
    # Build a simple chain: one node per word, each linked to the next word,
    # with edges labeled by position.
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph

def display_relationship_graph(words):
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)

uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])

if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)

    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)

    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)
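
# To try this outside of Spaces (assuming the file is saved as app.py and the
# streamlit, nltk, and graphviz Python packages are installed):
#   streamlit run app.py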