# Transcript-EDA-NLTK / backup.app.py
import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
# Download the NLTK tokenizer models and stopword lists (no-op if already cached)
nltk.download('punkt')
nltk.download('stopwords')
def remove_timestamps(text):
    # Strip timestamp lines such as "12:34" along with the line that follows each one
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
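# A minimal sketch of the pattern's behavior on hypothetical input
# (each match spans a timestamp line plus the line right after it):
#
#   re.sub(r'\d{1,2}:\d{2}\n.*\n', '', "intro\n0:05\nwelcome everyone\noutro\n")
#   # -> "intro\noutro\n"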
def process_text(text):
    # Turn the non-empty transcript lines into a Markdown outline that alternates
    # bold headings (even lines) with emoji bullet points (odd lines)
    lines = text.split("\n")
    processed_lines = []
    for line in lines:
        if line:
            processed_lines.append(line)
    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} 😄\n"
    return outline
def extract_high_information_words(text, top_n=10):
    # Tokenize, keep alphabetic tokens, drop English stopwords,
    # and return the top_n most frequent remaining words
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    high_information_words = [word for word, _ in freq_dist.most_common(top_n)]
    return high_information_words
def create_relationship_graph(words):
    # Build a simple chain graph: one node per word, each node linked to the previous one
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph
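# For illustration, a hypothetical three-word list yields DOT source roughly like:
#
#   create_relationship_graph(["data", "model", "graph"]).source
#   # digraph { 0 [label=data] 1 [label=model] 0 -> 1 [label=1] 2 [label=graph] 1 -> 2 [label=2] }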
def display_relationship_graph(words):
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)
# Streamlit app body: upload a transcript, strip timestamps, then show the
# top words and their relationship graph
uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])
if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)
    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)
    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)
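# Usage sketch, assuming the standard Streamlit CLI and a UTF-8 transcript file:
#
#   streamlit run backup.app.py
#
# The top-10 words are listed in descending frequency order, and the relationship
# graph simply chains them together in that order.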