import streamlit as st
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import re
import base64
from graphviz import Digraph
from io import BytesIO
import networkx as nx
import matplotlib.pyplot as plt
# ... [Keep all the existing imports and configurations] ...
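# A hedged assumption: the elided configuration above likely fetches the NLTK
# stopword list once; without it, stopwords.words('english') raises LookupError.
nltk.download('stopwords', quiet=True)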
def get_txt_files():
    # Exclude specific files
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
    # List all .txt files excluding the ones in excluded_files
    txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
    # Create a dataframe with file names and full paths
    df = pd.DataFrame({
        'File Name': txt_files,
        'Full Path': [os.path.abspath(f) for f in txt_files]
    })
    return df
# ... [Keep all the existing functions] ...
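# --- Hedged sketches of the elided helpers referenced by the UI code below. ---
# The original implementations are not shown in this file; these are minimal
# assumptions reconstructed from how each function is called. Swap in the real
# definitions where they differ.

def remove_timestamps(text):
    # Assumption: transcript timestamps look like "00:01:23" or "[00:01:23]".
    return re.sub(r'\[?\d{1,2}:\d{2}(?::\d{2})?\]?', '', text)

def extract_high_information_words(text, top_n=10):
    # Assumption: "high information" means the most frequent non-stopword tokens.
    stop_words = set(stopwords.words('english'))
    words = [w for w in re.findall(r'[a-z]+', text.lower()) if w not in stop_words]
    return [word for word, _ in FreqDist(words).most_common(top_n)]

def extract_context_words(text, high_info_words):
    # Assumption: record the word immediately before and after each key word.
    words = re.findall(r'[a-z]+', text.lower())
    context = {}
    for i, word in enumerate(words):
        if word in high_info_words:
            before = words[i - 1] if i > 0 else ''
            after = words[i + 1] if i < len(words) - 1 else ''
            context.setdefault(word, []).append((before, after))
    return context

def display_relationship_graph(words):
    # Assumption: a simple graphviz chain over the top words; indices are used
    # as node IDs so repeated labels cannot collide.
    dot = Digraph()
    for i, word in enumerate(words):
        dot.node(str(i), word)
        if i > 0:
            dot.edge(str(i - 1), str(i))
    st.graphviz_chart(dot)

def display_context_graph(context_words):
    # Assumption: a networkx graph linking each key word to its neighbors.
    G = nx.Graph()
    for word, pairs in context_words.items():
        for before, after in pairs:
            if before:
                G.add_edge(before, word)
            if after:
                G.add_edge(word, after)
    fig, ax = plt.subplots(figsize=(8, 6))
    nx.draw(G, with_labels=True, node_color='lightblue', font_size=8, ax=ax)
    st.pyplot(fig)

def display_context_table(context_words):
    # Assumption: one row per (key word, before, after) occurrence.
    rows = [(w, b, a) for w, pairs in context_words.items() for b, a in pairs]
    st.dataframe(pd.DataFrame(rows, columns=['Word', 'Before', 'After']))

def cluster_sentences(sentences, num_clusters):
    # Assumption: TF-IDF vectors clustered with KMeans, matching the sklearn
    # imports at the top of this file.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    labels = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit_predict(X)
    clusters = [[] for _ in range(num_clusters)]
    for sentence, label in zip(sentences, labels):
        clusters[label].append(sentence)
    return clusters

def get_high_info_words_per_cluster(clustered_sentences, num_words=5):
    # Reuse the frequency heuristic above on each cluster's concatenated text.
    return [extract_high_information_words(' '.join(cluster), num_words)
            for cluster in clustered_sentences]

def plot_cluster_words(clustered_sentences):
    # Assumption: one bar chart of top word frequencies per cluster.
    stop_words = set(stopwords.words('english'))
    for i, cluster in enumerate(clustered_sentences):
        tokens = [w for w in re.findall(r'[a-z]+', ' '.join(cluster).lower())
                  if w not in stop_words]
        top = FreqDist(tokens).most_common(10)
        if not top:
            continue
        fig, ax = plt.subplots()
        ax.bar([w for w, _ in top], [c for _, c in top])
        ax.set_title(f'Cluster {i + 1} Top Words')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        st.pyplot(fig)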
# Main code for UI
st.title("πŸ“Ί Transcript Analysis πŸ“Š")
# Display dataframe of .txt files
txt_files_df = get_txt_files()
st.write("Available .txt files:")
st.dataframe(txt_files_df)
# Allow user to select a file from the dataframe
selected_file = st.selectbox("Select a file to process:", txt_files_df['File Name'])
if st.button(f"Process {selected_file}"):
    file_path = txt_files_df[txt_files_df['File Name'] == selected_file]['Full Path'].iloc[0]
    with open(file_path, 'r', encoding="utf-8") as file:
        file_text = file.read()
    # Process the selected file
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)
    with st.expander("πŸ“Š Top 10 High Information Words"):
        st.write(top_words)
    with st.expander("πŸ“ˆ Relationship Graph"):
        display_relationship_graph(top_words)
    context_words = extract_context_words(text_without_timestamps, top_words)
    with st.expander("πŸ”— Context Graph"):
        display_context_graph(context_words)
    with st.expander("πŸ“‘ Context Table"):
        display_context_table(context_words)
    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
    num_sentences = len(sentences)
    st.write(f"Total Sentences: {num_sentences}")
    # num_clusters comes from the slider defined above the button
    clustered_sentences = cluster_sentences(sentences, num_clusters)
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Original Text")
        original_text = "\n".join(sentences)
        st.text_area("Original Sentences", value=original_text, height=400)
    with col2:
        st.subheader("Clustered Text")
        clusters = ""
        clustered_text = ""
        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
        for i, cluster in enumerate(clustered_sentences):
            cluster_text = "\n".join(cluster)
            high_info_words = ", ".join(cluster_high_info_words[i])
            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
        st.text_area("Clusters", value=clusters, height=200)
        st.text_area("Clustered Sentences", value=clustered_text, height=200)
    # Verify that all sentences are accounted for in the clustered output
    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
    if set(sentences) == set(clustered_sentences_flat):
        st.write("βœ… All sentences are accounted for in the clustered output.")
    else:
        st.write("❌ Some sentences are missing in the clustered output.")
    plot_cluster_words(clustered_sentences)
st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")