import streamlit as st
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import re
import base64
from graphviz import Digraph
from io import BytesIO
import networkx as nx
import matplotlib.pyplot as plt
# ... [Keep all the existing imports and configurations] ...
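# A hedged assumption: the elided configuration above likely fetches the NLTK
# stopword list once; without it, stopwords.words('english') raises LookupError.
nltk.download('stopwords', quiet=True)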
def get_txt_files():
    # Exclude specific files
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
    # List all .txt files excluding the ones in excluded_files
    txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
    # Create a dataframe with file names and full paths
    df = pd.DataFrame({
        'File Name': txt_files,
        'Full Path': [os.path.abspath(f) for f in txt_files]
    })
    return df
# ... [Keep all the existing functions] ...
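# --- Hedged sketches of the elided helpers referenced by the UI code below. ---
# The original implementations are not shown in this file; these are minimal
# assumptions reconstructed from how each function is called. Swap in the real
# definitions where they differ.

def remove_timestamps(text):
    # Assumption: transcript timestamps look like "00:01:23" or "[00:01:23]".
    return re.sub(r'\[?\d{1,2}:\d{2}(?::\d{2})?\]?', '', text)

def extract_high_information_words(text, top_n=10):
    # Assumption: "high information" means the most frequent non-stopword tokens.
    stop_words = set(stopwords.words('english'))
    words = [w for w in re.findall(r'[a-z]+', text.lower()) if w not in stop_words]
    return [word for word, _ in FreqDist(words).most_common(top_n)]

def extract_context_words(text, high_info_words):
    # Assumption: record the word immediately before and after each key word.
    words = re.findall(r'[a-z]+', text.lower())
    context = {}
    for i, word in enumerate(words):
        if word in high_info_words:
            before = words[i - 1] if i > 0 else ''
            after = words[i + 1] if i < len(words) - 1 else ''
            context.setdefault(word, []).append((before, after))
    return context

def display_relationship_graph(words):
    # Assumption: a simple graphviz chain over the top words; indices are used
    # as node IDs so repeated labels cannot collide.
    dot = Digraph()
    for i, word in enumerate(words):
        dot.node(str(i), word)
        if i > 0:
            dot.edge(str(i - 1), str(i))
    st.graphviz_chart(dot)

def display_context_graph(context_words):
    # Assumption: a networkx graph linking each key word to its neighbors.
    G = nx.Graph()
    for word, pairs in context_words.items():
        for before, after in pairs:
            if before:
                G.add_edge(before, word)
            if after:
                G.add_edge(word, after)
    fig, ax = plt.subplots(figsize=(8, 6))
    nx.draw(G, with_labels=True, node_color='lightblue', font_size=8, ax=ax)
    st.pyplot(fig)

def display_context_table(context_words):
    # Assumption: one row per (key word, before, after) occurrence.
    rows = [(w, b, a) for w, pairs in context_words.items() for b, a in pairs]
    st.dataframe(pd.DataFrame(rows, columns=['Word', 'Before', 'After']))

def cluster_sentences(sentences, num_clusters):
    # Assumption: TF-IDF vectors clustered with KMeans, matching the sklearn
    # imports at the top of this file.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    labels = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit_predict(X)
    clusters = [[] for _ in range(num_clusters)]
    for sentence, label in zip(sentences, labels):
        clusters[label].append(sentence)
    return clusters

def get_high_info_words_per_cluster(clustered_sentences, num_words=5):
    # Reuse the frequency heuristic above on each cluster's concatenated text.
    return [extract_high_information_words(' '.join(cluster), num_words)
            for cluster in clustered_sentences]

def plot_cluster_words(clustered_sentences):
    # Assumption: one bar chart of top word frequencies per cluster.
    stop_words = set(stopwords.words('english'))
    for i, cluster in enumerate(clustered_sentences):
        tokens = [w for w in re.findall(r'[a-z]+', ' '.join(cluster).lower())
                  if w not in stop_words]
        top = FreqDist(tokens).most_common(10)
        if not top:
            continue
        fig, ax = plt.subplots()
        ax.bar([w for w, _ in top], [c for _, c in top])
        ax.set_title(f'Cluster {i + 1} Top Words')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        st.pyplot(fig)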
# Main code for UI
st.title("πŸ“Ί Transcript Analysis πŸ“Š")
# Display dataframe of .txt files
txt_files_df = get_txt_files()
st.write("Available .txt files:")
st.dataframe(txt_files_df)
# Allow user to select a file from the dataframe
selected_file = st.selectbox("Select a file to process:", txt_files_df['File Name'])
if st.button(f"Process {selected_file}"):
    file_path = txt_files_df[txt_files_df['File Name'] == selected_file]['Full Path'].iloc[0]
    with open(file_path, 'r', encoding="utf-8") as file:
        file_text = file.read()
    # Process the selected file
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)
    with st.expander("πŸ“Š Top 10 High Information Words"):
        st.write(top_words)
    with st.expander("πŸ“ˆ Relationship Graph"):
        display_relationship_graph(top_words)
    context_words = extract_context_words(text_without_timestamps, top_words)
    with st.expander("πŸ”— Context Graph"):
        display_context_graph(context_words)
    with st.expander("πŸ“‘ Context Table"):
        display_context_table(context_words)
    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
    num_sentences = len(sentences)
    st.write(f"Total Sentences: {num_sentences}")
    # num_clusters comes from the slider defined above the button
    clustered_sentences = cluster_sentences(sentences, num_clusters)
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Original Text")
        original_text = "\n".join(sentences)
        st.text_area("Original Sentences", value=original_text, height=400)
    with col2:
        st.subheader("Clustered Text")
        clusters = ""
        clustered_text = ""
        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
        for i, cluster in enumerate(clustered_sentences):
            cluster_text = "\n".join(cluster)
            high_info_words = ", ".join(cluster_high_info_words[i])
            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
        st.text_area("Clusters", value=clusters, height=200)
        st.text_area("Clustered Sentences", value=clustered_text, height=200)
    # Verify that all sentences are accounted for in the clustered output
    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
    if set(sentences) == set(clustered_sentences_flat):
        st.write("βœ… All sentences are accounted for in the clustered output.")
    else:
        st.write("❌ Some sentences are missing in the clustered output.")
    plot_cluster_words(clustered_sentences)
st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")