import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from io import BytesIO
import base64
# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
def remove_timestamps(text):
    # Drop 'MM:SS' timestamp lines together with the single line that follows them
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
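# Illustrative behavior of the regex above (the strings are hypothetical):
#   remove_timestamps("12:34\nCaption line\nBody text\n")  ->  "Body text\n"
# i.e. both the timestamp line and the line immediately after it are removed.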
def extract_high_information_words(text, top_n=10):
    # Tokenize, lowercase, keep alphabetic tokens only, drop English stopwords,
    # then return the top_n most frequent remaining words
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
def cluster_sentences(sentences, num_clusters):
    # Filter out very short sentences (10 characters or fewer)
    sentences = [sentence for sentence in sentences if len(sentence) > 10]
    # Vectorize the sentences with TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    # Perform k-means clustering (n_init pinned explicitly, since newer
    # scikit-learn versions changed the default)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(X)
    # Centroid of each cluster
    cluster_centers = kmeans.cluster_centers_
    # Group sentences by cluster and score each against its cluster centroid
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label + 1], X[i:i + 1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))
    # Order sentences within each cluster by similarity to the centroid (descending)
    for cluster in clustered_sentences:
        cluster.sort(key=lambda pair: pair[0], reverse=True)
    # Return the ordered clustered sentences without similarity scores for display
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
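# Usage sketch (illustrative; the sentences are hypothetical):
#   clusters = cluster_sentences(["The cat sat on the mat today.",
#                                 "A cat was sitting on a mat.",
#                                 "Stocks rallied sharply this quarter."],
#                                num_clusters=2)
# Since TfidfVectorizer L2-normalises each row, the linear_kernel score above
# is proportional to cosine similarity within a cluster, so sentences are
# effectively ranked by how close they sit to their cluster's centroid.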
# Function to convert text to a downloadable file link
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    buffer = BytesIO()
    buffer.write(text_to_download.encode())
    buffer.seek(0)
    b64 = base64.b64encode(buffer.read()).decode()
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
    return href
# Main code for UI
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""
if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    # Split on '.' and keep non-trivial sentences (more than 10 characters)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
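    # extract_high_information_words is defined above but never called in the
    # original UI; this expander is a minimal, illustrative hookup (top_n=10
    # mirrors the helper's default).
    with st.expander("🔍 High-Information Words"):
        top_words = extract_high_information_words(text_without_timestamps, top_n=10)
        st.write(", ".join(top_words))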
    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)
        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)
            # Input for a custom download filename
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
            # Download button
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)
st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")