import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from io import BytesIO
import base64
# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
def remove_timestamps(text):
    # Drop 'MM:SS' timestamp lines together with the single line that follows them
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
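# Illustrative behavior of the regex above (the strings are hypothetical):
#   remove_timestamps("12:34\nCaption line\nBody text\n")  ->  "Body text\n"
# i.e. both the timestamp line and the line immediately after it are removed.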
def extract_high_information_words(text, top_n=10):
    # Tokenize, lowercase, keep alphabetic tokens only, drop English stopwords,
    # then return the top_n most frequent remaining words
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
def cluster_sentences(sentences, num_clusters):
    # Filter out very short sentences (10 characters or fewer)
    sentences = [sentence for sentence in sentences if len(sentence) > 10]
    # Vectorize the sentences with TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    # Perform k-means clustering (n_init pinned explicitly, since newer
    # scikit-learn versions changed the default)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(X)
    # Centroid of each cluster
    cluster_centers = kmeans.cluster_centers_
    # Group sentences by cluster and score each against its cluster centroid
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label + 1], X[i:i + 1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))
    # Order sentences within each cluster by similarity to the centroid (descending)
    for cluster in clustered_sentences:
        cluster.sort(key=lambda pair: pair[0], reverse=True)
    # Return the ordered clustered sentences without similarity scores for display
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
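# Usage sketch (illustrative; the sentences are hypothetical):
#   clusters = cluster_sentences(["The cat sat on the mat today.",
#                                 "A cat was sitting on a mat.",
#                                 "Stocks rallied sharply this quarter."],
#                                num_clusters=2)
# Since TfidfVectorizer L2-normalises each row, the linear_kernel score above
# is proportional to cosine similarity within a cluster, so sentences are
# effectively ranked by how close they sit to their cluster's centroid.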
# Function to convert text to a downloadable file link
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    buffer = BytesIO()
    buffer.write(text_to_download.encode())
    buffer.seek(0)
    b64 = base64.b64encode(buffer.read()).decode()
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
    return href
# Main code for UI
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""
if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    # Split on '.' and keep non-trivial sentences (more than 10 characters)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
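    # extract_high_information_words is defined above but never called in the
    # original UI; this expander is a minimal, illustrative hookup (top_n=10
    # mirrors the helper's default).
    with st.expander("🔍 High-Information Words"):
        top_words = extract_high_information_words(text_without_timestamps, top_n=10)
        st.write(", ".join(top_words))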
    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)
        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)
            # Input for a custom download filename
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
            # Download button
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)
st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")