import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
import base64

# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)

# Download required NLTK resources (tokenizer model and stopword list).
# Note: NLTK 3.9+ may also need the 'punkt_tab' resource for tokenization.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def remove_timestamps(text):
    # Strip lines that consist solely of a transcript timestamp, e.g. "0:01" or "12:34"
    return re.sub(r'^\d{1,2}:\d{2}\s*$\n?', '', text, flags=re.MULTILINE)
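
# A minimal doctest-style sketch of the expected behavior, assuming a
# YouTube-style transcript where each caption line follows a bare
# timestamp line (the sample text below is hypothetical):
#
#   >>> remove_timestamps("0:01\nhello world\n0:05\nmore text\n")
#   'hello world\nmore text\n'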

def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens in text."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
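
# Usage sketch (hypothetical input; ties between equal counts follow
# insertion order in FreqDist.most_common):
#
#   >>> extract_high_information_words("The cat sat on the mat. The cat slept.", top_n=2)
#   ['cat', 'sat']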

def cluster_sentences(sentences, num_clusters):
    # Keep only sentences longer than 10 characters
    sentences = [sentence for sentence in sentences if len(sentence) > 10]
    if not sentences:
        return []

    # Never ask KMeans for more clusters than there are sentences
    num_clusters = min(num_clusters, len(sentences))

    # Vectorize the sentences with TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Perform k-means clustering (n_init pinned for consistent behavior
    # across scikit-learn versions)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(X)

    # Centroid of each cluster in TF-IDF space
    cluster_centers = kmeans.cluster_centers_

    # Group sentences by cluster, recording each sentence's dot-product
    # similarity to its cluster centroid
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))

    # Order sentences within each cluster by similarity to the centroid (descending)
    for cluster in clustered_sentences:
        cluster.sort(reverse=True)

    # Return the ordered sentences without their similarity scores
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
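
# A minimal usage sketch with hypothetical sentences: two clusters should
# roughly separate the cat sentences from the dog sentences, each ordered
# by closeness to its cluster centroid.
#
#   >>> docs = ["Cats purr when they are happy.", "Dogs bark at strangers.",
#   ...         "Cats chase laser pointers around.", "Dogs fetch thrown balls."]
#   >>> clusters = cluster_sentences(docs, num_clusters=2)
#   >>> len(clusters)
#   2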

# Build an HTML anchor tag that downloads the given text as a file
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    b64 = base64.b64encode(text_to_download.encode()).decode()
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
    return href
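
# Note: Streamlit also ships a built-in widget that avoids raw HTML; a
# sketch of the roughly equivalent call (same data/filename as above):
#
#   st.download_button(label="💾 Save", data=text_to_download, file_name=filename)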

# Main code for UI
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])

if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    # Split into sentences with NLTK's punkt tokenizer and drop very short ones
    sentences = [sentence.strip() for sentence in nltk.sent_tokenize(text_without_timestamps) if len(sentence.strip()) > 10]

    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)

        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)
            
            # Input for custom filename
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
            
            # Download button
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)

st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")