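"""Streamlit app: exploratory data analysis (EDA) of text transcripts.

Loads a .txt transcript (uploaded or bundled), strips timestamps,
surfaces high-information words with NLTK, draws Graphviz relationship
and context graphs, and clusters sentences with TF-IDF + k-means.
"""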
# Import necessary libraries
import streamlit as st
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)

st.markdown('''
- 🔍 **Exploratory Data Analysis (EDA)** 📊: Dive deep into the sea of data with our EDA feature, unveiling hidden patterns 🕵️‍♂️ and insights 🧠 in your transcripts. Transform raw data into a treasure trove of information 🏆.
- 📜 **Natural Language Toolkit (NLTK)** 🛠️: Harness the power of NLTK to process and understand human language 🗣️. From tokenization to sentiment analysis, our toolkit is your compass 🧭 in the vast landscape of natural language processing (NLP).
- 📺 **Transcript Analysis** 📈: Elevate your text analysis with our advanced transcript analysis tools. Whether it's speech recognition 🎙️ or thematic extraction 🌐, turn your audiovisual content into actionable insights 🔑.
''')

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def remove_timestamps(text):
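    """Strip timestamp lines from a transcript.

    Note: the pattern removes each M:SS / MM:SS stamp line *and* the
    single line that follows it, so it assumes a stamp-per-line layout;
    adjust the regex if your transcripts are formatted differently.
    """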
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)

def extract_high_information_words(text, top_n=10):
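    """Return the top_n most frequent alphabetic, non-stopword tokens."""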
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]

def create_relationship_graph(words):
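    """Chain the given words into a Graphviz digraph, one edge per adjacent pair."""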
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph

def display_relationship_graph(words):
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)

def extract_context_words(text, high_information_words):
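    """Collect (before, word, after) token triples around each high-information word."""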
    words = nltk.word_tokenize(text)
    context_words = []
    for index, word in enumerate(words):
        if word.lower() in high_information_words:
            before_word = words[index - 1] if index > 0 else None
            after_word = words[index + 1] if index < len(words) - 1 else None
            context_words.append((before_word, word, after_word))
    return context_words

def create_context_graph(context_words):
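    """Render context triples as box -> ellipse -> diamond node chains."""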
    graph = Digraph()
    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
        if before_word:
            graph.node(f'before{index}', before_word, shape='box')
        graph.node(f'high{index}', high_info_word, shape='ellipse')
        if after_word:
            graph.node(f'after{index}', after_word, shape='diamond')
        if before_word:
            graph.edge(f'before{index}', f'high{index}')
        if after_word:
            graph.edge(f'high{index}', f'after{index}')
    return graph

def display_context_graph(context_words):
    graph = create_context_graph(context_words)
    st.graphviz_chart(graph)

def display_context_table(context_words):
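    """Render the context triples as a Markdown table."""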
    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
    for before, high, after in context_words:
        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
    st.markdown(table)

def load_example_files():
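    """List bundled .txt files (minus config files) and return the selected one's text, or None."""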
    # Exclude specific files
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
    
    # List all .txt files excluding the ones in excluded_files
    example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
    
    # Check if there are any files to select from
    if example_files:
        selected_file = st.selectbox("📄 Select an example file:", example_files)
        if st.button(f"📂 Load {selected_file}"):
            with open(selected_file, 'r', encoding="utf-8") as file:
                return file.read()
    else:
        st.write("No suitable example files found.")
    
    return None

def cluster_sentences(sentences, num_clusters):
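    """Cluster sentences longer than 10 characters via TF-IDF + k-means.

    Returns one list per cluster, each holding (index, sentence) pairs,
    where index is the sentence's position in the filtered input.
    """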
    # Filter sentences with length over 10 characters
    sentences = [sentence for sentence in sentences if len(sentence) > 10]

    # Guard against empty input, then cap clusters at the sentence count
    if not sentences:
        return []
    if len(sentences) < num_clusters:
        num_clusters = len(sentences)

    # Vectorize the sentences
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Perform k-means clustering; n_init is set explicitly because its
    # default changed across scikit-learn versions
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(X)

    # Get the cluster labels for each sentence
    labels = kmeans.labels_

    # Group sentences by cluster
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(labels):
        clustered_sentences[label].append((i, sentences[i]))

    return clustered_sentences
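# Illustrative usage (hypothetical sentences; exact grouping may vary by input):
#   clusters = cluster_sentences(["the cat sat on the mat",
#                                 "dogs chase the mail truck",
#                                 "a cat naps on a warm mat"], 2)
#   -> e.g. [[(0, "the cat sat..."), (2, "a cat naps...")],
#            [(1, "dogs chase...")]]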

# Main code for UI
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])

example_text = load_example_files()

if example_text:
    file_text = example_text
elif uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]

    top_words = extract_high_information_words(text_without_timestamps, 10)

    with st.expander("📊 Top 10 High Information Words"):
        st.write(top_words)

    with st.expander("📈 Relationship Graph"):
        display_relationship_graph(top_words)

    context_words = extract_context_words(text_without_timestamps, top_words)

    with st.expander("🔗 Context Graph"):
        display_context_graph(context_words)

    with st.expander("📑 Context Table"):
        display_context_table(context_words)


    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)

        output_text = ""
        for i, cluster in enumerate(clustered_sentences):
            output_text += f"## ๐ŸŒŸ Cluster {i+1}:\n"
            for original_index, sentence in cluster:
                output_text += f"- Original Line {original_index+1}: {sentence}\n"
            output_text += "\n"

        st.markdown(output_text)

st.markdown("https://cdn-uploads.huggingface.co/production/uploads/620630b603825909dcbeba35/Id9kntHFHZf_oFFrEmGh5.png")