# Import necessary libraries
import streamlit as st
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺 Transcript 🔍 EDA 🛠️ NLTK",
    page_icon="📺",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)
st.markdown('''
- 🔍 **Exploratory Data Analysis (EDA)**: Dive deep into the sea of data with our EDA feature, unveiling hidden patterns and insights in your transcripts. Transform raw data into a treasure trove of information.
- 🛠️ **Natural Language Toolkit (NLTK)**: Harness the power of NLTK to process and understand human language. From tokenization to sentiment analysis, our toolkit is your compass in the vast landscape of natural language processing (NLP).
- 📺 **Transcript Analysis**: Elevate your text analysis with advanced transcript analysis tools. Whether it's speech recognition or thematic extraction, turn your audiovisual content into actionable insights.
''')
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
def remove_timestamps(text):
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
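# A minimal usage sketch (hypothetical sample text, not part of the app): the regex
# drops an "MM:SS" timestamp line together with the single line that follows it.
#   >>> remove_timestamps("0:01\nSpeaker 1\nHello there.\n")
#   'Hello there.\n'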
def extract_high_information_words(text, top_n=10):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
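# Usage sketch (hypothetical text): keeps lowercase alphabetic tokens, drops English
# stopwords, and returns the top_n most frequent remaining words.
#   >>> extract_high_information_words("Cats purr. Cats nap. Cats play. Dogs bark. Dogs run.", top_n=2)
#   ['cats', 'dogs']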
def create_relationship_graph(words):
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph
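# Sketch of the resulting structure for ["data", "insight", "graph"] (hypothetical input):
# a simple chain data -> insight -> graph, where node ids are the list indices and the
# two edges are labeled "1" and "2".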
def display_relationship_graph(words):
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)
def extract_context_words(text, high_information_words):
    words = nltk.word_tokenize(text)
    context_words = []
    for index, word in enumerate(words):
        if word.lower() in high_information_words:
            before_word = words[index - 1] if index > 0 else None
            after_word = words[index + 1] if index < len(words) - 1 else None
            context_words.append((before_word, word, after_word))
    return context_words
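# Usage sketch (hypothetical text): each match yields a (before, word, after) tuple,
# with None at a text boundary.
#   >>> extract_context_words("Transcripts reveal patterns", ["patterns"])
#   [('reveal', 'patterns', None)]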
def create_context_graph(context_words):
    graph = Digraph()
    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
        if before_word:
            graph.node(f'before{index}', before_word, shape='box')
        graph.node(f'high{index}', high_info_word, shape='ellipse')
        if after_word:
            graph.node(f'after{index}', after_word, shape='diamond')
        if before_word:
            graph.edge(f'before{index}', f'high{index}')
        if after_word:
            graph.edge(f'high{index}', f'after{index}')
    return graph
def display_context_graph(context_words):
    graph = create_context_graph(context_words)
    st.graphviz_chart(graph)
def display_context_table(context_words):
    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
    for before, high, after in context_words:
        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
    st.markdown(table)
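# Example of the markdown built for [('reveal', 'patterns', None)] (hypothetical input):
#   | Before | High Info Word | After |
#   |--------|----------------|-------|
#   | reveal | patterns |  |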
def load_example_files():
    # Exclude specific files
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
    # List all .txt files excluding the ones in excluded_files
    example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
    # Check if there are any files to select from
    if example_files:
        selected_file = st.selectbox("📄 Select an example file:", example_files)
        if st.button(f"📂 Load {selected_file}"):
            with open(selected_file, 'r', encoding="utf-8") as file:
                return file.read()
    else:
        st.write("No suitable example files found.")
    return None
def cluster_sentences(sentences, num_clusters):
    # Filter sentences with length over 10 characters
    sentences = [sentence for sentence in sentences if len(sentence) > 10]
    # Check if the number of sentences is less than the desired number of clusters
    if len(sentences) < num_clusters:
        # If so, adjust the number of clusters to match the number of sentences
        num_clusters = len(sentences)
    # Vectorize the sentences
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    # Perform k-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    # Get the cluster labels for each sentence
    labels = kmeans.labels_
    # Group sentences by cluster
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(labels):
        clustered_sentences[label].append((i, sentences[i]))
    return clustered_sentences
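# Usage sketch (hypothetical sentences): TF-IDF vectors are grouped with k-means, and
# each cluster holds (index, sentence) pairs so the original ordering can be reported.
#   >>> clusters = cluster_sentences(["Cats purr softly at night.",
#   ...                               "Dogs bark at strangers.",
#   ...                               "Cats nap in the sun all day."], num_clusters=2)
#   >>> len(clusters)
#   2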
# Main code for UI
uploaded_file = st.file_uploader("📄 Choose a .txt file", type=['txt'])
example_text = load_example_files()
if example_text:
    file_text = example_text
elif uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""
if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
    top_words = extract_high_information_words(text_without_timestamps, 10)
    with st.expander("📊 Top 10 High Information Words"):
        st.write(top_words)
    with st.expander("🔗 Relationship Graph"):
        display_relationship_graph(top_words)
    context_words = extract_context_words(text_without_timestamps, top_words)
    with st.expander("🔗 Context Graph"):
        display_context_graph(context_words)
    with st.expander("📋 Context Table"):
        display_context_table(context_words)
    # with st.expander("Innovation Outlines"):
    #     showInnovationOutlines()
    with st.expander("📚 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)
        output_text = ""
        for i, cluster in enumerate(clustered_sentences):
            output_text += f"## 📌 Cluster {i+1}:\n"
            for original_index, sentence in cluster:
                output_text += f"- Original Line {original_index+1}: {sentence}\n"
            output_text += "\n"
        st.markdown(output_text)
st.markdown("https://cdn-uploads.huggingface.co/production/uploads/620630b603825909dcbeba35/Id9kntHFHZf_oFFrEmGh5.png")