awacke1 committed
Commit d44e2e0 · 1 Parent(s): e206bb2

Create backup.app.py

Files changed (1)
  1. backup.app.py +69 -0
backup.app.py ADDED
@@ -0,0 +1,69 @@
+import streamlit as st
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk import FreqDist
+from graphviz import Digraph
+
+nltk.download('punkt')
+nltk.download('stopwords')
+
+def remove_timestamps(text):
+    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)  # Updated regex pattern
+
+def process_text(text):
+    lines = text.split("\n")
+    processed_lines = []
+
+    for line in lines:
+        if line:
+            processed_lines.append(line)
+
+    outline = ""
+    for i, line in enumerate(processed_lines):
+        if i % 2 == 0:
+            outline += f"**{line}**\n"
+        else:
+            outline += f"- {line} 😄\n"
+
+    return outline
+
+def extract_high_information_words(text, top_n=10):
+    words = nltk.word_tokenize(text)
+    words = [word.lower() for word in words if word.isalpha()]
+
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word not in stop_words]
+
+    freq_dist = FreqDist(filtered_words)
+    high_information_words = [word for word, _ in freq_dist.most_common(top_n)]
+
+    return high_information_words
+
+def create_relationship_graph(words):
+    graph = Digraph()
+
+    for index, word in enumerate(words):
+        graph.node(str(index), word)
+
+        if index > 0:
+            graph.edge(str(index - 1), str(index), label=str(index))
+
+    return graph
+
+def display_relationship_graph(words):
+    graph = create_relationship_graph(words)
+    st.graphviz_chart(graph)
+
+uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])
+
+if uploaded_file:
+    file_text = uploaded_file.read().decode("utf-8")
+    text_without_timestamps = remove_timestamps(file_text)
+
+    top_words = extract_high_information_words(text_without_timestamps, 10)
+    st.markdown("**Top 10 High Information Words:**")
+    st.write(top_words)
+
+    st.markdown("**Relationship Graph:**")
+    display_relationship_graph(top_words)
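
For a quick sanity check of what the new file does, below is a minimal sketch of the same cleaning and keyword-extraction steps outside Streamlit. The sample transcript string and the printed results are illustrative assumptions, not part of the commit; the NLTK downloads mirror the ones in backup.app.py. The app itself is launched with streamlit run backup.app.py.

import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

nltk.download('punkt')
nltk.download('stopwords')

# Made-up transcript snippet: an "M:SS" timestamp line, a speaker-name line, then the spoken line.
sample = "0:01\nAnna\nHello and welcome to the show\n0:05\nBen\nThanks for having me\n"

# Same pattern as remove_timestamps(): each match consumes a timestamp line plus the line right after it.
cleaned = re.sub(r'\d{1,2}:\d{2}\n.*\n', '', sample)
print(repr(cleaned))  # 'Hello and welcome to the show\nThanks for having me\n'

# Same steps as extract_high_information_words(): tokenize, lowercase,
# keep alphabetic tokens, drop English stopwords, rank by frequency.
tokens = [w.lower() for w in nltk.word_tokenize(cleaned) if w.isalpha()]
tokens = [w for w in tokens if w not in set(stopwords.words('english'))]
print(FreqDist(tokens).most_common(5))  # e.g. [('hello', 1), ('welcome', 1), ('show', 1), ('thanks', 1)]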