pentarosarium committed
Commit ab25eb3 · 1 Parent(s): eeb3ec0

merge remote changes2

Files changed (2):
  1. app.py +112 -77
  2. requirements.txt +4 -2
app.py CHANGED

@@ -1,8 +1,6 @@
-# app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
-from huggingface_hub import HfApi, InferenceClient
 from transformers import pipeline
 from datetime import datetime
 import io
@@ -10,84 +8,97 @@ import base64
 from typing import Dict, List, Set, Tuple
 from rapidfuzz import fuzz, process
 from collections import defaultdict
-from tqdm.auto import tqdm
-
-# Initialize HuggingFace client with token
-@st.cache_resource
-def get_hf_client():
-    token = st.secrets["hf_token"]
-    return InferenceClient(token=token)
-
-@st.cache_resource
-def get_embeddings_pipeline():
-    return pipeline("feature-extraction",
-                    model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-                    token=st.secrets["hf_token"])
+from tqdm import tqdm
+import ru_core_news_sm
 
 class NewsProcessor:
     def __init__(self, similarity_threshold=0.75, time_threshold=24):
-        self.client = get_hf_client()
-        self.embeddings_pipeline = get_embeddings_pipeline()
+        self.nlp = ru_core_news_sm.load()
+        self.embeddings = pipeline("feature-extraction",
+                                   model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+                                   token=st.secrets["hf_token"])
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
 
     def encode_text(self, text):
-        embeddings = self.embeddings_pipeline(text)
-        return np.mean(embeddings[0], axis=0)
+        return np.mean(self.embeddings(text)[0], axis=0)
+
+    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:
+        text_lower = text.lower()
+        for company in companies:
+            company_lower = company.lower()
+            if company_lower in text_lower.split('.')[0]:
+                return True, company
+            if text_lower.count(company_lower) >= 3:
+                return True, company
+            doc = self.nlp(text)
+            for sent in doc.sents:
+                if company_lower in sent.text.lower():
+                    for token in sent:
+                        if token.dep_ == 'nsubj' and company_lower in token.text.lower():
+                            return True, company
+        return False, ""
 
-    def process_news(self, df: pd.DataFrame, progress_bar=None) -> pd.DataFrame:
+    def process_news(self, df: pd.DataFrame, progress_bar=None):
         df['company_list'] = df['company'].str.split(' | ')
         df = df.sort_values('datetime')
         clusters = []
         processed = set()
-        total_items = len(df)
 
-        for i, row1 in df.iterrows():
+        for i, row1 in tqdm(df.iterrows(), total=len(df)):
            if i in processed:
                continue
 
            cluster = [i]
            processed.add(i)
            text1_embedding = self.encode_text(row1['text'])
 
            if progress_bar:
-                progress_bar.progress(len(processed) / total_items)
+                progress_bar.progress(len(processed) / len(df))
 
            for j, row2 in df.iterrows():
                if j in processed:
                    continue
 
-                time_diff = abs(pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime']))
-                if time_diff.total_seconds() / 3600 > self.time_threshold:
+                time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
                    continue
 
                text2_embedding = self.encode_text(row2['text'])
                similarity = np.dot(text1_embedding, text2_embedding)
 
-                if similarity >= self.similarity_threshold:
-                    companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
-                    if companies_overlap:
-                        cluster.append(j)
-                        processed.add(j)
+                is_main1, main_company1 = self.is_company_main_subject(row1['text'], row1['company_list'])
+                is_main2, main_company2 = self.is_company_main_subject(row2['text'], row2['company_list'])
+
+                companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
+
+                if similarity >= self.similarity_threshold and companies_overlap:
+                    cluster.append(j)
+                    processed.add(j)
 
            clusters.append(cluster)
 
-        return self._create_result_df(df, clusters)
-
-    def _create_result_df(self, df: pd.DataFrame, clusters: List[List[int]]) -> pd.DataFrame:
        result_data = []
        for cluster_id, cluster in enumerate(clusters, 1):
            cluster_texts = df.iloc[cluster]
+            main_companies = []
+            for _, row in cluster_texts.iterrows():
+                is_main, company = self.is_company_main_subject(row['text'], row['company_list'])
+                if is_main and company:
+                    main_companies.append(company)
+
+            main_company = main_companies[0] if main_companies else "Multiple/Unclear"
+
            for idx in cluster:
                result_data.append({
                    'cluster_id': cluster_id,
                    'datetime': df.iloc[idx]['datetime'],
                    'company': ' | '.join(df.iloc[idx]['company_list']),
+                    'main_company': main_company,
                    'text': df.iloc[idx]['text'],
                    'cluster_size': len(cluster)
                })
 
        return pd.DataFrame(result_data)
 
 class NewsDeduplicator:
@@ -99,7 +110,7 @@ class NewsDeduplicator:
         text_to_companies: Dict[str, Set[str]] = defaultdict(set)
         indices_to_keep: Set[int] = set()
 
-        for idx, row in df.iterrows():
+        for idx, row in tqdm(df.iterrows(), total=len(df)):
             text = str(row['text'])
             company = str(row['company'])
 
@@ -113,73 +124,97 @@ class NewsDeduplicator:
                 match = result[0] if result else None
             else:
                 match = None
 
             if match:
                 text_to_companies[match].add(company)
             else:
                 seen_texts.append(text)
                 text_to_companies[text].add(company)
                 indices_to_keep.add(idx)
 
             if progress_bar:
                 progress_bar.progress((idx + 1) / len(df))
 
         dedup_df = df.iloc[list(indices_to_keep)].copy()
 
         for idx in indices_to_keep:
             text = str(df.iloc[idx]['text'])
             companies = sorted(text_to_companies[text])
             dedup_df.at[idx, 'company'] = ' | '.join(companies)
+            dedup_df.at[idx, 'company_count'] = len(companies)
+            dedup_df.at[idx, 'duplicate_count'] = len(text_to_companies[text])
 
         return dedup_df.sort_values('datetime')
 
 def create_download_link(df: pd.DataFrame, filename: str) -> str:
     excel_buffer = io.BytesIO()
-    df.to_excel(excel_buffer, index=False)
+    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
+        df.to_excel(writer, index=False)
     excel_buffer.seek(0)
     b64 = base64.b64encode(excel_buffer.read()).decode()
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 
 def main():
     st.title("News Clustering App")
-
     st.write("Upload Excel file with columns: company, datetime, text")
 
-    uploaded_file = st.file_uploader("Choose file", type=['xlsx'])
+    uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
     if uploaded_file:
-        df = pd.read_excel(uploaded_file)
-        st.dataframe(df.head())
-
-        col1, col2 = st.columns(2)
-
-        with col1:
-            fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
-
-        with col2:
-            similarity_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.75)
-            time_threshold = st.slider("Time Threshold (hours)", 1, 72, 24)
-
-        if st.button("Process"):
-            try:
-                progress_bar = st.progress(0)
-
-                deduplicator = NewsDeduplicator(fuzzy_threshold)
-                dedup_df = deduplicator.deduplicate(df, progress_bar)
-                st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
-
-                processor = NewsProcessor(similarity_threshold, time_threshold)
-                result_df = processor.process_news(dedup_df, progress_bar)
-                st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
-
-                st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
-
-                st.dataframe(result_df)
-
-            except Exception as e:
-                st.error(f"Error: {str(e)}")
-            finally:
-                progress_bar.empty()
+        try:
+            df = pd.read_excel(uploaded_file, sheet_name='Публикации', usecols=[0,3,6])
+            df.columns = ['company', 'datetime', 'text']
+            st.success(f'Loaded {len(df)} records')
+            st.dataframe(df.head())
+
+            col1, col2 = st.columns(2)
+            with col1:
+                fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
+            with col2:
+                similarity_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.75)
+                time_threshold = st.slider("Time Threshold (hours)", 1, 72, 24)
+
+            if st.button("Process News"):
+                try:
+                    progress_bar = st.progress(0)
+
+                    deduplicator = NewsDeduplicator(fuzzy_threshold)
+                    dedup_df = deduplicator.deduplicate(df, progress_bar)
+                    st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
+
+                    st.write("Sample of deduplicated data:")
+                    st.dataframe(dedup_df[['datetime', 'company', 'text', 'company_count', 'duplicate_count']].head())
+
+                    processor = NewsProcessor(similarity_threshold, time_threshold)
+                    result_df = processor.process_news(dedup_df, progress_bar)
+                    st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
+
+                    st.subheader("Download Results")
+                    st.markdown(create_download_link(dedup_df, "deduplicated_news.xlsx"), unsafe_allow_html=True)
+                    st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
+
+                    st.subheader("Cluster Statistics")
+                    cluster_stats = result_df.groupby('cluster_id').agg({
+                        'cluster_size': 'first',
+                        'main_company': 'first',
+                        'company': lambda x: len(set(c for companies in x for c in companies.split(' | ')))
+                    }).rename(columns={'company': 'unique_companies'})
+                    st.dataframe(cluster_stats)
+
+                    st.subheader("Largest Clusters")
+                    largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
+                        ['cluster_size', 'cluster_id', 'datetime'],
+                        ascending=[False, True, True]
+                    )
+                    st.dataframe(largest_clusters)
+
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+                finally:
+                    progress_bar.empty()
+
+        except Exception as e:
+            st.error(f"Error reading file: {str(e)}")
 
 if __name__ == "__main__":
     main()
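
A note on the clustering threshold in process_news: encode_text() mean-pools the raw hidden states returned by the feature-extraction pipeline without normalizing them, so the np.dot score compared against similarity_threshold is not guaranteed to fall in the 0.5–1.0 range the Streamlit slider implies. If cosine similarity is what the threshold is meant to express, a normalization step along these lines would keep it comparable. This is only a sketch; cosine_similarity is a hypothetical helper, not part of this commit:

import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    # Scale the dot product by both vector norms so the score always lands in [-1, 1].
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom) if denom else 0.0

# Example with vectors of the same shape encode_text() produces
# (mpnet-base models emit 768-dimensional embeddings).
vec_a = np.random.rand(768)
vec_b = np.random.rand(768)
print(cosine_similarity(vec_a, vec_b))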
requirements.txt CHANGED

@@ -3,5 +3,7 @@ pandas
 numpy
 transformers
 rapidfuzz
-huggingface-hub
-openpyxl
+openpyxl
+tqdm
+ru-core-news-sm
+spacy
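
A packaging note on the new dependencies: ru_core_news_sm is a spaCy model package rather than a regular library, so listing it as a plain requirements.txt entry may not resolve from PyPI in a fresh environment. One common fallback is a guarded load that downloads the model on first use; the snippet below is only a sketch of that idea, not part of the commit (the Russian sentence is just sample input):

import spacy

# Try the installed model first; if it is missing, fetch it with spaCy's
# own downloader and load it again.
try:
    nlp = spacy.load("ru_core_news_sm")
except OSError:
    from spacy.cli import download
    download("ru_core_news_sm")
    nlp = spacy.load("ru_core_news_sm")

doc = nlp("Газпром сообщил о росте добычи.")  # example headline
print([(token.text, token.dep_) for token in doc])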