Commit ab25eb3 · merge remote changes2
Parent(s): eeb3ec0

Files changed:
- app.py (+112, -77)
- requirements.txt (+4, -2)
app.py
CHANGED
@@ -1,8 +1,6 @@
-# app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
-from huggingface_hub import HfApi, InferenceClient
 from transformers import pipeline
 from datetime import datetime
 import io
@@ -10,84 +8,97 @@ import base64
 from typing import Dict, List, Set, Tuple
 from rapidfuzz import fuzz, process
 from collections import defaultdict
-from tqdm
-
-# Initialize HuggingFace client with token
-@st.cache_resource
-def get_hf_client():
-    token = st.secrets["hf_token"]
-    return InferenceClient(token=token)
-
-@st.cache_resource
-def get_embeddings_pipeline():
-    return pipeline("feature-extraction",
-                    model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-                    token=st.secrets["hf_token"])
+from tqdm import tqdm
+import ru_core_news_sm

 class NewsProcessor:
     def __init__(self, similarity_threshold=0.75, time_threshold=24):
-        self.
-        self.
+        self.nlp = ru_core_news_sm.load()
+        self.embeddings = pipeline("feature-extraction",
+                                   model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+                                   token=st.secrets["hf_token"])
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
-
+
     def encode_text(self, text):
-
-
+        return np.mean(self.embeddings(text)[0], axis=0)
+
+    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:
+        text_lower = text.lower()
+        for company in companies:
+            company_lower = company.lower()
+            if company_lower in text_lower.split('.')[0]:
+                return True, company
+            if text_lower.count(company_lower) >= 3:
+                return True, company
+            doc = self.nlp(text)
+            for sent in doc.sents:
+                if company_lower in sent.text.lower():
+                    for token in sent:
+                        if token.dep_ == 'nsubj' and company_lower in token.text.lower():
+                            return True, company
+        return False, ""

-    def process_news(self, df: pd.DataFrame, progress_bar=None)
+    def process_news(self, df: pd.DataFrame, progress_bar=None):
         df['company_list'] = df['company'].str.split(' | ')
         df = df.sort_values('datetime')
-
         clusters = []
         processed = set()
-        total_items = len(df)

-        for i, row1 in df.iterrows():
+        for i, row1 in tqdm(df.iterrows(), total=len(df)):
             if i in processed:
                 continue
-
+
             cluster = [i]
             processed.add(i)
             text1_embedding = self.encode_text(row1['text'])

             if progress_bar:
-                progress_bar.progress(len(processed) /
+                progress_bar.progress(len(processed) / len(df))

             for j, row2 in df.iterrows():
                 if j in processed:
                     continue

-                time_diff =
-                if time_diff.total_seconds() / 3600 > self.time_threshold:
+                time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
                     continue
-
+
                 text2_embedding = self.encode_text(row2['text'])
                 similarity = np.dot(text1_embedding, text2_embedding)

- [6 deleted lines not captured in this diff view]
+                is_main1, main_company1 = self.is_company_main_subject(row1['text'], row1['company_list'])
+                is_main2, main_company2 = self.is_company_main_subject(row2['text'], row2['company_list'])
+
+                companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
+
+                if similarity >= self.similarity_threshold and companies_overlap:
+                    cluster.append(j)
+                    processed.add(j)
+
             clusters.append(cluster)

-        return self._create_result_df(df, clusters)
-
-    def _create_result_df(self, df: pd.DataFrame, clusters: List[List[int]]) -> pd.DataFrame:
         result_data = []
         for cluster_id, cluster in enumerate(clusters, 1):
             cluster_texts = df.iloc[cluster]
+            main_companies = []
+            for _, row in cluster_texts.iterrows():
+                is_main, company = self.is_company_main_subject(row['text'], row['company_list'])
+                if is_main and company:
+                    main_companies.append(company)
+
+            main_company = main_companies[0] if main_companies else "Multiple/Unclear"
+
             for idx in cluster:
                 result_data.append({
                     'cluster_id': cluster_id,
                     'datetime': df.iloc[idx]['datetime'],
                     'company': ' | '.join(df.iloc[idx]['company_list']),
+                    'main_company': main_company,
                     'text': df.iloc[idx]['text'],
                     'cluster_size': len(cluster)
                 })
-
+
         return pd.DataFrame(result_data)

 class NewsDeduplicator:
@@ -99,7 +110,7 @@ class NewsDeduplicator:
         text_to_companies: Dict[str, Set[str]] = defaultdict(set)
         indices_to_keep: Set[int] = set()

-        for idx, row in df.iterrows():
+        for idx, row in tqdm(df.iterrows(), total=len(df)):
             text = str(row['text'])
             company = str(row['company'])

@@ -113,73 +124,97 @@ class NewsDeduplicator:
                 match = result[0] if result else None
             else:
                 match = None
-
+
             if match:
                 text_to_companies[match].add(company)
             else:
                 seen_texts.append(text)
                 text_to_companies[text].add(company)
                 indices_to_keep.add(idx)
-
+
             if progress_bar:
                 progress_bar.progress((idx + 1) / len(df))
-
+
         dedup_df = df.iloc[list(indices_to_keep)].copy()

         for idx in indices_to_keep:
             text = str(df.iloc[idx]['text'])
             companies = sorted(text_to_companies[text])
             dedup_df.at[idx, 'company'] = ' | '.join(companies)
+            dedup_df.at[idx, 'company_count'] = len(companies)
+            dedup_df.at[idx, 'duplicate_count'] = len(text_to_companies[text])

         return dedup_df.sort_values('datetime')

 def create_download_link(df: pd.DataFrame, filename: str) -> str:
     excel_buffer = io.BytesIO()
-
+    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
+        df.to_excel(writer, index=False)
     excel_buffer.seek(0)
     b64 = base64.b64encode(excel_buffer.read()).decode()
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'

 def main():
     st.title("News Clustering App")
-
     st.write("Upload Excel file with columns: company, datetime, text")

-    uploaded_file = st.file_uploader("Choose file", type=['xlsx'])
+    uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])

     if uploaded_file:
- [5 deleted lines not captured in this diff view]
-        with col1:
-            fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
+        try:
+            df = pd.read_excel(uploaded_file, sheet_name='Публикации', usecols=[0,3,6])
+            df.columns = ['company', 'datetime', 'text']
+            st.success(f'Loaded {len(df)} records')
+            st.dataframe(df.head())

- [24 deleted lines not captured in this diff view]
+            col1, col2 = st.columns(2)
+            with col1:
+                fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
+            with col2:
+                similarity_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.75)
+                time_threshold = st.slider("Time Threshold (hours)", 1, 72, 24)
+
+            if st.button("Process News"):
+                try:
+                    progress_bar = st.progress(0)
+
+                    deduplicator = NewsDeduplicator(fuzzy_threshold)
+                    dedup_df = deduplicator.deduplicate(df, progress_bar)
+                    st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
+
+                    st.write("Sample of deduplicated data:")
+                    st.dataframe(dedup_df[['datetime', 'company', 'text', 'company_count', 'duplicate_count']].head())
+
+                    processor = NewsProcessor(similarity_threshold, time_threshold)
+                    result_df = processor.process_news(dedup_df, progress_bar)
+                    st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
+
+                    st.subheader("Download Results")
+                    st.markdown(create_download_link(dedup_df, "deduplicated_news.xlsx"), unsafe_allow_html=True)
+                    st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
+
+                    st.subheader("Cluster Statistics")
+                    cluster_stats = result_df.groupby('cluster_id').agg({
+                        'cluster_size': 'first',
+                        'main_company': 'first',
+                        'company': lambda x: len(set(c for companies in x for c in companies.split(' | ')))
+                    }).rename(columns={'company': 'unique_companies'})
+                    st.dataframe(cluster_stats)
+
+                    st.subheader("Largest Clusters")
+                    largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
+                        ['cluster_size', 'cluster_id', 'datetime'],
+                        ascending=[False, True, True]
+                    )
+                    st.dataframe(largest_clusters)
+
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+                finally:
+                    progress_bar.empty()
+
+        except Exception as e:
+            st.error(f"Error reading file: {str(e)}")

 if __name__ == "__main__":
     main()
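For orientation, here is a minimal sketch of how the deduplication and clustering steps from the new app.py fit together, mirroring the calls in main(). It is an illustration, not part of the commit: the sample rows are invented, app.py must be importable, st.secrets["hf_token"] must resolve, the ru_core_news_sm model must be installed, and deduplicate() is assumed to accept None in place of a Streamlit progress bar.

# sketch_usage.py (hypothetical driver, not part of the commit)
import pandas as pd
from app import NewsDeduplicator, NewsProcessor

# invented sample with the three expected columns: company, datetime, text
df = pd.DataFrame({
    'company': ['Газпром', 'Газпром', 'Сбербанк'],
    'datetime': ['2024-01-01 10:00', '2024-01-01 11:30', '2024-01-02 09:00'],
    'text': ['Газпром объявил о росте поставок газа.',
             'Газпром объявил о росте поставок газа в Европу.',
             'Сбербанк отчитался о рекордной прибыли.'],
})

# fuzzy threshold 50 matches the app's slider default; reset the index so the
# positional iloc lookups inside process_news line up with the row labels
dedup_df = NewsDeduplicator(50).deduplicate(df, None).reset_index(drop=True)

# similarity 0.75 and a 24-hour window are NewsProcessor's defaults
result_df = NewsProcessor(0.75, 24).process_news(dedup_df)
print(result_df[['cluster_id', 'main_company', 'cluster_size']])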
requirements.txt
CHANGED
@@ -3,5 +3,7 @@ pandas
 numpy
 transformers
 rapidfuzz
-
-
+openpyxl
+tqdm
+ru-core-news-sm
+spacy
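A note on the new dependencies: openpyxl, tqdm, and spacy install from PyPI as listed, but ru-core-news-sm is a spaCy model package and is normally not resolvable from PyPI under that name. One common workaround on a Space (an assumption about this setup, not something shown in the commit; the model version should match the installed spaCy release) is to point requirements.txt at the model's release wheel:

# hypothetical requirements.txt variant: install the Russian spaCy model from its release wheel
spacy
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl

Alternatively, keep the file as committed and run python -m spacy download ru_core_news_sm in a setup step before the app starts, so that import ru_core_news_sm succeeds.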