Commit ab25eb3 · merge remote changes2
Parent(s): eeb3ec0

Files changed:
- app.py (+112, -77)
- requirements.txt (+4, -2)
app.py
CHANGED
@@ -1,8 +1,6 @@
-# app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
-from huggingface_hub import HfApi, InferenceClient
 from transformers import pipeline
 from datetime import datetime
 import io
@@ -10,84 +8,97 @@ import base64
 from typing import Dict, List, Set, Tuple
 from rapidfuzz import fuzz, process
 from collections import defaultdict
-from tqdm
-
-# Initialize HuggingFace client with token
-@st.cache_resource
-def get_hf_client():
-    token = st.secrets["hf_token"]
-    return InferenceClient(token=token)
-
-@st.cache_resource
-def get_embeddings_pipeline():
-    return pipeline("feature-extraction",
-                    model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-                    token=st.secrets["hf_token"])
+from tqdm import tqdm
+import ru_core_news_sm

 class NewsProcessor:
     def __init__(self, similarity_threshold=0.75, time_threshold=24):
-        self.
-        self.
+        self.nlp = ru_core_news_sm.load()
+        self.embeddings = pipeline("feature-extraction",
+                                   model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+                                   token=st.secrets["hf_token"])
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
-
+
     def encode_text(self, text):
-
-
+        return np.mean(self.embeddings(text)[0], axis=0)
+
+    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:
+        text_lower = text.lower()
+        for company in companies:
+            company_lower = company.lower()
+            if company_lower in text_lower.split('.')[0]:
+                return True, company
+            if text_lower.count(company_lower) >= 3:
+                return True, company
+            doc = self.nlp(text)
+            for sent in doc.sents:
+                if company_lower in sent.text.lower():
+                    for token in sent:
+                        if token.dep_ == 'nsubj' and company_lower in token.text.lower():
+                            return True, company
+        return False, ""

-    def process_news(self, df: pd.DataFrame, progress_bar=None)
+    def process_news(self, df: pd.DataFrame, progress_bar=None):
         df['company_list'] = df['company'].str.split(' | ')
         df = df.sort_values('datetime')
-
         clusters = []
         processed = set()
-        total_items = len(df)

-        for i, row1 in df.iterrows():
+        for i, row1 in tqdm(df.iterrows(), total=len(df)):
             if i in processed:
                 continue
-
+
             cluster = [i]
             processed.add(i)
             text1_embedding = self.encode_text(row1['text'])

             if progress_bar:
-                progress_bar.progress(len(processed) /
+                progress_bar.progress(len(processed) / len(df))

             for j, row2 in df.iterrows():
                 if j in processed:
                     continue

-                time_diff =
-                if time_diff.total_seconds() / 3600 > self.time_threshold:
+                time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
                     continue
-
+
                 text2_embedding = self.encode_text(row2['text'])
                 similarity = np.dot(text1_embedding, text2_embedding)

- [6 deleted lines not captured in this diff view]
+                is_main1, main_company1 = self.is_company_main_subject(row1['text'], row1['company_list'])
+                is_main2, main_company2 = self.is_company_main_subject(row2['text'], row2['company_list'])
+
+                companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
+
+                if similarity >= self.similarity_threshold and companies_overlap:
+                    cluster.append(j)
+                    processed.add(j)
+
             clusters.append(cluster)

-        return self._create_result_df(df, clusters)
-
-    def _create_result_df(self, df: pd.DataFrame, clusters: List[List[int]]) -> pd.DataFrame:
         result_data = []
         for cluster_id, cluster in enumerate(clusters, 1):
             cluster_texts = df.iloc[cluster]
+            main_companies = []
+            for _, row in cluster_texts.iterrows():
+                is_main, company = self.is_company_main_subject(row['text'], row['company_list'])
+                if is_main and company:
+                    main_companies.append(company)
+
+            main_company = main_companies[0] if main_companies else "Multiple/Unclear"
+
             for idx in cluster:
                 result_data.append({
                     'cluster_id': cluster_id,
                     'datetime': df.iloc[idx]['datetime'],
                     'company': ' | '.join(df.iloc[idx]['company_list']),
+                    'main_company': main_company,
                     'text': df.iloc[idx]['text'],
                     'cluster_size': len(cluster)
                 })
-
+
         return pd.DataFrame(result_data)

 class NewsDeduplicator:
@@ -99,7 +110,7 @@ class NewsDeduplicator:
         text_to_companies: Dict[str, Set[str]] = defaultdict(set)
         indices_to_keep: Set[int] = set()

-        for idx, row in df.iterrows():
+        for idx, row in tqdm(df.iterrows(), total=len(df)):
             text = str(row['text'])
             company = str(row['company'])

@@ -113,73 +124,97 @@ class NewsDeduplicator:
                 match = result[0] if result else None
             else:
                 match = None
-
+
             if match:
                 text_to_companies[match].add(company)
             else:
                 seen_texts.append(text)
                 text_to_companies[text].add(company)
                 indices_to_keep.add(idx)
-
+
             if progress_bar:
                 progress_bar.progress((idx + 1) / len(df))
-
+
         dedup_df = df.iloc[list(indices_to_keep)].copy()

         for idx in indices_to_keep:
             text = str(df.iloc[idx]['text'])
             companies = sorted(text_to_companies[text])
             dedup_df.at[idx, 'company'] = ' | '.join(companies)
+            dedup_df.at[idx, 'company_count'] = len(companies)
+            dedup_df.at[idx, 'duplicate_count'] = len(text_to_companies[text])

         return dedup_df.sort_values('datetime')

 def create_download_link(df: pd.DataFrame, filename: str) -> str:
     excel_buffer = io.BytesIO()
-
+    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
+        df.to_excel(writer, index=False)
     excel_buffer.seek(0)
     b64 = base64.b64encode(excel_buffer.read()).decode()
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'

 def main():
     st.title("News Clustering App")
-
     st.write("Upload Excel file with columns: company, datetime, text")

-    uploaded_file = st.file_uploader("Choose file", type=['xlsx'])
+    uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])

     if uploaded_file:
- [5 deleted lines not captured in this diff view]
-        with col1:
-            fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
+        try:
+            df = pd.read_excel(uploaded_file, sheet_name='Публикации', usecols=[0,3,6])
+            df.columns = ['company', 'datetime', 'text']
+            st.success(f'Loaded {len(df)} records')
+            st.dataframe(df.head())

- [24 deleted lines not captured in this diff view]
+            col1, col2 = st.columns(2)
+            with col1:
+                fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)
+            with col2:
+                similarity_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.75)
+                time_threshold = st.slider("Time Threshold (hours)", 1, 72, 24)
+
+            if st.button("Process News"):
+                try:
+                    progress_bar = st.progress(0)
+
+                    deduplicator = NewsDeduplicator(fuzzy_threshold)
+                    dedup_df = deduplicator.deduplicate(df, progress_bar)
+                    st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
+
+                    st.write("Sample of deduplicated data:")
+                    st.dataframe(dedup_df[['datetime', 'company', 'text', 'company_count', 'duplicate_count']].head())
+
+                    processor = NewsProcessor(similarity_threshold, time_threshold)
+                    result_df = processor.process_news(dedup_df, progress_bar)
+                    st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
+
+                    st.subheader("Download Results")
+                    st.markdown(create_download_link(dedup_df, "deduplicated_news.xlsx"), unsafe_allow_html=True)
+                    st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
+
+                    st.subheader("Cluster Statistics")
+                    cluster_stats = result_df.groupby('cluster_id').agg({
+                        'cluster_size': 'first',
+                        'main_company': 'first',
+                        'company': lambda x: len(set(c for companies in x for c in companies.split(' | ')))
+                    }).rename(columns={'company': 'unique_companies'})
+                    st.dataframe(cluster_stats)
+
+                    st.subheader("Largest Clusters")
+                    largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
+                        ['cluster_size', 'cluster_id', 'datetime'],
+                        ascending=[False, True, True]
+                    )
+                    st.dataframe(largest_clusters)
+
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+                finally:
+                    progress_bar.empty()
+
+        except Exception as e:
+            st.error(f"Error reading file: {str(e)}")

 if __name__ == "__main__":
     main()
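For orientation, here is a minimal sketch of how the deduplication and clustering steps from the new app.py fit together, mirroring the calls in main(). It is an illustration, not part of the commit: the sample rows are invented, app.py must be importable, st.secrets["hf_token"] must resolve, the ru_core_news_sm model must be installed, and deduplicate() is assumed to accept None in place of a Streamlit progress bar.

# sketch_usage.py (hypothetical driver, not part of the commit)
import pandas as pd
from app import NewsDeduplicator, NewsProcessor

# invented sample with the three expected columns: company, datetime, text
df = pd.DataFrame({
    'company': ['Газпром', 'Газпром', 'Сбербанк'],
    'datetime': ['2024-01-01 10:00', '2024-01-01 11:30', '2024-01-02 09:00'],
    'text': ['Газпром объявил о росте поставок газа.',
             'Газпром объявил о росте поставок газа в Европу.',
             'Сбербанк отчитался о рекордной прибыли.'],
})

# fuzzy threshold 50 matches the app's slider default; reset the index so the
# positional iloc lookups inside process_news line up with the row labels
dedup_df = NewsDeduplicator(50).deduplicate(df, None).reset_index(drop=True)

# similarity 0.75 and a 24-hour window are NewsProcessor's defaults
result_df = NewsProcessor(0.75, 24).process_news(dedup_df)
print(result_df[['cluster_id', 'main_company', 'cluster_size']])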
requirements.txt
CHANGED
@@ -3,5 +3,7 @@ pandas
 numpy
 transformers
 rapidfuzz
-
-
+openpyxl
+tqdm
+ru-core-news-sm
+spacy
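A note on the new dependencies: openpyxl, tqdm, and spacy install from PyPI as listed, but ru-core-news-sm is a spaCy model package and is normally not resolvable from PyPI under that name. One common workaround on a Space (an assumption about this setup, not something shown in the commit; the model version should match the installed spaCy release) is to point requirements.txt at the model's release wheel:

# hypothetical requirements.txt variant: install the Russian spaCy model from its release wheel
spacy
https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl

Alternatively, keep the file as committed and run python -m spacy download ru_core_news_sm in a setup step before the app starts, so that import ru_core_news_sm succeeds.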