Commit 6d4a64c · mend indexers
Parent(s): c8405c4
app.py
CHANGED
@@ -25,54 +25,28 @@ class NewsProcessor:
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold

-    def mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-    def encode_text(self, text):
-        # Convert text to string and handle NaN values
-        if pd.isna(text):
-            text = ""
-        else:
-            text = str(text)
-
-        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
-        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
-        return F.normalize(sentence_embeddings[0], p=2, dim=0).numpy()
-
-    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:
-        if pd.isna(text):
-            return False, ""
-
-        text_lower = str(text).lower()
-
-        for company in companies:
-            company_lower = str(company).lower()
-            if company_lower in text_lower.split('.')[0]:
-                return True, company
-            if text_lower.count(company_lower) >= 3:
-                return True, company
-            doc = self.nlp(text_lower)
-            for sent in doc.sents:
-                if company_lower in sent.text:
-                    for token in sent:
-                        if token.dep_ == 'nsubj' and company_lower in token.text:
-                            return True, company
-        return False, ""
-
     def process_news(self, df: pd.DataFrame, progress_bar=None):
-
+        # Ensure the DataFrame is not empty
+        if df.empty:
+            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'main_company', 'text', 'cluster_size'])
+
+        # Create company_list safely
+        df['company_list'] = df['company'].fillna('').str.split(' | ')
         df = df.sort_values('datetime')
+
         clusters = []
         processed = set()

-        for i
+        for i in tqdm(range(len(df)), total=len(df)):
             if i in processed:
                 continue
-
+
+            row1 = df.iloc[i]
+            if pd.isna(row1['text']) or not row1['company_list']:
+                processed.add(i)
+                clusters.append([i])
+                continue
+
             cluster = [i]
             processed.add(i)
             text1_embedding = self.encode_text(row1['text'])
@@ -80,11 +54,16 @@ class NewsProcessor:
             if progress_bar:
                 progress_bar.progress(len(processed) / len(df))
                 progress_bar.text(f'Processing item {len(processed)}/{len(df)}...')
-
-
+
+            # Use index-based iteration instead of iterrows
+            for j in range(len(df)):
                 if j in processed:
                     continue

+                row2 = df.iloc[j]
+                if pd.isna(row2['text']) or not row2['company_list']:
+                    continue
+
                 time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
                 if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
                     continue
@@ -95,6 +74,7 @@ class NewsProcessor:
                 is_main1, main_company1 = self.is_company_main_subject(row1['text'], row1['company_list'])
                 is_main2, main_company2 = self.is_company_main_subject(row2['text'], row2['company_list'])

+                # Safe set operation
                 companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))

                 if similarity >= self.similarity_threshold and companies_overlap:
@@ -105,24 +85,31 @@ class NewsProcessor:

         result_data = []
         for cluster_id, cluster in enumerate(clusters, 1):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                cluster_texts = df.iloc[cluster]
+                main_companies = []
+
+                for _, row in cluster_texts.iterrows():
+                    if not pd.isna(row['text']) and isinstance(row['company_list'], list):
+                        is_main, company = self.is_company_main_subject(row['text'], row['company_list'])
+                        if is_main and company:
+                            main_companies.append(company)
+
+                main_company = main_companies[0] if main_companies else "Multiple/Unclear"
+
+                for idx in cluster:
+                    row_data = df.iloc[idx]
+                    result_data.append({
+                        'cluster_id': cluster_id,
+                        'datetime': row_data['datetime'],
+                        'company': ' | '.join(row_data['company_list']) if isinstance(row_data['company_list'], list) else '',
+                        'main_company': main_company,
+                        'text': row_data['text'],
+                        'cluster_size': len(cluster)
+                    })
+            except Exception as e:
+                print(f"Error processing cluster {cluster_id}: {str(e)}")
+                continue

         return pd.DataFrame(result_data)

@@ -184,7 +171,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'

 def main():
-    st.title("
+    st.title("кластеризуем новости v.1.2")
     st.write("Upload Excel file with columns: company, datetime, text")

     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
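Note on the comparison step: the hunks above use a similarity value for each pair of rows, but the line that computes it lies outside the changed regions and is not shown in this diff. Below is a minimal sketch of how that comparison could work, assuming it relies on the L2-normalized embeddings returned by encode_text, in which case cosine similarity reduces to a dot product. The helper name embedding_similarity and the usage lines are illustrative assumptions, not the committed code.

import numpy as np

# Hypothetical helper (not part of the commit): since encode_text() returns
# L2-normalized vectors, cosine similarity is simply their dot product.
def embedding_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    return float(np.dot(vec_a, vec_b))

# Assumed usage inside the pairwise loop shown above:
#   text2_embedding = self.encode_text(row2['text'])
#   similarity = embedding_similarity(text1_embedding, text2_embedding)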