pentarosarium committed
Commit d32e04e · 1 Parent(s): cbb8180
Files changed (1)
  1. app.py +153 -99
app.py CHANGED
@@ -25,129 +25,183 @@ class NewsProcessor:
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
 
-    def mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-    def encode_text(self, text):
-        # Convert text to string and handle NaN values
-        if pd.isna(text):
-            text = ""
-        else:
-            text = str(text)
-        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
-        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
-        return F.normalize(sentence_embeddings[0], p=2, dim=0).numpy()
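
The removed mean_pooling is standard attention-masked averaging of token embeddings. A minimal self-contained check with toy tensors (hypothetical values, not the app's model output):

# Toy demonstration of attention-mask mean pooling.
import torch

token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])  # (batch=1, tokens=3, dim=2)
attention_mask = torch.tensor([[1, 1, 0]])  # third token is padding

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(pooled)  # tensor([[2., 3.]]): the padding token is excluded from the mean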
 
-    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:
-        if pd.isna(text):
-            return False, ""
-        text_lower = str(text).lower()
-        for company in companies:
-            company_lower = str(company).lower()
-            if company_lower in text_lower.split('.')[0]:
-                return True, company
-            if text_lower.count(company_lower) >= 3:
-                return True, company
-            doc = self.nlp(text_lower)
-            for sent in doc.sents:
-                if company_lower in sent.text:
                     for token in sent:
-                        if token.dep_ == 'nsubj' and company_lower in token.text:
-                            return True, company
-        return False, ""
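
The dependency check above treats a company as the story's actor only when it appears as a grammatical subject. A standalone illustration of the same spaCy pattern (assumes some installed pipeline with a parser, such as en_core_web_sm; the app loads its own model into self.nlp):

# Hypothetical demo of the nsubj dependency test.
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any spaCy model with a parser
doc = nlp("acme announced record quarterly profits.")

for sent in doc.sents:
    for token in sent:
        if token.dep_ == "nsubj" and "acme" in token.text:
            print(f"'{token.text}' is the subject of: {sent.text}")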
 
     def process_news(self, df: pd.DataFrame, progress_bar=None):
         # Ensure the DataFrame is not empty
         if df.empty:
-            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'main_company', 'text', 'cluster_size'])
 
-        # Create company_list safely
-        df['company_list'] = df['company'].fillna('').str.split(' | ')
         df = df.sort_values('datetime')
 
         clusters = []
         processed = set()
 
-        for i in tqdm(range(len(df)), total=len(df)):
             if i in processed:
                 continue
 
-            row1 = df.iloc[i]
-            if pd.isna(row1['text']) or not row1['company_list']:
-                processed.add(i)
-                clusters.append([i])
-                continue
-
-            cluster = [i]
             processed.add(i)
-            text1_embedding = self.encode_text(row1['text'])
-
-            if progress_bar:
-                progress_bar.progress(len(processed) / len(df))
-                progress_bar.text(f'Processing item {len(processed)}/{len(df)}...')
 
-            for j in range(len(df)):
-                if j in processed:
-                    continue
-
-                row2 = df.iloc[j]
-                if pd.isna(row2['text']) or not row2['company_list']:
-                    continue
-
-                time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
-                if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
-                    continue
-
-                text2_embedding = self.encode_text(row2['text'])
-                similarity = np.dot(text1_embedding, text2_embedding)
-
-                is_main1, main_company1 = self.is_company_main_subject(row1['text'], row1['company_list'])
-                is_main2, main_company2 = self.is_company_main_subject(row2['text'], row2['company_list'])
 
-                # Safe set operation
-                companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
 
-                if similarity >= self.similarity_threshold and companies_overlap:
-                    cluster.append(j)
-                    processed.add(j)
 
-            clusters.append(cluster)
 
         result_data = []
-        for cluster_id, cluster in enumerate(clusters, 1):
-            try:
-                cluster_texts = df.iloc[cluster]
-                main_companies = []
-
-                for _, row in cluster_texts.iterrows():
-                    if not pd.isna(row['text']) and isinstance(row['company_list'], list):
-                        is_main, company = self.is_company_main_subject(row['text'], row['company_list'])
-                        if is_main and company:
-                            main_companies.append(company)
-
-                main_company = main_companies[0] if main_companies else "Multiple/Unclear"
-
-                for idx in cluster:
-                    row_data = df.iloc[idx]
-                    result_data.append({
-                        'cluster_id': cluster_id,
-                        'datetime': row_data['datetime'],
-                        'company': ' | '.join(row_data['company_list']) if isinstance(row_data['company_list'], list) else '',
-                        'main_company': main_company,
-                        'text': row_data['text'],
-                        'cluster_size': len(cluster)
-                    })
-            except Exception as e:
-                print(f"Error processing cluster {cluster_id}: {str(e)}")
-                continue
-
         return pd.DataFrame(result_data)
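
Since encode_text L2-normalizes its output, the np.dot in the loop above is exactly cosine similarity. A toy check (hypothetical vectors):

# For unit-norm vectors, the dot product equals cosine similarity.
import numpy as np

a = np.array([3.0, 4.0]); a /= np.linalg.norm(a)  # [0.6, 0.8]
b = np.array([4.0, 3.0]); b /= np.linalg.norm(b)  # [0.8, 0.6]
print(np.dot(a, b))  # 0.96, the cosine of the angle between a and b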
 
 class NewsDeduplicator:
@@ -208,7 +262,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
         return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 
 def main():
-    st.title("кластеризуем новости v.1.3+")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
 
+    def preprocess_company_name(self, company_name: str) -> List[str]:
+        """
+        Preprocesses a company name into search patterns.
+        Handles names containing commas, quotes, and multiple words.
+        Returns the key identifiable parts of the company name.
+        """
+        if pd.isna(company_name):
+            return []
+
+        # Remove surrounding quotes and extra spaces
+        name = str(company_name).strip('"\'').strip()
+
+        # Split on commas and take the first part (usually the main name)
+        main_name = name.split(',')[0].strip()
+
+        # Build patterns from significant parts of the name
+        patterns = []
+
+        # Full main name
+        patterns.append(main_name.lower())
+
+        # Significant words (3+ characters)
+        words = [w for w in main_name.split() if len(w) >= 3]
+        if len(words) > 1:
+            # First significant word on its own
+            patterns.append(words[0].lower())
+
+        # Bigrams of consecutive significant words
+        for i in range(len(words) - 1):
+            patterns.append(f"{words[i]} {words[i+1]}".lower())
+
+        return list(set(patterns))
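
For example, the method yields the following patterns (hypothetical input, assuming an already-constructed NewsProcessor instance named proc; any quoted, comma-separated legal name behaves the same way):

# Hypothetical call:
patterns = proc.preprocess_company_name('Сбербанк России, ПАО')
# main_name = 'Сбербанк России'
# patterns -> {'сбербанк россии', 'сбербанк'}  (full name, first word, bigram, deduplicated)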
 
+    def is_company_main_subject(self, text: str, company_name: str) -> Tuple[bool, float]:
+        """
+        Determines whether the company is the main subject of the news item.
+        Returns (is_main_subject, relevance_score).
+        """
+        if pd.isna(text) or pd.isna(company_name):
+            return False, 0.0
+
+        text = str(text).lower()
+
+        # Get company name patterns
+        company_patterns = self.preprocess_company_name(company_name)
+        if not company_patterns:
+            return False, 0.0
+
+        doc = self.nlp(text)
+
+        # Initialize metrics
+        mentions_count = 0
+        is_in_first_sentence = False
+        is_subject = False
+        other_companies_count = 0
+
+        # Check the first sentence (guard against empty documents)
+        first_sent = next(doc.sents, None)
+        if first_sent is None:
+            return False, 0.0
+        first_sent_text = first_sent.text.lower()
+
+        for pattern in company_patterns:
+            if pattern in first_sent_text:
+                is_in_first_sentence = True
+                break
+
+        # Analyze each sentence
+        for sent in doc.sents:
+            sent_text = sent.text.lower()
+
+            # Count company mentions
+            for pattern in company_patterns:
+                if pattern in sent_text:
+                    mentions_count += 1
+
+            # Check whether the company is the grammatical subject
             for token in sent:
+                if token.dep_ in ['nsubj', 'nsubjpass'] and any(p in token.text.lower() for p in company_patterns):
+                    is_subject = True
+
+            # Count potential mentions of other companies
+            # (simplified; could be improved with named entity recognition)
+            company_indicators = ['компания', 'корпорация', 'фирма', 'банк', 'group', 'inc', 'ltd', 'llc', 'corporation']
+            for indicator in company_indicators:
+                if indicator in sent_text:
+                    other_companies_count += 1
+
+        # Calculate the relevance score
+        relevance_score = 0.0
+        relevance_score += 0.4 if is_in_first_sentence else 0.0
+        relevance_score += 0.3 if is_subject else 0.0
+        relevance_score += min(0.3, mentions_count * 0.1)  # capped at 0.3
+
+        # Penalize the score when many other companies are mentioned
+        relevance_score *= max(0.2, 1 - (other_companies_count * 0.1))
+
+        # The company counts as the main subject if the score clears the threshold
+        return relevance_score >= 0.5, relevance_score
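
A worked example of the scoring with hypothetical counts: named in the first sentence (+0.4), appearing as a subject (+0.3), mentioned twice (+0.2), with two other-company indicator words in the text:

base = 0.4 + 0.3 + min(0.3, 2 * 0.1)   # = 0.9
score = base * max(0.2, 1 - 2 * 0.1)   # penalty factor 0.8 -> 0.72
print(score >= 0.5)                    # True: kept as main subject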
 
     def process_news(self, df: pd.DataFrame, progress_bar=None):
         # Ensure the DataFrame is not empty
         if df.empty:
+            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
 
         df = df.sort_values('datetime')
 
+        # First, filter out news where the company isn't the main subject
+        relevance_results = []
+        for idx, row in df.iterrows():
+            is_main, score = self.is_company_main_subject(row['text'], row['company'])
+            if is_main:
+                relevance_results.append({
+                    'idx': idx,
+                    'relevance_score': score
+                })
+
+        if not relevance_results:
+            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
+
+        relevant_indices = [r['idx'] for r in relevance_results]
+        relevance_scores = {r['idx']: r['relevance_score'] for r in relevance_results}
+
+        df_filtered = df.loc[relevant_indices].copy()
+        df_filtered['relevance_score'] = df_filtered.index.map(relevance_scores)
+
+        # Continue with clustering logic...
         clusters = []
         processed = set()
 
+        for i in tqdm(range(len(df_filtered)), total=len(df_filtered)):
             if i in processed:
                 continue
 
+            row1 = df_filtered.iloc[i]
+            cluster = [df_filtered.index[i]]
             processed.add(i)
 
+            # A row with missing text forms its own singleton cluster
+            if pd.isna(row1['text']):
+                clusters.append(cluster)
+                continue
+
+            text1_embedding = self.encode_text(row1['text'])
 
+            if progress_bar:
+                progress_bar.progress(len(processed) / len(df_filtered))
 
+            for j in range(len(df_filtered)):
+                if j in processed:
+                    continue
+
+                row2 = df_filtered.iloc[j]
+                if pd.isna(row2['text']):
+                    continue
+
+                time_diff = pd.to_datetime(row1['datetime']) - pd.to_datetime(row2['datetime'])
+                if abs(time_diff.total_seconds() / 3600) > self.time_threshold:
+                    continue
 
+                text2_embedding = self.encode_text(row2['text'])
+                similarity = np.dot(text1_embedding, text2_embedding)
+
+                if similarity >= self.similarity_threshold:
+                    cluster.append(df_filtered.index[j])
+                    processed.add(j)
 
+            clusters.append(cluster)
+
+        # Create the result DataFrame
         result_data = []
+        for cluster_id, cluster_indices in enumerate(clusters, 1):
+            for idx in cluster_indices:
+                result_data.append({
+                    'cluster_id': cluster_id,
+                    'datetime': df.loc[idx, 'datetime'],
+                    'company': df.loc[idx, 'company'],
+                    'relevance_score': relevance_scores[idx],
+                    'text': df.loc[idx, 'text'],
+                    'cluster_size': len(cluster_indices)
+                })
+
         return pd.DataFrame(result_data)
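
Note that the nested loop above calls encode_text on row2 for every surviving pair, so encoding cost grows quadratically. A hedged optimization sketch (hypothetical proc and df_filtered names; relies only on the embeddings being unit-normalized, as encode_text guarantees): encode each text once and read all pairwise cosine similarities from one matrix product.

# Sketch: precompute embeddings, then sim[i, j] = cosine(text_i, text_j).
import numpy as np

embeddings = np.stack([proc.encode_text(t) for t in df_filtered['text'].fillna('')])
similarity_matrix = embeddings @ embeddings.T
# The inner loop then reduces to: similarity_matrix[i, j] >= proc.similarity_threshold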
 
 class NewsDeduplicator:
         return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 
 def main():
+    st.title("кластеризуем новости v.1.4")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
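
For context, the b64 payload in create_download_link presumably comes from an in-memory Excel export. A minimal sketch of such a helper (the buffer handling and Excel engine are assumptions, not the app's confirmed code):

# Hypothetical reconstruction of create_download_link's setup:
import base64
import io
import pandas as pd

def create_download_link(df: pd.DataFrame, filename: str) -> str:
    buffer = io.BytesIO()
    df.to_excel(buffer, index=False)  # assumes an Excel writer such as openpyxl
    b64 = base64.b64encode(buffer.getvalue()).decode()
    return (f'<a href="data:application/vnd.openxmlformats-officedocument.'
            f'spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>')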