Spaces:

pentarosarium
/

clusters

Sleeping

App Files Files Community

pentarosarium committed on Nov 28, 2024

Commit

cbb8180

1 Parent(s): 6d4a64c

1.3

Browse files

Files changed (1) hide show

app.py +39 -2

app.py CHANGED Viewed

@@ -25,6 +25,44 @@ class NewsProcessor:
         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
     def process_news(self, df: pd.DataFrame, progress_bar=None):
         # Ensure the DataFrame is not empty
         if df.empty:
@@ -55,7 +93,6 @@ class NewsProcessor:
                 progress_bar.progress(len(processed) / len(df))
                 progress_bar.text(f'Processing item {len(processed)}/{len(df)}...')
-            # Use index-based iteration instead of iterrows
             for j in range(len(df)):
                 if j in processed:
                     continue
@@ -171,7 +208,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 def main():
-    st.title("кластеризуем новости v.1.2")
     st.write("Upload Excel file with columns: company, datetime, text")
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])

         self.similarity_threshold = similarity_threshold
         self.time_threshold = time_threshold
+    def mean_pooling(self, model_output, attention_mask):  # Attention-mask-weighted mean of the token embeddings along dim 1.
+        token_embeddings = model_output[0]  # the first element of the model output is used as the per-token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()  # mask gets a trailing axis and is expanded to the embeddings' size
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # masked sum / mask count; the clamp keeps the divisor >= 1e-9 so an all-zero mask cannot divide by zero
+    def encode_text(self, text):  # Embed one text value: tokenize, run the model, mean-pool, L2-normalize, return a NumPy array.
+        # Missing values (pd.isna) become an empty string; everything else is coerced with str()
+        if pd.isna(text):
+            text = ""
+        else:
+            text = str(text)
+        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')  # input is truncated to at most 512 tokens
+        with torch.no_grad():  # inference only, no gradient tracking
+            model_output = self.model(**encoded_input)
+        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])  # pooled with the tokenizer's attention mask
+        return F.normalize(sentence_embeddings[0], p=2, dim=0).numpy()  # only the first pooled row is kept, scaled to unit L2 norm
+    def is_company_main_subject(self, text: str, companies: List[str]) -> Tuple[bool, str]:  # Returns (True, company) for the first company that passes any check below, else (False, "").
+        if pd.isna(text):  # missing text can never match
+            return False, ""
+        text_lower = str(text).lower()  # all matching below is case-insensitive substring matching
+        for company in companies:  # companies are tried in list order; the first hit wins
+            company_lower = str(company).lower()  # NOTE(review): an empty company name would be a substring of any text - confirm callers never pass one
+            if company_lower in text_lower.split('.')[0]:  # check 1: name appears before the first '.' in the text
+                return True, company  # the company is returned as passed in, not lowercased
+            if text_lower.count(company_lower) >= 3:  # check 2: name occurs at least three times in the whole text
+                return True, company
+            doc = self.nlp(text_lower)  # NOTE(review): the parse is re-run for every company although text_lower does not change inside the loop
+            for sent in doc.sents:
+                if company_lower in sent.text:  # only sentences that mention the name are inspected token by token
+                    for token in sent:
+                        if token.dep_ == 'nsubj' and company_lower in token.text:  # check 3: an 'nsubj' token contains the name; NOTE(review): compared against a single token's text, so multi-word names presumably cannot match here - confirm
+                            return True, company
+        return False, ""  # no company passed any of the three checks
     def process_news(self, df: pd.DataFrame, progress_bar=None):
         # Ensure the DataFrame is not empty
         if df.empty:
                 progress_bar.progress(len(processed) / len(df))
                 progress_bar.text(f'Processing item {len(processed)}/{len(df)}...')
             for j in range(len(df)):
                 if j in processed:
                     continue
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 def main():
+    st.title("кластеризуем новости v.1.3+")
     st.write("Upload Excel file with columns: company, datetime, text")
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])