kambris committed on
Commit 9a7840e · verified · 1 Parent(s): 52078cc

Update app.py

Files changed (1)
  1. app.py +13 -7
app.py CHANGED
@@ -326,19 +326,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
-        # Get embeddings with proper output handling
+        # Get embeddings while keeping all content
         embeddings = []
         for i, text in enumerate(texts):
-            # Split text into chunks that respect the 512 token limit
-            text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+            # Tokenize the full text first
+            full_tokens = bert_tokenizer.tokenize(text)
             chunk_embeddings = []
 
-            for chunk in text_chunks:
-                chunk_embedding = get_embedding_for_text(chunk, bert_tokenizer, bert_model)
+            # Create chunks of 510 tokens (leaving room for special tokens)
+            for start_idx in range(0, len(full_tokens), 510):
+                end_idx = start_idx + 510
+                chunk_tokens = full_tokens[start_idx:end_idx]
+                chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
+
+                # Get embedding for this chunk
+                chunk_embedding = get_embedding_for_text(chunk_text, bert_tokenizer, bert_model)
                 chunk_embeddings.append(chunk_embedding)
 
-            # Combine chunk embeddings to represent the full poem
-            full_embedding = np.mean(chunk_embeddings, axis=0)
+            # Combine embeddings for full poem representation
+            full_embedding = np.mean(chunk_embeddings, axis=0) if chunk_embeddings else np.zeros(bert_model.config.hidden_size)
             embeddings.append(full_embedding)
 
             progress = (i + 1) / len(texts) * 0.4
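For readers following the change: the new code tokenizes each poem up front, slices the token list into 510-token chunks so the [CLS] and [SEP] special tokens still fit within BERT's 512-token limit, embeds each chunk, and mean-pools the chunk vectors into one poem embedding. Below is a minimal, self-contained sketch of that flow. The body of `get_embedding_for_text` is not part of this diff, so the mean-pooled last-hidden-state version shown here is an assumption, and the AraBERT checkpoint name is only a placeholder.

```python
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Placeholder checkpoint; app.py presumably loads its own Arabic BERT model.
MODEL_NAME = "aubmindlab/bert-base-arabertv2"  # assumption, not taken from the diff

bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)


def get_embedding_for_text(text, tokenizer, model):
    """Assumed helper: embed one chunk by mean-pooling the last hidden state."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()


def embed_long_text(text, tokenizer, model, chunk_size=510):
    """Token-level chunking as in the commit: 510 tokens per chunk leaves
    room for the [CLS]/[SEP] tokens the tokenizer adds when encoding."""
    full_tokens = tokenizer.tokenize(text)
    chunk_embeddings = []
    for start_idx in range(0, len(full_tokens), chunk_size):
        chunk_tokens = full_tokens[start_idx:start_idx + chunk_size]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunk_embeddings.append(get_embedding_for_text(chunk_text, tokenizer, model))
    # Average the chunk vectors; fall back to a zero vector for empty input,
    # mirroring the np.zeros(bert_model.config.hidden_size) branch in the diff.
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)


# Example: a long poem yields a single fixed-size vector regardless of length.
vector = embed_long_text("نص القصيدة هنا " * 200, bert_tokenizer, bert_model)
print(vector.shape)  # (hidden_size,)
```

Compared with the old character-based slicing (`text[i:i+512]`, which also shadowed the loop variable `i`), token-level chunking guarantees no chunk exceeds the model's limit and no content is silently truncated.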