kambris committed on
Commit
b449fa6
·
verified ·
1 Parent(s): 9a7840e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -15
app.py CHANGED
def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding vector for a complete (possibly long) text.

    The text is tokenized once, split into chunks of at most 510 tokens
    (512 max positions minus 2 for the [CLS] and [SEP] special tokens),
    each chunk is encoded and run through the model independently, and the
    per-chunk [CLS] embeddings are averaged (unweighted).

    Args:
        text: Input string of arbitrary length.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Transformer model whose first output is the hidden-state
            tensor of shape (batch, seq, hidden).

    Returns:
        A 1-D numpy array of length ``model.config.hidden_size``; the zero
        vector when the text yields no tokens.
    """
    # Tokenize first so chunking is done on exact token counts rather than
    # guessed from characters or words.
    tokens = tokenizer.tokenize(text)

    # 512 maximum positions minus 2 reserved for CLS and SEP.
    chunk_size = 510
    chunk_embeddings = []

    for start in range(0, len(tokens), chunk_size):
        chunk = tokens[start:start + chunk_size]
        # NOTE(review): round-tripping tokens -> string -> tokens may
        # re-tokenize slightly differently for some tokenizers; the
        # truncation below guards against any resulting overflow.
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        # Re-encode with special tokens added and fixed-length padding.
        encoded = tokenizer(
            chunk_text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        )

        # Move all input tensors to the model's device.
        encoded = {k: v.to(model.device) for k, v in encoded.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            output = model(**encoded)
            # [CLS] embedding: position 0 of the first model output.
            embedding = output[0][:, 0, :].cpu().numpy()

        chunk_embeddings.append(embedding[0])

    # Combine all chunk embeddings with an unweighted mean.
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)
271
 
272
  def format_topics(topic_model, topic_counts):