kambris committed · verified
Commit c671da9 · 1 Parent(s): 631c46c

Update app.py

Files changed (1): app.py (+5 -7)
app.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
 model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
 
-# Initialize AraBERT model and tokenizer
+# Initialize AraBERT model and tokenizer for feature extraction
 bert_tokenizer = pipeline("feature-extraction", model="aubmindlab/bert-base-arabertv2")
 
 # Function to get embeddings from ARAT5 for topic modeling
@@ -17,7 +17,7 @@ def generate_embeddings(texts):
     embeddings = []
 
     for text in texts:
-        # Tokenize the text
+        # Tokenize the text (do not truncate)
         tokens = bert_tokenizer.tokenizer.encode(text, truncation=False)  # Get tokens without truncation
 
         # Split the tokens into chunks of size 512 (maximum length)
@@ -26,13 +26,10 @@ def generate_embeddings(texts):
         poem_embeddings = []
 
         for chunk in chunked_texts:
-            # Decode the chunk back into text (optional but useful for debugging)
-            chunk_text = bert_tokenizer.decode(chunk)
-
             # Process each chunk and get embeddings
-            inputs = bert_tokenizer(chunk_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+            inputs = bert_tokenizer.tokenizer(chunk, return_tensors="pt", padding=True, truncation=False, max_length=512)
             with torch.no_grad():
-                outputs = bert_tokenizer(**inputs)
+                outputs = bert_tokenizer.model(**inputs)
             chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
 
             poem_embeddings.append(chunk_embedding)
@@ -88,3 +85,4 @@ if uploaded_file is not None:
         st.write(result_df.head())
     except Exception as e:
         st.error(f"Error: {e}")
+
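
For context, below is a minimal standalone sketch of the chunked AraBERT embedding step this commit edits. It is not the app's actual code: it loads aubmindlab/bert-base-arabertv2 through AutoTokenizer/AutoModel instead of reaching into the feature-extraction pipeline's .tokenizer and .model attributes, and the embed_long_text helper name is hypothetical; only the 512-token chunking and the mean pooling are taken from the diff.

# Sketch only; assumes transformers' AutoTokenizer/AutoModel, not the pipeline used in app.py.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
model.eval()

def embed_long_text(text, max_len=512):
    """Mean-pool AraBERT hidden states over <=512-token chunks, then average the chunks."""
    # Tokenize once without truncation, then split the token ids into fixed-size chunks.
    ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [ids[i:i + max_len] for i in range(0, len(ids), max_len)]

    chunk_embeddings = []
    for chunk in chunks:
        # Decode the id chunk back to text so the tokenizer can rebuild special tokens
        # and the attention mask expected by the model.
        chunk_text = tokenizer.decode(chunk)
        inputs = tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=max_len)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean over the sequence dimension -> one vector per chunk.
        chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1))

    # Average the per-chunk vectors into a single fixed-size embedding.
    return torch.cat(chunk_embeddings, dim=0).mean(dim=0).numpy()

Averaging the per-chunk vectors yields one embedding per poem, mirroring the poem_embeddings list that generate_embeddings builds up in app.py.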