Update app.py

app.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 import pandas as pd
-from transformers import T5Tokenizer, T5ForConditionalGeneration,
+from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
 from bertopic import BERTopic
 import torch
 import numpy as np
@@ -9,16 +9,17 @@ import numpy as np
 tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
 model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
 
-# Initialize
-bert_tokenizer =
+# Initialize BERT tokenizer and model for feature extraction
+bert_tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
+bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv2")
 
 # Function to get embeddings from ARAT5 for topic modeling
 def generate_embeddings(texts):
     embeddings = []
-
+
     for text in texts:
         # Tokenize the text with truncation set to False
-        tokens = bert_tokenizer.
+        tokens = bert_tokenizer.encode(text, truncation=False)  # Do not truncate here
 
         # Split the tokens into chunks of size 512 (maximum length)
         chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
@@ -26,10 +27,11 @@ def generate_embeddings(texts):
         poem_embeddings = []
 
         for chunk in chunked_texts:
-            #
-            inputs =
+            # Prepare the input tensor for the model
+            inputs = torch.tensor(chunk).unsqueeze(0)  # Adding batch dimension
             with torch.no_grad():
-                outputs =
+                outputs = bert_model(inputs)
+            # Get the embeddings from the last hidden state
             chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
 
             poem_embeddings.append(chunk_embedding)
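The diff stops at line 37, before generate_embeddings returns: the commit never shows how the per-chunk vectors are reduced to one embedding per poem, nor how the result is handed to BERTopic. Below is a minimal sketch of those missing pieces, assuming mean pooling across chunks and BERTopic's precomputed-embeddings path (the embeddings argument of fit_transform); everything after poem_embeddings.append(...) is an assumption, not part of this commit.

import numpy as np
import torch
from bertopic import BERTopic
from transformers import BertModel, BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv2")

def generate_embeddings(texts):
    embeddings = []
    for text in texts:
        # Tokenize without truncation, then split into 512-token chunks
        # (the model's maximum input length).
        tokens = bert_tokenizer.encode(text, truncation=False)
        chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]

        poem_embeddings = []
        for chunk in chunked_texts:
            inputs = torch.tensor(chunk).unsqueeze(0)  # add batch dimension
            with torch.no_grad():
                outputs = bert_model(inputs)
            # Mean-pool the last hidden state over the token axis -> (1, hidden_size)
            chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
            poem_embeddings.append(chunk_embedding)

        # Assumption: average the chunk vectors into a single poem-level embedding.
        embeddings.append(np.vstack(poem_embeddings).mean(axis=0))

    return np.array(embeddings)

# Assumption: the Space passes these precomputed vectors to BERTopic through the
# `embeddings` argument of fit_transform instead of letting it embed the docs itself.
docs = ["...poem 1...", "...poem 2..."]  # placeholder documents
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs, embeddings=generate_embeddings(docs))

Two caveats worth noting: the committed comment still says the embeddings come from ARAT5, while the function actually pools AraBERT hidden states (the AraT5 tokenizer and model loaded at the top are presumably used elsewhere in the app); and after chunking, only the first chunk keeps the [CLS] token and only the last keeps [SEP], which mean pooling tolerates but a stricter implementation would re-insert per chunk.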