kambris committed on
Commit
17deefc
·
verified ·
1 Parent(s): a6cdac2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  import pandas as pd
3
- from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
4
  from bertopic import BERTopic
5
  import torch
6
  import numpy as np
@@ -9,16 +9,17 @@ import numpy as np
9
  tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
10
  model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
11
 
12
- # Initialize AraBERT model and tokenizer for feature extraction
13
- bert_tokenizer = pipeline("feature-extraction", model="aubmindlab/bert-base-arabertv2")
 
14
 
15
  # Function to get embeddings from ARAT5 for topic modeling
16
  def generate_embeddings(texts):
17
  embeddings = []
18
-
19
  for text in texts:
20
  # Tokenize the text with truncation set to False
21
- tokens = bert_tokenizer.tokenizer.encode(text, truncation=False) # Do not truncate here
22
 
23
  # Split the tokens into chunks of size 512 (maximum length)
24
  chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
@@ -26,10 +27,11 @@ def generate_embeddings(texts):
26
  poem_embeddings = []
27
 
28
  for chunk in chunked_texts:
29
- # Process each chunk and get embeddings
30
- inputs = bert_tokenizer.tokenizer(chunk, return_tensors="pt", padding=True, truncation=False, max_length=512)
31
  with torch.no_grad():
32
- outputs = bert_tokenizer.model(**inputs)
 
33
  chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
34
 
35
  poem_embeddings.append(chunk_embedding)
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
4
  from bertopic import BERTopic
5
  import torch
6
  import numpy as np
 
9
  tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
10
  model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
11
 
12
+ # Initialize BERT tokenizer and model for feature extraction
13
+ bert_tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
14
+ bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv2")
15
 
16
  # Function to get embeddings from ARAT5 for topic modeling
17
  def generate_embeddings(texts):
18
  embeddings = []
19
+
20
  for text in texts:
21
  # Tokenize the text with truncation set to False
22
+ tokens = bert_tokenizer.encode(text, truncation=False) # Do not truncate here
23
 
24
  # Split the tokens into chunks of size 512 (maximum length)
25
  chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
 
27
  poem_embeddings = []
28
 
29
  for chunk in chunked_texts:
30
+ # Prepare the input tensor for the model
31
+ inputs = torch.tensor(chunk).unsqueeze(0) # Adding batch dimension
32
  with torch.no_grad():
33
+ outputs = bert_model(inputs)
34
+ # Get the embeddings from the last hidden state
35
  chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
36
 
37
  poem_embeddings.append(chunk_embedding)