kambris committed · verified
Commit c671da9 · 1 Parent(s): 631c46c

Update app.py

Files changed (1): app.py (+5 -7)
app.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
 model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
 
-# Initialize AraBERT model and tokenizer
+# Initialize AraBERT model and tokenizer for feature extraction
 bert_tokenizer = pipeline("feature-extraction", model="aubmindlab/bert-base-arabertv2")
 
 # Function to get embeddings from ARAT5 for topic modeling
@@ -17,7 +17,7 @@ def generate_embeddings(texts):
     embeddings = []
 
     for text in texts:
-        # Tokenize the text
+        # Tokenize the text (do not truncate)
         tokens = bert_tokenizer.tokenizer.encode(text, truncation=False)  # Get tokens without truncation
 
         # Split the tokens into chunks of size 512 (maximum length)
@@ -26,13 +26,10 @@ def generate_embeddings(texts):
         poem_embeddings = []
 
         for chunk in chunked_texts:
-            # Decode the chunk back into text (optional but useful for debugging)
-            chunk_text = bert_tokenizer.decode(chunk)
-
             # Process each chunk and get embeddings
-            inputs = bert_tokenizer(chunk_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+            inputs = bert_tokenizer.tokenizer(chunk, return_tensors="pt", padding=True, truncation=False, max_length=512)
             with torch.no_grad():
-                outputs = bert_tokenizer(**inputs)
+                outputs = bert_tokenizer.model(**inputs)
             chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
 
             poem_embeddings.append(chunk_embedding)
@@ -88,3 +85,4 @@ if uploaded_file is not None:
         st.write(result_df.head())
     except Exception as e:
         st.error(f"Error: {e}")
+
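
For context, below is a minimal standalone sketch of the chunked AraBERT embedding step this commit edits. It is not the app's actual code: it loads aubmindlab/bert-base-arabertv2 through AutoTokenizer/AutoModel instead of reaching into the feature-extraction pipeline's .tokenizer and .model attributes, and the embed_long_text helper name is hypothetical; only the 512-token chunking and the mean pooling are taken from the diff.

# Sketch only; assumes transformers' AutoTokenizer/AutoModel, not the pipeline used in app.py.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
model.eval()

def embed_long_text(text, max_len=512):
    """Mean-pool AraBERT hidden states over <=512-token chunks, then average the chunks."""
    # Tokenize once without truncation, then split the token ids into fixed-size chunks.
    ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [ids[i:i + max_len] for i in range(0, len(ids), max_len)]

    chunk_embeddings = []
    for chunk in chunks:
        # Decode the id chunk back to text so the tokenizer can rebuild special tokens
        # and the attention mask expected by the model.
        chunk_text = tokenizer.decode(chunk)
        inputs = tokenizer(chunk_text, return_tensors="pt", truncation=True, max_length=max_len)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean over the sequence dimension -> one vector per chunk.
        chunk_embeddings.append(outputs.last_hidden_state.mean(dim=1))

    # Average the per-chunk vectors into a single fixed-size embedding.
    return torch.cat(chunk_embeddings, dim=0).mean(dim=0).numpy()

Averaging the per-chunk vectors yields one embedding per poem, mirroring the poem_embeddings list that generate_embeddings builds up in app.py.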