kambris committed on
Commit 9a7840e · verified · 1 Parent(s): 52078cc

Update app.py

Files changed (1)
  1. app.py +13 -7
app.py CHANGED
@@ -326,19 +326,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
-        # Get embeddings with proper output handling
+        # Get embeddings while keeping all content
         embeddings = []
         for i, text in enumerate(texts):
-            # Split text into chunks that respect the 512 token limit
-            text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+            # Tokenize the full text first
+            full_tokens = bert_tokenizer.tokenize(text)
             chunk_embeddings = []
 
-            for chunk in text_chunks:
-                chunk_embedding = get_embedding_for_text(chunk, bert_tokenizer, bert_model)
+            # Create chunks of 510 tokens (leaving room for special tokens)
+            for start_idx in range(0, len(full_tokens), 510):
+                end_idx = start_idx + 510
+                chunk_tokens = full_tokens[start_idx:end_idx]
+                chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
+
+                # Get embedding for this chunk
+                chunk_embedding = get_embedding_for_text(chunk_text, bert_tokenizer, bert_model)
                 chunk_embeddings.append(chunk_embedding)
 
-            # Combine chunk embeddings to represent the full poem
-            full_embedding = np.mean(chunk_embeddings, axis=0)
+            # Combine embeddings for full poem representation
+            full_embedding = np.mean(chunk_embeddings, axis=0) if chunk_embeddings else np.zeros(bert_model.config.hidden_size)
             embeddings.append(full_embedding)
 
             progress = (i + 1) / len(texts) * 0.4
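For readers following the change: the new code tokenizes each poem up front, slices the token list into 510-token chunks so the [CLS] and [SEP] special tokens still fit within BERT's 512-token limit, embeds each chunk, and mean-pools the chunk vectors into one poem embedding. Below is a minimal, self-contained sketch of that flow. The body of `get_embedding_for_text` is not part of this diff, so the mean-pooled last-hidden-state version shown here is an assumption, and the AraBERT checkpoint name is only a placeholder.

```python
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Placeholder checkpoint; app.py presumably loads its own Arabic BERT model.
MODEL_NAME = "aubmindlab/bert-base-arabertv2"  # assumption, not taken from the diff

bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)


def get_embedding_for_text(text, tokenizer, model):
    """Assumed helper: embed one chunk by mean-pooling the last hidden state."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()


def embed_long_text(text, tokenizer, model, chunk_size=510):
    """Token-level chunking as in the commit: 510 tokens per chunk leaves
    room for the [CLS]/[SEP] tokens the tokenizer adds when encoding."""
    full_tokens = tokenizer.tokenize(text)
    chunk_embeddings = []
    for start_idx in range(0, len(full_tokens), chunk_size):
        chunk_tokens = full_tokens[start_idx:start_idx + chunk_size]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunk_embeddings.append(get_embedding_for_text(chunk_text, tokenizer, model))
    # Average the chunk vectors; fall back to a zero vector for empty input,
    # mirroring the np.zeros(bert_model.config.hidden_size) branch in the diff.
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)


# Example: a long poem yields a single fixed-size vector regardless of length.
vector = embed_long_text("نص القصيدة هنا " * 200, bert_tokenizer, bert_model)
print(vector.shape)  # (hidden_size,)
```

Compared with the old character-based slicing (`text[i:i+512]`, which also shadowed the loop variable `i`), token-level chunking guarantees no chunk exceeds the model's limit and no content is silently truncated.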