Pringled committed on
Commit
d90d4c0
·
1 Parent(s): e49e0e9
Files changed (1) hide show
  1. app.py +11 -17
app.py CHANGED
@@ -4,8 +4,6 @@ import numpy as np
4
  from model2vec import StaticModel
5
  from reach import Reach
6
  from difflib import ndiff
7
- import tqdm
8
- from contextlib import contextmanager
9
 
10
  # Load the model at startup
11
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -27,19 +25,14 @@ def batch_iterable(iterable, batch_size):
27
  for i in range(0, len(iterable), batch_size):
28
  yield iterable[i:i + batch_size]
29
 
30
- @contextmanager
31
- def tqdm_redirect(progress):
32
- original_tqdm = tqdm.tqdm
33
- try:
34
- tqdm.tqdm = progress.tqdm
35
- yield
36
- finally:
37
- tqdm.tqdm = original_tqdm
38
-
39
  def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
40
- with tqdm_redirect(progress):
41
- embeddings = model.encode(texts, show_progressbar=True, batch_size=batch_size)
42
- return embeddings
 
 
 
 
43
 
44
  def deduplicate(
45
  embedding_matrix: np.ndarray,
@@ -299,7 +292,8 @@ def deduplicate_across_datasets(
299
 
300
  return duplicate_indices_in_test, duplicate_to_original_mapping
301
 
302
- with gr.Blocks() as demo:
 
303
  gr.Markdown("# Semantic Deduplication")
304
 
305
  deduplication_type = gr.Radio(
@@ -327,8 +321,8 @@ with gr.Blocks() as demo:
327
 
328
  compute_button = gr.Button("Compute")
329
 
330
- # Use 'lines' parameter to set the height
331
- status_output = gr.Textbox(lines=10, label="Status")
332
  result_output = gr.Markdown()
333
 
334
  # Function to update the visibility of dataset2_inputs
 
4
  from model2vec import StaticModel
5
  from reach import Reach
6
  from difflib import ndiff
 
 
7
 
8
  # Load the model at startup
9
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 
25
  for i in range(0, len(iterable), batch_size):
26
  yield iterable[i:i + batch_size]
27
 
 
 
 
 
 
 
 
 
 
28
  def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
29
+ embeddings = []
30
+ total_batches = (len(texts) + batch_size - 1) // batch_size
31
+ for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
32
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
33
+ embeddings.append(batch_embeddings)
34
+ progress((i + 1) / total_batches, desc=desc)
35
+ return np.concatenate(embeddings, axis=0)
36
 
37
  def deduplicate(
38
  embedding_matrix: np.ndarray,
 
292
 
293
  return duplicate_indices_in_test, duplicate_to_original_mapping
294
 
295
+ # Adjust the height of the status_output component using custom CSS
296
+ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
297
  gr.Markdown("# Semantic Deduplication")
298
 
299
  deduplication_type = gr.Radio(
 
321
 
322
  compute_button = gr.Button("Compute")
323
 
324
+ # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
325
+ status_output = gr.Markdown(elem_id="status_output")
326
  result_output = gr.Markdown()
327
 
328
  # Function to update the visibility of dataset2_inputs