Tonic committed (verified)
Commit 7b2544a · Parent: 4cd22b7

Update app.py

Files changed (1): app.py (+70 -0)
app.py CHANGED
@@ -51,6 +51,11 @@ def clear_cuda_cache():
 def free_memory(*args):
     for arg in args:
         del arg
+
+def load_corpus_from_json(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+    return data
 
 # @spaces.GPU
 def compute_embeddings(selected_task, input_text):
 
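The commit doesn't include a sample corpus, but from how load_corpus_from_json and the load_corpus handler below use its return value (indexed by position and fed straight to the tokenizer as strings), a flat JSON array of sentences should be the expected layout. A minimal sketch of writing such a file; the filename corpus.json and the sentences are only illustrative:

# Hypothetical example: build a corpus in the flat list-of-strings layout
# that load_corpus_from_json() above appears to expect.
import json

sentences = [
    "Mistral-style embeddings can be used for semantic search.",
    "Gradio Blocks lets you compose tabs, buttons and textboxes.",
    "The Eiffel Tower is in Paris.",
]

with open("corpus.json", "w") as f:
    json.dump(sentences, f, indent=2)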
@@ -110,7 +115,35 @@ def compute_cosine_similarity(emb1, emb2):
     free_memory(tensor1, tensor2)
     return similarity
 
+
+def compute_embeddings_batch(input_texts):
+    max_length = 2042
+    processed_texts = [f'Instruct: {task_description}\nQuery: {text}' for text in input_texts]
+
+    batch_dict = tokenizer(processed_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
+    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
+    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
+    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
+    outputs = model(**batch_dict)
+    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+    embeddings = F.normalize(embeddings, p=2, dim=1)
+    return embeddings.detach().cpu().numpy()
+
+def semantic_search(query_embedding, corpus_embeddings, top_k=5):
+    scores = np.dot(corpus_embeddings, query_embedding.T).flatten()
+    top_k_indices = np.argsort(scores)[::-1][:top_k]
+    return top_k_indices, scores[top_k_indices]
+
+def search_similar_sentences(input_question, corpus_sentences, corpus_embeddings):
+    question_embedding = compute_embeddings_batch([input_question])[0]
+    top_k_indices, top_k_scores = semantic_search(question_embedding, corpus_embeddings)
+    results = [(corpus_sentences[i], top_k_scores[i]) for i in top_k_indices]
+    return results
+
+
 def app_interface():
+    corpus_sentences = []
+    corpus_embeddings = []
     with gr.Blocks() as demo:
         gr.Markdown(title)
         gr.Markdown(description)
 
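Because compute_embeddings_batch L2-normalises its outputs, the plain dot product in semantic_search is the cosine similarity, and np.argsort(scores)[::-1][:top_k] keeps the highest-scoring indices. A standalone toy check of that ranking logic (made-up unit-length 2-D vectors, not real model embeddings):

import numpy as np

# Unit-length "embeddings": dot product == cosine similarity.
corpus_embeddings = np.array([[1.0, 0.0],
                              [0.0, 1.0],
                              [0.6, 0.8]])
query_embedding = np.array([[0.8, 0.6]])

scores = np.dot(corpus_embeddings, query_embedding.T).flatten()
top_k_indices = np.argsort(scores)[::-1][:2]
print(top_k_indices)          # [2 0]
print(scores[top_k_indices])  # [0.96 0.8]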
@@ -139,6 +172,43 @@ def app_interface():
             inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
             outputs=similarity_output
         )
+        with gr.Tab("Load Corpus"):
+            json_uploader = gr.File(label="Upload JSON File")
+            load_corpus_button = gr.Button("Load Corpus")
+            corpus_status = gr.Textbox(label="Corpus Status", value="Corpus not loaded", readonly=True)
+
+            def load_corpus(file_info):
+                if file_info is None:
+                    return "No file uploaded. Please upload a JSON file."
+                try:
+                    global corpus_sentences, corpus_embeddings
+                    corpus_sentences = load_corpus_from_json(file_info['name'])
+                    corpus_embeddings = compute_embeddings_batch(corpus_sentences)
+                    return "Corpus loaded successfully with {} sentences.".format(len(corpus_sentences))
+                except Exception as e:
+                    return "Error loading corpus: {}".format(e)
+
+            load_corpus_button.click(
+                fn=load_corpus,
+                inputs=json_uploader,
+                outputs=corpus_status
+            )
+
+        with gr.Tab("Semantic Search"):
+            input_question_box = gr.Textbox(label="Enter your question")
+            search_button = gr.Button("Search")
+            search_results_output = gr.Textbox(label="Search Results")
+
+            def perform_search(input_question):
+                if not corpus_sentences or not corpus_embeddings:
+                    return "Corpus is not loaded. Please load a corpus first."
+                return search_similar_sentences(input_question, corpus_sentences, corpus_embeddings)
+
+            search_button.click(
+                fn=perform_search,
+                inputs=input_question_box,
+                outputs=search_results_output
+            )
 
         with gr.Row():
             with gr.Column():
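Outside the UI, the new search path can be exercised end to end with only the three helpers this commit adds, provided the tokenizer and model globals used by compute_embeddings_batch are already initialised elsewhere in app.py. A hypothetical smoke test; the file name and question are placeholders:

# Hypothetical smoke test of the new corpus/search helpers, run from app.py
# after the model and tokenizer have been loaded.
corpus_sentences = load_corpus_from_json("corpus.json")
corpus_embeddings = compute_embeddings_batch(corpus_sentences)

for sentence, score in search_similar_sentences(
        "Where is the Eiffel Tower?", corpus_sentences, corpus_embeddings):
    print(f"{score:.3f}  {sentence}")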