cakiki committed on
Commit
b6e279e
·
1 Parent(s): 546fd6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -39,7 +39,7 @@ def page_0(query):
39
  query = tokenize_sentence(query)
40
  hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
41
  ix = [int(hit.docid) for hit in hits]
42
- results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
43
  results = format_results(results, untokenized_query)
44
  return results, [ix], gr.update(visible=True), untokenized_query
45
 
@@ -51,7 +51,8 @@ def page_i(i, ix, query):
51
 
52
  with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
53
  with gr.Row():
54
- gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
 
55
  with gr.Row():
56
  with gr.Column(scale=1):
57
  result_list = gr.Dataframe(type="array", visible=False, col_count=1)
 
39
  query = tokenize_sentence(query)
40
  hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
41
  ix = [int(hit.docid) for hit in hits]
42
+ results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True)
43
  results = format_results(results, untokenized_query)
44
  return results, [ix], gr.update(visible=True), untokenized_query
45
 
 
51
 
52
  with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
53
  with gr.Row():
54
+ gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
55
+ gr.Markdown("This search tool was used to validate the tokenization scheme for code retrieval for the BigCode project. We indexed the [Santacoder](https://huggingface.co/bigcode/santacoder) training dataset and use a (2,4)-gram tokenizer to build the index. This is the same tokenization scheme that ended up being used to index the [StarCoder](https://huggingface.co/spaces/bigcode/search) dataset.")
56
  with gr.Row():
57
  with gr.Column(scale=1):
58
  result_list = gr.Dataframe(type="array", visible=False, col_count=1)