Spaces:

spacerini
/

code-search

Sleeping

cakiki committed on Aug 3, 2023

Commit

b6e279e

1 Parent(s): 546fd6f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ def page_0(query):
     query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
-    results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
     results = format_results(results, untokenized_query)
     return results, [ix], gr.update(visible=True), untokenized_query
@@ -51,7 +51,8 @@ def page_i(i, ix, query):
 with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
     with gr.Row():
-        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
     with gr.Row():
         with gr.Column(scale=1):
             result_list = gr.Dataframe(type="array", visible=False, col_count=1)

     query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
+    results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True)
     results = format_results(results, untokenized_query)
     return results, [ix], gr.update(visible=True), untokenized_query
 with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
     with gr.Row():
+        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
+        gr.Markdown("This search tool was used to validate the tokenization scheme for code retrieval for the BigCode project. We indexed the [Santacoder](https://huggingface.co/bigcode/santacoder) training dataset and used a (2,4)-gram tokenizer to build the index. This is the same tokenization scheme that ended up being used to index the [StarCoder](https://huggingface.co/spaces/bigcode/search) dataset.")
     with gr.Row():
         with gr.Column(scale=1):
             result_list = gr.Dataframe(type="array", visible=False, col_count=1)