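"""Gradio Space that surfaces recent arXiv papers likely to introduce new datasets.

The app queries arXiv for recent dataset-related papers, scores each
title/abstract with a SetFit classifier, and renders the results as Markdown,
filterable by broad arXiv category. Data is cached and refreshed by a daily
background job.
"""
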
import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm
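
# Cache lifetime in seconds (12 hours) and the maximum number of arXiv results to fetch.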
CACHE_TIME = 60 * 60 * 12
MAX_RESULTS = 30_000
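
# Fetch recent arXiv papers matching the query; cached so repeated calls within
# CACHE_TIME reuse the same results instead of re-querying the arXiv API.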
@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
search = arxiv.Search(
query="ti:dataset AND abs:machine learning",
max_results=MAX_RESULTS,
sort_by=arxiv.SortCriterion.SubmittedDate,
)
return [
{
"title": result.title,
"abstract": result.summary,
"url": result.entry_id,
"category": result.primary_category,
"updated": result.updated,
}
for result in tqdm(search.results(), total=MAX_RESULTS)
]
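
# Load the SetFit classifier used to decide whether a paper introduces a new dataset.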
def load_model():
return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
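
# Concatenate title and abstract into the single text string passed to the model.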
def format_row_for_model(row):
return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
int2label = {0: "new_dataset", 1: "not_new_dataset"}
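
# Score papers in batches, attaching the predicted label and its probability to each row.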
def get_predictions(data: list[dict], model=None, batch_size=32):
if model is None:
model = load_model()
predictions = []
for i in tqdm(range(0, len(data), batch_size)):
batch = data[i : i + batch_size]
text_inputs = [format_row_for_model(row) for row in batch]
batch_predictions = model.predict_proba(text_inputs)
for j, row in enumerate(batch):
prediction = batch_predictions[j]
row["prediction"] = int2label[int(prediction.argmax())]
row["probability"] = float(prediction.max())
predictions.append(row)
return predictions
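
# Render a single paper as a Markdown block for display in the Gradio UI.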
def create_markdown(row):
title = row["title"]
abstract = row["abstract"]
arxiv_id = row["arxiv_id"]
hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
updated = row["updated"]
updated = updated.strftime("%Y-%m-%d")
broad_category = row["broad_category"]
category = row["category"]
return f""" <h1> {title} </h1> updated: {updated}
| category: {broad_category} | subcategory: {category} |
\n\n{abstract}
\n\n [Hugging Face Papers page]({hub_paper_url})
"""
@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def prepare_data():
print("Downloading arxiv results...")
arxiv_results = get_arxiv_result()
print("loading model...")
model = load_model()
print("Making predictions...")
predictions = get_predictions(arxiv_results, model=model)
df = pd.DataFrame(predictions)
df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
return df
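
# prepare_data() is TTL-cached, so these calls reuse the DataFrame built above.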
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
broad_categories = prepare_data().broad_category.unique().tolist()
def create_markdown_summary(categories=broad_categories):
    df = prepare_data()
    # Keep only papers the model predicts introduce a new dataset.
    df = df[df["prediction"] == "new_dataset"]
    if categories is not None:
        df = df[df["broad_category"].isin(categories)]
    return "\n\n".join(df["markdown"].tolist())
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()
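
# Gradio UI: a broad-category filter plus the rendered list of papers.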
with gr.Blocks() as demo:
gr.Markdown("## New Datasets in Machine Learning")
gr.Markdown(
"This Space attempts to show new papers on arXiv that are *likely* to be papers"
" introducing new datasets. \n\n"
)
    broad_categories_dropdown = gr.Dropdown(
choices=broad_categories,
label="Categories",
multiselect=True,
value=broad_categories,
)
results = gr.Markdown(create_markdown_summary())
    broad_categories_dropdown.change(create_markdown_summary, broad_categories_dropdown, results)
demo.launch()