KeyangXu committed on
Commit
e0d30f3
·
unverified ·
2 Parent(s): 1912e89 5308684

Merge pull request #20 from AutoLLM/use-3.5-turbo-16k

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. src/action.py +1 -1
  3. src/relevancy.py +7 -8
README.md CHANGED
@@ -28,7 +28,7 @@ Staying up to date on [arXiv](https://arxiv.org) papers can take a considerable
28
  This repository offers a method to curate a daily digest, sorted by relevance, using large language models. These models are conditioned based on your personal research interests, which are described in natural language.
29
 
30
  * You modify the configuration file `config.yaml` with an arXiv Subject, some set of Categories, and a natural language statement about the type of papers you are interested in.
31
- * The code pulls all the abstracts for papers in those categories and ranks how relevant they are to your interest on a scale of 1-10 using `gpt-3.5-turbo`.
32
  * The code then emits an HTML digest listing all the relevant papers, and optionally emails it to you using [SendGrid](https://sendgrid.com). You will need to have a SendGrid account with an API key for this functionality to work.
33
 
34
  ### Testing it out with Hugging Face:
 
28
  This repository offers a method to curate a daily digest, sorted by relevance, using large language models. These models are conditioned based on your personal research interests, which are described in natural language.
29
 
30
  * You modify the configuration file `config.yaml` with an arXiv Subject, some set of Categories, and a natural language statement about the type of papers you are interested in.
31
+ * The code pulls all the abstracts for papers in those categories and ranks how relevant they are to your interest on a scale of 1-10 using `gpt-3.5-turbo-16k`.
32
  * The code then emits an HTML digest listing all the relevant papers, and optionally emails it to you using [SendGrid](https://sendgrid.com). You will need to have a SendGrid account with an API key for this functionality to work.
33
 
34
  ### Testing it out with Hugging Face:
src/action.py CHANGED
@@ -92,7 +92,7 @@ def generate_body(topic, categories, interest, threshold):
92
  papers,
93
  query={"interest": interest},
94
  threshold_score=threshold,
95
- num_paper_in_prompt=8)
96
  body = "<br><br>".join(
97
  [f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}'
98
  for paper in relevancy])
 
92
  papers,
93
  query={"interest": interest},
94
  threshold_score=threshold,
95
+ num_paper_in_prompt=16)
96
  body = "<br><br>".join(
97
  [f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}'
98
  for paper in relevancy])
src/relevancy.py CHANGED
@@ -2,7 +2,7 @@
2
  run:
3
  python -m relevancy run_all_day_paper \
4
  --output_dir ./data \
5
- --model_name="gpt-3.5-turbo" \
6
  """
7
  import time
8
  import json
@@ -53,7 +53,7 @@ def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
53
  scores = []
54
  for item in score_items:
55
  temp = item["Relevancy score"]
56
- if "/" in temp:
57
  scores.append(int(temp.split("/")[0]))
58
  else:
59
  scores.append(int(temp))
@@ -72,7 +72,7 @@ def post_process_chat_gpt_response(paper_data, response, threshold_score=8):
72
  output_str += "Link: " + paper_data[idx]["main_page"] + "\n"
73
  for key, value in inst.items():
74
  paper_data[idx][key] = value
75
- output_str += key + ": " + value + "\n"
76
  paper_data[idx]['summarized_text'] = output_str
77
  selected_data.append(paper_data[idx])
78
  return selected_data, hallucination
@@ -90,7 +90,7 @@ def process_subject_fields(subjects):
90
  def generate_relevance_score(
91
  all_papers,
92
  query,
93
- model_name="gpt-3.5-turbo",
94
  threshold_score=8,
95
  num_paper_in_prompt=4,
96
  temperature=0.4,
@@ -108,7 +108,7 @@ def generate_relevance_score(
108
  decoding_args = utils.OpenAIDecodingArguments(
109
  temperature=temperature,
110
  n=1,
111
- max_tokens=1072, # hard-code to maximize the length. the requests will be automatically adjusted
112
  top_p=top_p,
113
  )
114
  request_start = time.time()
@@ -118,7 +118,6 @@ def generate_relevance_score(
118
  batch_size=1,
119
  decoding_args=decoding_args,
120
  logit_bias={"100257": -100}, # prevent the <|endoftext|> from being generated
121
- # "100265":-100, "100276":-100 for <|im_end|> and <endofprompt> token
122
  )
123
  print ("response", response['message']['content'])
124
  request_duration = time.time() - request_start
@@ -132,7 +131,7 @@ def generate_relevance_score(
132
  print(f"Post-processing took {time.time() - process_start:.2f}s")
133
 
134
  if sorting:
135
- ans_data = sorted(ans_data, key=lambda x: x["Relevancy score"], reverse=True)
136
 
137
  return ans_data, hallucination
138
 
@@ -140,7 +139,7 @@ def run_all_day_paper(
140
  query={"interest":"", "subjects":["Computation and Language", "Artificial Intelligence"]},
141
  date=None,
142
  data_dir="../data",
143
- model_name="gpt-3.5-turbo",
144
  threshold_score=8,
145
  num_paper_in_prompt=8,
146
  temperature=0.4,
 
2
  run:
3
  python -m relevancy run_all_day_paper \
4
  --output_dir ./data \
5
+ --model_name="gpt-3.5-turbo-16k" \
6
  """
7
  import time
8
  import json
 
53
  scores = []
54
  for item in score_items:
55
  temp = item["Relevancy score"]
56
+ if isinstance(temp, str) and "/" in temp:
57
  scores.append(int(temp.split("/")[0]))
58
  else:
59
  scores.append(int(temp))
 
72
  output_str += "Link: " + paper_data[idx]["main_page"] + "\n"
73
  for key, value in inst.items():
74
  paper_data[idx][key] = value
75
+ output_str += str(key) + ": " + str(value) + "\n"
76
  paper_data[idx]['summarized_text'] = output_str
77
  selected_data.append(paper_data[idx])
78
  return selected_data, hallucination
 
90
  def generate_relevance_score(
91
  all_papers,
92
  query,
93
+ model_name="gpt-3.5-turbo-16k",
94
  threshold_score=8,
95
  num_paper_in_prompt=4,
96
  temperature=0.4,
 
108
  decoding_args = utils.OpenAIDecodingArguments(
109
  temperature=temperature,
110
  n=1,
111
+ max_tokens=128*num_paper_in_prompt, # The response for each paper should be less than 128 tokens.
112
  top_p=top_p,
113
  )
114
  request_start = time.time()
 
118
  batch_size=1,
119
  decoding_args=decoding_args,
120
  logit_bias={"100257": -100}, # prevent the <|endoftext|> from being generated
 
121
  )
122
  print ("response", response['message']['content'])
123
  request_duration = time.time() - request_start
 
131
  print(f"Post-processing took {time.time() - process_start:.2f}s")
132
 
133
  if sorting:
134
+ ans_data = sorted(ans_data, key=lambda x: int(x["Relevancy score"]), reverse=True)
135
 
136
  return ans_data, hallucination
137
 
 
139
  query={"interest":"", "subjects":["Computation and Language", "Artificial Intelligence"]},
140
  date=None,
141
  data_dir="../data",
142
+ model_name="gpt-3.5-turbo-16k",
143
  threshold_score=8,
144
  num_paper_in_prompt=8,
145
  temperature=0.4,