Nikhil2904 committed on
Commit 600c297 · verified · 1 Parent(s): ea8262e

Upload 15 files

Files changed (15)
  1. Dockerfile +24 -0
  2. README.md +43 -12
  3. callback.py +123 -0
  4. generate_pdf.py +70 -0
  5. openllm_chain.py +114 -0
  6. parser_test.py +85 -0
  7. pdf_to_quizz.py +28 -0
  8. qa_llm.py +64 -0
  9. qcm_chain.py +36 -0
  10. quizz_generator.py +54 -0
  11. requirements.txt +14 -0
  12. text_to_quizz.py +8 -0
  13. train_llm.csv +2 -0
  14. ui.py +88 -0
  15. ui_utils.py +45 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # start by pulling the python image
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # copy the requirements file into the image
+ COPY ./requirements.txt /requirements.txt
+
+ # switch working directory
+ WORKDIR /
+
+ ENV OPENAI_API_KEY=""
+
+ EXPOSE 8501
+
+ # install the dependencies and packages listed in the requirements file
+ RUN pip3 install -r requirements.txt
+
+ # copy everything from the local directory into the image
+ COPY ./ /
+
+ # run the Streamlit app when the container starts
+ ENTRYPOINT [ "streamlit", "run" ]
+ CMD [ "ui.py", "--server.headless", "true", "--server.fileWatcherType", "none", "--browser.gatherUsageStats", "false" ]
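
A minimal sketch of building and running this image (the image name and key value are illustrative, not part of the commit):

```sh
# assumed usage: build the image, then serve the Streamlit UI on port 8501
docker build -t pdf-to-quizz .
docker run -p 8501:8501 -e OPENAI_API_KEY="..." pdf-to-quizz
```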
README.md CHANGED
@@ -1,12 +1,43 @@
- ---
- title: Mcqt
- emoji: 🐨
- colorFrom: red
- colorTo: green
- sdk: streamlit
- sdk_version: 1.32.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PDF to Quiz
+
+ Upload a multi-page PDF and generate a multiple-choice quiz. Two questions are generated per page.
+
+ This leverages the LangChain library to abstract the LLM (Large Language Model) calls.
+
+ The UI is built with Streamlit.
+
+ Here is an example PDF (in French, sorry, but you get the idea...):
+
+ ![PDF sample](img/PDF-sample.png)
+
+ It will generate the following interactive quiz questions:
+
+ ![Quiz sample](img/quiz-reponse.png)
+
+ ## Prerequisites
+
+ You need a GPU to run the 13B model locally, or you can deploy it on Hugging Face, for example (it's not free!).
+
+ You can find [the model on HuggingFace](https://huggingface.co/fbellame/pdf_to_quizz_llama_13B).
+
+ The [training dataset is also available on HuggingFace](https://huggingface.co/datasets/fbellame/pdf_to_quizz_llama_13B).
+
+ A video explaining the process is also [available](https://youtu.be/gXXkLVfiBVQ) (in French, sorry).
+
+ ## Instructions
+
+ To install:
+ ```sh
+ pip install -r requirements.txt
+ ```
+
+ ## Run
+
+ To run:
+ ```sh
+ streamlit run ui.py
+ ```
callback.py ADDED
@@ -0,0 +1,123 @@
+ from langchain.callbacks.base import BaseCallbackHandler
+ from pydantic import BaseModel
+ from typing import Any, Dict, List, Union
+ from langchain.schema import AgentAction, AgentFinish, LLMResult
+
+ class BaseMyCallbackHandler(BaseModel):
+     """Base fake callback handler for testing."""
+
+     starts: int = 0
+     ends: int = 0
+     errors: int = 0
+     text: int = 0
+     ignore_llm_: bool = False
+     ignore_chain_: bool = False
+     ignore_agent_: bool = False
+     always_verbose_: bool = False
+
+     @property
+     def always_verbose(self) -> bool:
+         """Whether to call verbose callbacks even if verbose is False."""
+         return True
+
+     @property
+     def ignore_llm(self) -> bool:
+         """Whether to ignore LLM callbacks."""
+         return self.ignore_llm_
+
+     @property
+     def ignore_chain(self) -> bool:
+         """Whether to ignore chain callbacks."""
+         return self.ignore_chain_
+
+     @property
+     def ignore_agent(self) -> bool:
+         """Whether to ignore agent callbacks."""
+         return self.ignore_agent_
+
+     # finer-grained counters for easier debugging of failing tests
+     chain_starts: int = 0
+     chain_ends: int = 0
+     llm_starts: int = 0
+     llm_ends: int = 0
+     llm_streams: int = 0
+     tool_starts: int = 0
+     tool_ends: int = 0
+     agent_ends: int = 0
+
+ class MyCallbackHandler(BaseMyCallbackHandler, BaseCallbackHandler):
+     """Fake callback handler for testing."""
+
+     def on_llm_start(
+         self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+     ) -> None:
+         """Run when LLM starts running."""
+         self.llm_starts += 1
+         self.starts += 1
+         print(prompts[0])
+
+     def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+         """Run when LLM generates a new token."""
+         self.llm_streams += 1
+
+     def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+         """Run when LLM ends running."""
+         print(response)
+         self.llm_ends += 1
+         self.ends += 1
+
+     def on_llm_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when LLM errors."""
+         self.errors += 1
+
+     def on_chain_start(
+         self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
+     ) -> None:
+         """Run when chain starts running."""
+         self.chain_starts += 1
+         self.starts += 1
+
+     def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
+         """Run when chain ends running."""
+         self.chain_ends += 1
+         self.ends += 1
+
+     def on_chain_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when chain errors."""
+         self.errors += 1
+
+     def on_tool_start(
+         self, serialized: Dict[str, Any], input_str: str, **kwargs: Any
+     ) -> None:
+         """Run when tool starts running."""
+         self.tool_starts += 1
+         self.starts += 1
+
+     def on_tool_end(self, output: str, **kwargs: Any) -> None:
+         """Run when tool ends running."""
+         self.tool_ends += 1
+         self.ends += 1
+
+     def on_tool_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when tool errors."""
+         self.errors += 1
+
+     def on_text(self, text: str, **kwargs: Any) -> None:
+         """Run on arbitrary text."""
+         self.text += 1
+
+     def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None:
+         """Run when agent ends running."""
+         self.agent_ends += 1
+         self.ends += 1
+
+     def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+         """Run on agent action."""
+         self.tool_starts += 1
+         self.starts += 1
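
A minimal sketch of attaching this handler (assuming the model from qa_llm.py is available; `callbacks=` is LangChain's standard hook):

```python
# Assumed usage sketch, not part of the upload: attach the handler to an
# LLM call, then read the event counters it accumulated.
from callback import MyCallbackHandler
from qa_llm import QaLlm

handler = MyCallbackHandler()
llm = QaLlm().get_llm()
llm.generate(["What is 2 + 2?"], callbacks=[handler])
print(handler.llm_starts, handler.llm_ends)  # 1 1 on a successful call
```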
generate_pdf.py ADDED
@@ -0,0 +1,70 @@
+ import json
+ from fpdf import FPDF
+
+ class PDF(FPDF):
+     def header(self):
+         self.set_font("Arial", "B", 12)
+         self.cell(0, 10, "Questionnaire", align="C", ln=True)
+         self.cell(0, 10, "", ln=True)
+
+     def footer(self):
+         self.set_y(-15)
+         self.set_font("Arial", "I", 8)
+         page_number = f"Page {self.page_no()}"
+         self.cell(0, 10, page_number, align="C")
+
+ def generate_questions(data, pdf: PDF, print_response: bool = False):
+     pdf.add_page()
+
+     question_number = 1
+     # Add questions to the PDF
+     for question_data in data:
+         question = question_data["question"]
+         options = [
+             f"A: {question_data['A']}",
+             f"B: {question_data['B']}",
+             f"C: {question_data['C']}",
+             f"D: {question_data['D']}"
+         ]
+
+         # Add question
+         pdf.multi_cell(0, 10, f"{question_number}. {question}")
+
+         # Add options
+         for option in options:
+             pdf.multi_cell(0, 10, option)
+
+         # Add response (masked with "?" unless print_response is set)
+         response = "?"
+         if print_response:
+             response = question_data["reponse"]
+         pdf.cell(0, 10, f"Response: {response}", ln=True)
+         pdf.cell(0, 10, "", ln=True)
+         question_number += 1
+
+     pdf.add_page()
+
+ def generate_pdf(filename, json_data):
+
+     # Create PDF document
+     pdf = PDF()
+     pdf.add_page()
+
+     # Set font style and size
+     pdf.set_font("Arial", size=10)
+
+     # first pass without answers, second pass with answers
+     generate_questions(json_data, pdf, print_response=False)
+     generate_questions(json_data, pdf, print_response=True)
+
+     # Save PDF to a file
+     pdf.output(filename)
+
+ def generate_pdf_quiz(file_name, json_data):
+
+     # remove extension .json from file name
+     if file_name.endswith(".json"):
+         file_name = file_name[:-5]
+
+     # Generate PDF
+     generate_pdf(f"{file_name}.pdf", json_data)
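
A minimal sketch of calling this module with hand-written data (the sample question is illustrative):

```python
# Assumed usage sketch: render a one-question quiz to sample.pdf
# (questions-only pages first, then the same questions with answers).
from generate_pdf import generate_pdf_quiz

sample = [{
    "question": "What is 2 + 2?",
    "A": "3", "B": "4", "C": "5", "D": "22",
    "reponse": "B",
}]
generate_pdf_quiz("sample.json", sample)  # writes sample.pdf
```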
openllm_chain.py ADDED
@@ -0,0 +1,114 @@
+ from typing import Any, Dict, List, Optional
+
+ from langchain.base_language import BaseLanguageModel
+ from langchain.callbacks.manager import (
+     CallbackManagerForChainRun,
+ )
+ from langchain.chains.base import Chain
+ from langchain.prompts.base import BasePromptTemplate
+ from langchain.output_parsers.regex import RegexParser
+
+ class OpenLlamaChain(Chain):
+     prompt: BasePromptTemplate
+     llm: BaseLanguageModel
+     output_key: str = "text"
+     suffixes: List[str] = ['</s>', 'User:', 'system:', 'Assistant:']
+
+     @property
+     def input_keys(self) -> List[str]:
+         return self.prompt.input_variables
+
+     @property
+     def output_keys(self) -> List[str]:
+         return [self.output_key]
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         # format the prompt
+         prompt_value = self.prompt.format_prompt(**inputs)
+         # generate response from llm
+         response = self.llm.generate_prompt(
+             [prompt_value],
+             callbacks=run_manager.get_child() if run_manager else None
+         )
+         # strip any trailing stop sequence the model may have emitted
+         for suffix in self.suffixes:
+             response.generations[0][0].text = response.generations[0][0].text.removesuffix(suffix)
+
+         return {self.output_key: response.generations[0][0].text.lstrip()}
+
+     def _call_batch(
+         self,
+         inputs: List[Dict[str, Any]],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> List[Dict[str, str]]:
+
+         prompts = []
+         for inp in inputs:
+             prompts.append(self.prompt.format_prompt(**inp))
+
+         # generate responses from llm, one per prompt
+         response = self.llm.generate_prompt(
+             prompts,
+             callbacks=run_manager.get_child() if run_manager else None
+         )
+
+         quizzs = []
+         for generation in response.generations:
+             quizzs.append({self.output_key: generation[0].text.lstrip()})
+
+         return quizzs
+
+     async def _acall(
+         self, inputs: Dict[str, Any], run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         raise NotImplementedError("Async is not supported for this chain.")
+
+     @property
+     def _chain_type(self) -> str:
+         return "open_llama_pdf_to_quizz_chain"
+
+     def predict(self, doc: str) -> Dict[str, str]:
+         out = self._call(inputs={'doc': doc})
+         return out
+
+     def predict_batch(self, docs: List[str], parsers) -> List[Dict[str, str]]:
+
+         inputs = []
+         for doc in docs:
+             inputs.append({'doc': doc})
+
+         out = self._call_batch(inputs=inputs)
+
+         ret = []
+         for resp in out:
+             try:
+                 ret.append(self.parse(resp, parsers))
+             except Exception as e:
+                 print(f"Error processing page: {str(e)}")
+                 continue
+
+         return ret
+
+     def predict_and_parse(self, doc: str, parsers) -> Dict[str, str]:
+         out = self.predict(doc)
+
+         return self.parse(out, parsers)
+
+     def parse(self, response: Dict[str, Any], parsers):
+
+         def get_parsed_value(parser, key, doc):
+             result = parser.parse(doc["text"])
+             value = result.get(key).strip()
+             return {key: value}
+
+         quizz = {}
+         for key, parser in parsers.items():
+             quizz.update(get_parsed_value(parser, key, response))
+
+         return quizz
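
A minimal sketch of the predict-and-parse flow (assuming LangChain's FakeListLLM stands in for the real model; the regex mirrors the one in quizz_generator.py):

```python
# Assumed usage sketch: drive OpenLlamaChain with a canned response and
# pull one field out of it with a RegexParser.
from langchain.llms.fake import FakeListLLM
from langchain.prompts import PromptTemplate
from langchain.output_parsers.regex import RegexParser
from openllm_chain import OpenLlamaChain

fake = FakeListLLM(responses=["question: What is 2 + 2? \n\nCHOICE_A: 4\n"])
chain = OpenLlamaChain(
    llm=fake,
    prompt=PromptTemplate(input_variables=["doc"], template="{doc}"),
)
parsers = {"question": RegexParser(regex=r"question:\s*(.*?)\s+(?:\n)+",
                                   output_keys=["question"])}
print(chain.predict_and_parse("any document text", parsers))
# -> {'question': 'What is 2 + 2?'}
```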
parser_test.py ADDED
@@ -0,0 +1,85 @@
+ from langchain.output_parsers.regex import RegexParser
+
+ def transform(input_list):
+     new_list = []
+     for key in input_list:
+         if 'question1' in key or 'question2' in key:
+             question_dict = {}
+             question_num = key[-1]
+             question_dict['question'] = input_list[key]
+             question_dict['A'] = input_list[f'A_{question_num}']
+             question_dict['B'] = input_list[f'B_{question_num}']
+             question_dict['C'] = input_list[f'C_{question_num}']
+             question_dict['D'] = input_list[f'D_{question_num}']
+             question_dict['reponse'] = input_list[f'reponse{question_num}']
+             new_list.append(question_dict)
+     return new_list
+
+ # Define input string to parse
+ #input_string = "Question 1: What is the conclusion of the study regarding the use of pretrained weights on 2D-Slice models with ResNet encoders initialized with ImageNet-1K pretrained weights for 3D Deep Neuroimaging?\nCHOIX_A: Pretrained weights consistently underperforms random initialization\nCHOIX_B: Pretrained weights consistently outperforms random initialization\nCHOIX_C: Pretrained weights have no effect on the performance of the models\nCHOIX_D: The study did not test the use of pretrained weights on 2D-Slice models\n\nRéponse: B\n\nQuestion 2: What is the main hypothesis that the study validates?\nCHOIX_A: Models trained on natural images (2D) cannot be helpful for neuroimaging tasks\nCHOIX_B: Models trained on natural images (2D) can be helpful for neuroimaging tasks\nCHOIX_C: 2D-Slice-CNNs cannot be used for neuroimaging tasks\nCHOIX_D: 2D-Slice-CNNs are the only models that can be used for neuroimaging tasks\n\nRéponse: B"
+ # doc = '''question : What was the reason for not asking for the LLM-based condition to show its work in the preliminary work on the paper?
+
+ # CHOICE_A: The author thought it would increase the likelihood of transcribing the wrong answer.
+ # CHOICE_B: The author wanted to avoid confusing the participant with a lot of numbers.
+ # CHOICE_C: The author believed that precise probabilities had nothing to do with the problem.
+ # CHOICE_D: The author wanted to use a meta-prompt that didn't require determining precise probabilities.
+
+ # reponse: B
+ # '''
+
+ doc = 'question: What is the purpose of the get_parsed_value function in the given document?\r\n CHOICE_A: To parse the value based on the given parser and document.\r\n CHOICE_B: To merge the parsed values into the quizz dictionary.\r\n CHOICE_C: To create a new dictionary called parsers.\r\n CHOICE_D: To define a new function called update method.\r\nreponse: A\r\n\r\r'
+
+ parsers = {
+     "question": RegexParser(
+         #regex=r"question\s+:\s+\n?(.*?)(?:\n)+",
+         regex=r"question:\s*(.*?)\s+(?:\n)+",
+         output_keys=["question"]
+     ),
+     "A": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_A:(.*?)\n+",
+         output_keys=["A"]
+     ),
+     "B": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_B:(.*?)\n+",
+         output_keys=["B"]
+     ),
+     "C": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_C:(.*?)\n+",
+         output_keys=["C"]
+     ),
+     "D": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_D:(.*?)\n+",
+         output_keys=["D"]
+     ),
+     "reponse": RegexParser(
+         regex=r"(?:\n)+reponse:\s?(.*)",
+         output_keys=["reponse"]
+     )
+ }
+
+ def get_parsed_value(parser, key, doc):
+     result = parser.parse(doc)
+     value = result.get(key).strip()
+     return {key: value}
+
+ quizz = {}
+ for key, parser in parsers.items():
+     quizz.update(get_parsed_value(parser, key, doc))
+
+ quizz_list = [quizz]
+
+ output_parser = RegexParser(
+     regex=r"question\s?\d?:\s+\n?(.*?)\n\s*CHOICE_A(.*?)\n\s*CHOICE_B(.*?)\n\s*CHOICE_C(.*?)\n\s*CHOICE_D(.*?)(?:\n)+reponse:\s?(.*)",
+     output_keys=["question1", "A_1", "B_1", "C_1", "D_1", "reponse1"]
+ )
+
+ # Use the RegexParser to parse the input string
+ output_dict = transform(output_parser.parse(doc))
+
+ # Print the parsed output
+ print(output_dict)
pdf_to_quizz.py ADDED
@@ -0,0 +1,28 @@
+ from langchain.document_loaders import PyPDFLoader
+ from quizz_generator import generate_quizz
+ from langchain.text_splitter import NLTKTextSplitter
+ import nltk
+ from typing import List
+
+ nltk.download('punkt')
+
+ def pdf_to_quizz(pdf_file_name):
+
+     loader = PyPDFLoader(pdf_file_name)
+
+     # split the PDF into ~700-character chunks on sentence boundaries
+     docs = loader.load_and_split(NLTKTextSplitter(chunk_size=700, chunk_overlap=0))
+     paragraphs = list(map(lambda doc: doc.page_content.replace("\n", " ").strip(), docs))
+
+     # cap the quiz at the first 10 chunks
+     batch_paragraph: List[str] = paragraphs[:10]
+
+     return generate_quizz(batch_paragraph)
+
+ # def process_paragraph(paragraph):
+ #     return generate_quizz(paragraph)
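
A minimal sketch of calling it (assuming sample.pdf exists locally and the 13B model is available):

```python
# Assumed usage sketch: build quiz questions from the first chunks of a PDF.
from pdf_to_quizz import pdf_to_quizz

for q in pdf_to_quizz("sample.pdf"):
    print(q["question"], "->", q["reponse"])
```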
qa_llm.py ADDED
@@ -0,0 +1,64 @@
+ from langchain.llms import HuggingFacePipeline
+ import torch
+ from torch import cuda
+ from transformers import StoppingCriteria, StoppingCriteriaList
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # define custom stopping criteria object
+ class StopOnTokens(StoppingCriteria):
+
+     def __init__(self, tokenizer):
+
+         device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+         # stop sequences: end-of-sequence plus role markers; the last entry
+         # rebuilds a marker from its raw token id (9427) followed by ':'
+         self.stop_token_ids = [
+             tokenizer.convert_tokens_to_ids(x) for x in [
+                 ['</s>'], ['User', ':'], ['system', ':'],
+                 [tokenizer.convert_ids_to_tokens([9427])[0], ':']
+             ]
+         ]
+
+         # We also need to convert these to `LongTensor` objects:
+         self.stop_token_ids = [torch.LongTensor(x).to(device) for x in self.stop_token_ids]
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         # stop as soon as the generated sequence ends with any stop sequence
+         for stop_ids in self.stop_token_ids:
+             if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
+                 return True
+         return False
+
+ class QaLlm():
+
+     def __init__(self) -> None:
+
+         device = 'cuda:0'
+
+         model = AutoModelForCausalLM.from_pretrained(
+             'fbellame/pdf_to_quizz_llama_13B',
+             device_map={"": device},
+             load_in_4bit=True
+         )
+
+         tokenizer = AutoTokenizer.from_pretrained("fbellame/pdf_to_quizz_llama_13B", use_fast=False)
+
+         stopping_criteria = StoppingCriteriaList([StopOnTokens(tokenizer)])
+
+         generate_text = transformers.pipeline(
+             model=model, tokenizer=tokenizer,
+             return_full_text=True,  # langchain expects the full text
+             task='text-generation',
+             device_map={"": device},
+             # we pass model parameters here too
+             stopping_criteria=stopping_criteria,  # without this the model will ramble
+             temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
+             top_p=0.15,  # select from top tokens whose probability adds up to 15%
+             top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
+             max_new_tokens=500,  # max number of tokens to generate in the output
+             repetition_penalty=1.2  # without this the output begins repeating
+         )
+
+         self.llm = HuggingFacePipeline(pipeline=generate_text)
+
+     def get_llm(self):
+         return self.llm
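
A minimal sketch of using it (assuming a CUDA GPU with enough memory for the 4-bit 13B weights):

```python
# Assumed usage sketch: load the pipeline-backed LLM once, then prompt it.
from qa_llm import QaLlm

llm = QaLlm().get_llm()
print(llm("Question: What is the capital of France?\nAnswer:"))
```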
qcm_chain.py ADDED
@@ -0,0 +1,36 @@
+ """LLM Chain specifically for generating examples for QCM (Questions à Choix Multiples) answering."""
+ from __future__ import annotations
+
+ from typing import Any
+
+ from openllm_chain import OpenLlamaChain
+ from langchain.llms.base import BaseLLM
+
+ from langchain.prompts import PromptTemplate
+
+ #instruction = """You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document. Example question:\nQuestion: question here\nCHOICE_A: choice here\nCHOICE_B: choice here\nCHOICE_C: choice here\nCHOICE_D: choice here\Answer: A or B or C or D\<Begin Document>\n{doc}\n<End Document>"""
+ #template = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:"
+
+ template = """<|prompt|>You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document.
+ Example question:
+ Question: question here
+ CHOICE_A: choice here
+ CHOICE_B: choice here
+ CHOICE_C: choice here
+ CHOICE_D: choice here
+ Answer: A or B or C or D
+ <Begin Document>
+ {doc}
+ <End Document></s><|answer|>"""
+
+
+ PROMPT = PromptTemplate(
+     input_variables=["doc"], template=template)
+
+ class QCMGenerateChain(OpenLlamaChain):
+     """LLM Chain specifically for generating examples for QCM answering."""
+
+     @classmethod
+     def from_llm(cls, llm: BaseLLM, **kwargs: Any) -> QCMGenerateChain:
+         """Load QA Generate Chain from LLM."""
+         return cls(llm=llm, prompt=PROMPT, **kwargs)
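
A minimal sketch of the chain in use (assuming the model from qa_llm.py is available):

```python
# Assumed usage sketch: build the QCM chain and generate one raw quiz block.
from qa_llm import QaLlm
from qcm_chain import QCMGenerateChain

chain = QCMGenerateChain.from_llm(QaLlm().get_llm())
out = chain.predict("The Eiffel Tower is in Paris and was completed in 1889.")
print(out["text"])  # raw Question / CHOICE_A..D / answer text, still unparsed
```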
quizz_generator.py ADDED
@@ -0,0 +1,54 @@
+ from qcm_chain import QCMGenerateChain
+ from qa_llm import QaLlm
+ from langchain.output_parsers.regex import RegexParser
+ from typing import List
+
+ # one RegexParser per field to extract from the raw model output
+ parsers = {
+     "question": RegexParser(
+         regex=r"question:\s*(.*?)\s+(?:\n)+",
+         output_keys=["question"]
+     ),
+     "A": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_A:(.*?)\n+",
+         output_keys=["A"]
+     ),
+     "B": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_B:(.*?)\n+",
+         output_keys=["B"]
+     ),
+     "C": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_C:(.*?)\n+",
+         output_keys=["C"]
+     ),
+     "D": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_D:(.*?)\n+",
+         output_keys=["D"]
+     ),
+     "reponse": RegexParser(
+         regex=r"(?:\n)+reponse:\s?(.*)",
+         output_keys=["reponse"]
+     )
+ }
+
+ qa_llm = QaLlm()
+ qa_chain = QCMGenerateChain.from_llm(qa_llm.get_llm())
+
+ def llm_call(qa_chain: QCMGenerateChain, texts: List[str]):
+
+     print("llm call running...")
+     batch_examples = qa_chain.predict_batch(texts, parsers)
+     print("llm call done.")
+
+     return batch_examples
+
+ def generate_quizz(contents: List[str]):
+     """
+     Generates a quizz from the given contents (one passage per entry).
+     """
+     # predict_batch wraps each passage into {"doc": ...} itself,
+     # so pass the raw strings through
+     return llm_call(qa_chain, contents)
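
A minimal sketch of calling the generator directly (assuming the fine-tuned model is available):

```python
# Assumed usage sketch: one parsed quiz dict per input passage.
from quizz_generator import generate_quizz

passages = ["The Eiffel Tower is in Paris and was completed in 1889."]
for quizz in generate_quizz(passages):
    print(quizz["question"], "->", quizz["reponse"])
```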
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ langchain==0.0.212
+ pypdf==3.7.0
+ streamlit==1.21.0
+ fpdf==1.7.2
+ nltk==3.8.1
+ beautifulsoup4==4.12.2
+ torch==2.0.1
+ torchaudio==2.0.2
+ torchvision==0.15.2
+ transformers==4.30.2
+ accelerate==0.20.3
+ xformers==0.0.20
+ sentencepiece==0.1.99
+ bitsandbytes==0.40.0
text_to_quizz.py ADDED
@@ -0,0 +1,8 @@
+ from quizz_generator import generate_quizz
+
+ def txt_to_quizz(content):
+
+     # generate_quizz expects a list of passages, so wrap the single text
+     quizz = generate_quizz([content])
+
+     return quizz
train_llm.csv ADDED
@@ -0,0 +1,2 @@
+ prompt,answer
+ "You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document.\n\nExample question:\n\nQuestion: question here\nCHOICE_A: choice here\nCHOICE_B: choice here\nCHOICE_C: choice here\nCHOICE_D: choice here\nAnswer: A or B or C or D\n\nThese questions should be detailed and solely based on the information provided in the document.\n\n<Begin Document>\nAbstract\nLLM-powered chatbots are becoming widely\nadopted in applications such as healthcare, personal assistants, industry hiring decisions, etc.\nIn many of these cases, chatbots are fed sensitive, personal information in their prompts,\nas samples for in-context learning, retrieved\nrecords from a database or as part of the conversation. The information provided in the\nprompt could directly appear in the output,\nwhich might have privacy ramifications if there\nis sensitive information there. As such, in this\npaper, we aim to understand the input copying\nand regurgitation capabilities of these models during inference and how they can be directly instructed to limit this copying by complying with regulations such as HIPAA and\nGDPR, based on their internal knowledge of\nthem. More specifically, we find that when\nChatGPT is prompted to summarize cover letters of a 100 candidates, it would retain personally identifiable information (PII) verbatim in\n57.4% of cases, and we find this retention to\nbe non-uniform between different subgroups\nof people, based on attributes such as gender\nidentity. We then probe ChatGPT’s perception of privacy-related policies and privatization mechanisms by directly instructing it to\nprovide compliant outputs and observe a significant omission of PII from output.\n<End Document>",Question:\nWhat is one of the concerns mentioned in the document regarding the information provided in the prompts to chatbots?\nA) The use of sensitive information in healthcare applications\nB) The potential retention of personally identifiable information (PII) in the output\nC) The impact of gender identity on chatbot performance\nD) The need for chatbots to comply with regulations such as HIPAA and GDPR\n\nAnswer: B
ui.py ADDED
@@ -0,0 +1,88 @@
+ import streamlit as st
+ from ui_utils import check_password
+ from pdf_to_quizz import pdf_to_quizz
+ from text_to_quizz import txt_to_quizz
+ from generate_pdf import generate_pdf_quiz
+ import json
+
+ import asyncio
+
+ st.title("PDF to Quiz (:-)(-: )")
+
+ def build_question(count, json_question):
+
+     if json_question.get("question") is not None:
+         st.write("Question: ", json_question.get("question", ""))
+         choices = ['A', 'B', 'C', 'D']
+         selected_answer = st.selectbox("Sélectionnez votre réponse:", choices, key=f"select_{count}")
+         for choice in choices:
+             choice_str = json_question.get(choice, "None")
+             st.write(f"{choice} : {choice_str}")
+
+         color = ""
+         if st.button("Soumettre", key=f"button_{count}"):
+             rep = json_question.get("reponse")
+             if selected_answer in rep:
+                 color = ":green"
+                 st.write(f":green[Bonne réponse: {rep}]")
+             else:
+                 color = ":red"
+                 st.write(f":red[Mauvaise réponse. La bonne réponse est {rep}.]")
+
+             st.write(f"{color}[Votre réponse: {selected_answer}]")
+
+         count += 1
+
+     return count
+
+ # Upload PDF file
+ uploaded_file = st.file_uploader(":female-student:", type=["pdf"])
+ txt = st.text_area('Tapez le texte à partir duquel vous voulez générer le quizz')
+
+ if st.button("Générer Quiz", key="button_generer"):
+     if txt:
+         with st.spinner("Génération du quizz..."):
+             st.session_state['questions'] = txt_to_quizz(txt)
+             st.write("Quizz généré avec succès!")
+
+ if uploaded_file is not None:
+     old_file_name = st.session_state.get('uploaded_file_name', None)
+     if old_file_name != uploaded_file.name:
+         # Convert PDF to text
+         with st.spinner("Génération du quizz..."):
+
+             with open(f"data/{uploaded_file.name}", "wb") as f:
+                 f.write(uploaded_file.getvalue())
+
+             # Initialize session state
+             st.session_state['uploaded_file_name'] = uploaded_file.name
+             st.session_state['questions'] = pdf_to_quizz(f"data/{uploaded_file.name}")
+
+             st.write("Quizz généré avec succès!")
+
+ if 'questions' in st.session_state:
+     # Display questions
+     count = 0
+     for json_question in st.session_state['questions']:
+
+         count = build_question(count, json_question)
+
+     # generate pdf quiz
+     if st.button("Générer PDF Quiz", key="button_generer_quiz"):
+         with st.spinner("Génération du quizz en PDF..."):
+             json_questions = st.session_state['questions']
+             # save into a file
+             file_name = uploaded_file.name
+
+             # remove extension .pdf from file name
+             if file_name.endswith(".pdf"):
+                 file_name = file_name[:-4]
+
+             with open(f"data/quiz-{file_name}.json", "w", encoding='latin-1', errors='ignore') as f:
+                 quiz_json = json.dumps(json_questions)
+                 f.write(quiz_json)
+
+             generate_pdf_quiz(f"data/quiz-{file_name}.json", json_questions)
+
+         st.write("PDF Quiz généré avec succès!")
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ def check_password():
4
+ """Returns `True` if the user had the correct password."""
5
+
6
+ def password_entered():
7
+ """Checks whether a password entered by the user is correct."""
8
+ if st.session_state["password"] == st.secrets["password"]:
9
+ st.session_state["password_correct"] = True
10
+ del st.session_state["password"] # don't store password
11
+ else:
12
+ st.session_state["password_correct"] = False
13
+
14
+ if "password_correct" not in st.session_state:
15
+ # First run, show input for password.
16
+ st.text_input(
17
+ "Password", type="password", on_change=password_entered, key="password"
18
+ )
19
+ return False
20
+ elif not st.session_state["password_correct"]:
21
+ # Password not correct, show input + error.
22
+ st.text_input(
23
+ "Password", type="password", on_change=password_entered, key="password"
24
+ )
25
+ st.error("😕 Password incorrect")
26
+ return False
27
+ else:
28
+ # Password correct.
29
+ return True
30
+
31
+ def transform(input_list):
32
+ new_list = []
33
+ for item in input_list:
34
+ for key in item:
35
+ if 'question1' in key or 'question2' in key or 'question3' in key:
36
+ question_dict = {}
37
+ question_num = key[-1]
38
+ question_dict[f'question'] = item[key]
39
+ question_dict[f'A'] = item[f'A_{question_num}']
40
+ question_dict[f'B'] = item[f'B_{question_num}']
41
+ question_dict[f'C'] = item[f'C_{question_num}']
42
+ question_dict[f'D'] = item[f'D_{question_num}']
43
+ question_dict[f'reponse'] = item[f'reponse{question_num}']
44
+ new_list.append(question_dict)
45
+ return new_list
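
check_password compares the typed value against Streamlit's secrets store; a minimal sketch of the .streamlit/secrets.toml layout it assumes (the value is illustrative):

```toml
# .streamlit/secrets.toml - read by st.secrets["password"] in check_password
password = "change-me"
```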