Nikhil2904 committed on
Commit 600c297 · verified · 1 Parent(s): ea8262e

Upload 15 files

Files changed (15)
  1. Dockerfile +24 -0
  2. README.md +43 -12
  3. callback.py +123 -0
  4. generate_pdf.py +70 -0
  5. openllm_chain.py +114 -0
  6. parser_test.py +85 -0
  7. pdf_to_quizz.py +28 -0
  8. qa_llm.py +64 -0
  9. qcm_chain.py +36 -0
  10. quizz_generator.py +54 -0
  11. requirements.txt +14 -0
  12. text_to_quizz.py +8 -0
  13. train_llm.csv +2 -0
  14. ui.py +88 -0
  15. ui_utils.py +45 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # start by pulling the python image
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # copy the requirements file into the image
+ COPY ./requirements.txt /requirements.txt
+
+ # switch working directory
+ WORKDIR /
+
+ ENV OPENAI_API_KEY=""
+
+ EXPOSE 8501
+
+ # install the dependencies and packages listed in the requirements file
+ RUN pip3 install -r requirements.txt
+
+ # copy everything from the local directory into the image
+ COPY ./ /
+
+ # run the Streamlit app when the container starts
+ ENTRYPOINT [ "streamlit", "run" ]
+ CMD [ "ui.py", "--server.headless", "true", "--server.fileWatcherType", "none", "--browser.gatherUsageStats", "false" ]
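
A minimal sketch of building and running this image (the image name and key value are illustrative, not part of the commit):

```sh
# assumed usage: build the image, then serve the Streamlit UI on port 8501
docker build -t pdf-to-quizz .
docker run -p 8501:8501 -e OPENAI_API_KEY="..." pdf-to-quizz
```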
README.md CHANGED
@@ -1,12 +1,43 @@
- ---
- title: Mcqt
- emoji: 🐨
- colorFrom: red
- colorTo: green
- sdk: streamlit
- sdk_version: 1.32.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PDF to Quiz
+
+ Upload a multi-page PDF and generate a multiple-choice quiz. Two questions are generated per page.
+
+ This leverages the LangChain library to abstract the LLM (Large Language Model) calls.
+
+ The UI is built with Streamlit.
+
+ Here is an example PDF (in French, sorry, but you get the idea...):
+
+ ![PDF sample](img/PDF-sample.png)
+
+ It will generate the following interactive quiz questions:
+
+ ![Quiz sample](img/quiz-reponse.png)
+
+ ## Prerequisites
+
+ You need a GPU to run the 13B model locally, or you can deploy it on Hugging Face, for example (it's not free!).
+
+ You can find [the model on HuggingFace](https://huggingface.co/fbellame/pdf_to_quizz_llama_13B).
+
+ The [training dataset is also available on HuggingFace](https://huggingface.co/datasets/fbellame/pdf_to_quizz_llama_13B).
+
+ A video explaining the process is also [available](https://youtu.be/gXXkLVfiBVQ) (in French, sorry).
+
+ ## Instructions
+
+ To install:
+ ```sh
+ pip install -r requirements.txt
+ ```
+
+ ## Run
+
+ To run:
+ ```sh
+ streamlit run ui.py
+ ```
callback.py ADDED
@@ -0,0 +1,123 @@
+ from langchain.callbacks.base import BaseCallbackHandler
+ from pydantic import BaseModel
+ from typing import Any, Dict, List, Union
+ from langchain.schema import AgentAction, AgentFinish, LLMResult
+
+ class BaseMyCallbackHandler(BaseModel):
+     """Base fake callback handler for testing."""
+
+     starts: int = 0
+     ends: int = 0
+     errors: int = 0
+     text: int = 0
+     ignore_llm_: bool = False
+     ignore_chain_: bool = False
+     ignore_agent_: bool = False
+     always_verbose_: bool = False
+
+     @property
+     def always_verbose(self) -> bool:
+         """Whether to call verbose callbacks even if verbose is False."""
+         return True
+
+     @property
+     def ignore_llm(self) -> bool:
+         """Whether to ignore LLM callbacks."""
+         return self.ignore_llm_
+
+     @property
+     def ignore_chain(self) -> bool:
+         """Whether to ignore chain callbacks."""
+         return self.ignore_chain_
+
+     @property
+     def ignore_agent(self) -> bool:
+         """Whether to ignore agent callbacks."""
+         return self.ignore_agent_
+
+     # finer-grained counters for easier debugging of failing tests
+     chain_starts: int = 0
+     chain_ends: int = 0
+     llm_starts: int = 0
+     llm_ends: int = 0
+     llm_streams: int = 0
+     tool_starts: int = 0
+     tool_ends: int = 0
+     agent_ends: int = 0
+
+ class MyCallbackHandler(BaseMyCallbackHandler, BaseCallbackHandler):
+     """Fake callback handler for testing."""
+
+     def on_llm_start(
+         self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+     ) -> None:
+         """Run when LLM starts running."""
+         self.llm_starts += 1
+         self.starts += 1
+         print(prompts[0])
+
+     def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+         """Run when LLM generates a new token."""
+         self.llm_streams += 1
+
+     def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+         """Run when LLM ends running."""
+         print(response)
+         self.llm_ends += 1
+         self.ends += 1
+
+     def on_llm_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when LLM errors."""
+         self.errors += 1
+
+     def on_chain_start(
+         self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
+     ) -> None:
+         """Run when chain starts running."""
+         self.chain_starts += 1
+         self.starts += 1
+
+     def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
+         """Run when chain ends running."""
+         self.chain_ends += 1
+         self.ends += 1
+
+     def on_chain_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when chain errors."""
+         self.errors += 1
+
+     def on_tool_start(
+         self, serialized: Dict[str, Any], input_str: str, **kwargs: Any
+     ) -> None:
+         """Run when tool starts running."""
+         self.tool_starts += 1
+         self.starts += 1
+
+     def on_tool_end(self, output: str, **kwargs: Any) -> None:
+         """Run when tool ends running."""
+         self.tool_ends += 1
+         self.ends += 1
+
+     def on_tool_error(
+         self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+     ) -> None:
+         """Run when tool errors."""
+         self.errors += 1
+
+     def on_text(self, text: str, **kwargs: Any) -> None:
+         """Run on arbitrary text."""
+         self.text += 1
+
+     def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None:
+         """Run when agent ends running."""
+         self.agent_ends += 1
+         self.ends += 1
+
+     def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+         """Run on agent action."""
+         self.tool_starts += 1
+         self.starts += 1
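
A minimal sketch of attaching this handler (assuming the model from qa_llm.py is available; `callbacks=` is LangChain's standard hook):

```python
# Assumed usage sketch, not part of the upload: attach the handler to an
# LLM call, then read the event counters it accumulated.
from callback import MyCallbackHandler
from qa_llm import QaLlm

handler = MyCallbackHandler()
llm = QaLlm().get_llm()
llm.generate(["What is 2 + 2?"], callbacks=[handler])
print(handler.llm_starts, handler.llm_ends)  # 1 1 on a successful call
```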
generate_pdf.py ADDED
@@ -0,0 +1,70 @@
+ import json
+ from fpdf import FPDF
+
+ class PDF(FPDF):
+     def header(self):
+         self.set_font("Arial", "B", 12)
+         self.cell(0, 10, "Questionnaire", align="C", ln=True)
+         self.cell(0, 10, "", ln=True)
+
+     def footer(self):
+         self.set_y(-15)
+         self.set_font("Arial", "I", 8)
+         page_number = f"Page {self.page_no()}"
+         self.cell(0, 10, page_number, align="C")
+
+ def generate_questions(data, pdf: PDF, print_response: bool = False):
+     pdf.add_page()
+
+     question_number = 1
+     # Add questions to the PDF
+     for question_data in data:
+         question = question_data["question"]
+         options = [
+             f"A: {question_data['A']}",
+             f"B: {question_data['B']}",
+             f"C: {question_data['C']}",
+             f"D: {question_data['D']}"
+         ]
+
+         # Add question
+         pdf.multi_cell(0, 10, f"{question_number}. {question}")
+
+         # Add options
+         for option in options:
+             pdf.multi_cell(0, 10, option)
+
+         # Add response (masked with "?" unless print_response is set)
+         response = "?"
+         if print_response:
+             response = question_data["reponse"]
+         pdf.cell(0, 10, f"Response: {response}", ln=True)
+         pdf.cell(0, 10, "", ln=True)
+         question_number += 1
+
+     pdf.add_page()
+
+ def generate_pdf(filename, json_data):
+
+     # Create PDF document
+     pdf = PDF()
+     pdf.add_page()
+
+     # Set font style and size
+     pdf.set_font("Arial", size=10)
+
+     # first pass without answers, second pass with answers
+     generate_questions(json_data, pdf, print_response=False)
+     generate_questions(json_data, pdf, print_response=True)
+
+     # Save PDF to a file
+     pdf.output(filename)
+
+ def generate_pdf_quiz(file_name, json_data):
+
+     # remove extension .json from file name
+     if file_name.endswith(".json"):
+         file_name = file_name[:-5]
+
+     # Generate PDF
+     generate_pdf(f"{file_name}.pdf", json_data)
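
A minimal sketch of calling this module with hand-written data (the sample question is illustrative):

```python
# Assumed usage sketch: render a one-question quiz to sample.pdf
# (questions-only pages first, then the same questions with answers).
from generate_pdf import generate_pdf_quiz

sample = [{
    "question": "What is 2 + 2?",
    "A": "3", "B": "4", "C": "5", "D": "22",
    "reponse": "B",
}]
generate_pdf_quiz("sample.json", sample)  # writes sample.pdf
```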
openllm_chain.py ADDED
@@ -0,0 +1,114 @@
+ from typing import Any, Dict, List, Optional
+
+ from langchain.base_language import BaseLanguageModel
+ from langchain.callbacks.manager import (
+     CallbackManagerForChainRun,
+ )
+ from langchain.chains.base import Chain
+ from langchain.prompts.base import BasePromptTemplate
+ from langchain.output_parsers.regex import RegexParser
+
+ class OpenLlamaChain(Chain):
+     prompt: BasePromptTemplate
+     llm: BaseLanguageModel
+     output_key: str = "text"
+     suffixes: List[str] = ['</s>', 'User:', 'system:', 'Assistant:']
+
+     @property
+     def input_keys(self) -> List[str]:
+         return self.prompt.input_variables
+
+     @property
+     def output_keys(self) -> List[str]:
+         return [self.output_key]
+
+     def _call(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         # format the prompt
+         prompt_value = self.prompt.format_prompt(**inputs)
+         # generate response from llm
+         response = self.llm.generate_prompt(
+             [prompt_value],
+             callbacks=run_manager.get_child() if run_manager else None
+         )
+         # strip any trailing stop sequence the model may have emitted
+         for suffix in self.suffixes:
+             response.generations[0][0].text = response.generations[0][0].text.removesuffix(suffix)
+
+         return {self.output_key: response.generations[0][0].text.lstrip()}
+
+     def _call_batch(
+         self,
+         inputs: List[Dict[str, Any]],
+         run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> List[Dict[str, str]]:
+
+         prompts = []
+         for inp in inputs:
+             prompts.append(self.prompt.format_prompt(**inp))
+
+         # generate responses from llm, one per prompt
+         response = self.llm.generate_prompt(
+             prompts,
+             callbacks=run_manager.get_child() if run_manager else None
+         )
+
+         quizzs = []
+         for generation in response.generations:
+             quizzs.append({self.output_key: generation[0].text.lstrip()})
+
+         return quizzs
+
+     async def _acall(
+         self, inputs: Dict[str, Any], run_manager: Optional[CallbackManagerForChainRun] = None,
+     ) -> Dict[str, str]:
+         raise NotImplementedError("Async is not supported for this chain.")
+
+     @property
+     def _chain_type(self) -> str:
+         return "open_llama_pdf_to_quizz_chain"
+
+     def predict(self, doc: str) -> Dict[str, str]:
+         out = self._call(inputs={'doc': doc})
+         return out
+
+     def predict_batch(self, docs: List[str], parsers) -> List[Dict[str, str]]:
+
+         inputs = []
+         for doc in docs:
+             inputs.append({'doc': doc})
+
+         out = self._call_batch(inputs=inputs)
+
+         ret = []
+         for resp in out:
+             try:
+                 ret.append(self.parse(resp, parsers))
+             except Exception as e:
+                 print(f"Error processing page: {str(e)}")
+                 continue
+
+         return ret
+
+     def predict_and_parse(self, doc: str, parsers) -> Dict[str, str]:
+         out = self.predict(doc)
+
+         return self.parse(out, parsers)
+
+     def parse(self, response: Dict[str, Any], parsers):
+
+         def get_parsed_value(parser, key, doc):
+             result = parser.parse(doc["text"])
+             value = result.get(key).strip()
+             return {key: value}
+
+         quizz = {}
+         for key, parser in parsers.items():
+             quizz.update(get_parsed_value(parser, key, response))
+
+         return quizz
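
A minimal sketch of the predict-and-parse flow (assuming LangChain's FakeListLLM stands in for the real model; the regex mirrors the one in quizz_generator.py):

```python
# Assumed usage sketch: drive OpenLlamaChain with a canned response and
# pull one field out of it with a RegexParser.
from langchain.llms.fake import FakeListLLM
from langchain.prompts import PromptTemplate
from langchain.output_parsers.regex import RegexParser
from openllm_chain import OpenLlamaChain

fake = FakeListLLM(responses=["question: What is 2 + 2? \n\nCHOICE_A: 4\n"])
chain = OpenLlamaChain(
    llm=fake,
    prompt=PromptTemplate(input_variables=["doc"], template="{doc}"),
)
parsers = {"question": RegexParser(regex=r"question:\s*(.*?)\s+(?:\n)+",
                                   output_keys=["question"])}
print(chain.predict_and_parse("any document text", parsers))
# -> {'question': 'What is 2 + 2?'}
```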
parser_test.py ADDED
@@ -0,0 +1,85 @@
+ from langchain.output_parsers.regex import RegexParser
+
+ def transform(input_list):
+     new_list = []
+     for key in input_list:
+         if 'question1' in key or 'question2' in key:
+             question_dict = {}
+             question_num = key[-1]
+             question_dict['question'] = input_list[key]
+             question_dict['A'] = input_list[f'A_{question_num}']
+             question_dict['B'] = input_list[f'B_{question_num}']
+             question_dict['C'] = input_list[f'C_{question_num}']
+             question_dict['D'] = input_list[f'D_{question_num}']
+             question_dict['reponse'] = input_list[f'reponse{question_num}']
+             new_list.append(question_dict)
+     return new_list
+
+ # Define input string to parse
+ #input_string = "Question 1: What is the conclusion of the study regarding the use of pretrained weights on 2D-Slice models with ResNet encoders initialized with ImageNet-1K pretrained weights for 3D Deep Neuroimaging?\nCHOIX_A: Pretrained weights consistently underperforms random initialization\nCHOIX_B: Pretrained weights consistently outperforms random initialization\nCHOIX_C: Pretrained weights have no effect on the performance of the models\nCHOIX_D: The study did not test the use of pretrained weights on 2D-Slice models\n\nRéponse: B\n\nQuestion 2: What is the main hypothesis that the study validates?\nCHOIX_A: Models trained on natural images (2D) cannot be helpful for neuroimaging tasks\nCHOIX_B: Models trained on natural images (2D) can be helpful for neuroimaging tasks\nCHOIX_C: 2D-Slice-CNNs cannot be used for neuroimaging tasks\nCHOIX_D: 2D-Slice-CNNs are the only models that can be used for neuroimaging tasks\n\nRéponse: B"
+ # doc = '''question : What was the reason for not asking for the LLM-based condition to show its work in the preliminary work on the paper?
+
+ # CHOICE_A: The author thought it would increase the likelihood of transcribing the wrong answer.
+ # CHOICE_B: The author wanted to avoid confusing the participant with a lot of numbers.
+ # CHOICE_C: The author believed that precise probabilities had nothing to do with the problem.
+ # CHOICE_D: The author wanted to use a meta-prompt that didn't require determining precise probabilities.
+
+ # reponse: B
+ # '''
+
+ doc = 'question: What is the purpose of the get_parsed_value function in the given document?\r\n CHOICE_A: To parse the value based on the given parser and document.\r\n CHOICE_B: To merge the parsed values into the quizz dictionary.\r\n CHOICE_C: To create a new dictionary called parsers.\r\n CHOICE_D: To define a new function called update method.\r\nreponse: A\r\n\r\r'
+
+ parsers = {
+     "question": RegexParser(
+         #regex=r"question\s+:\s+\n?(.*?)(?:\n)+",
+         regex=r"question:\s*(.*?)\s+(?:\n)+",
+         output_keys=["question"]
+     ),
+     "A": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_A:(.*?)\n+",
+         output_keys=["A"]
+     ),
+     "B": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_B:(.*?)\n+",
+         output_keys=["B"]
+     ),
+     "C": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_C:(.*?)\n+",
+         output_keys=["C"]
+     ),
+     "D": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_D:(.*?)\n+",
+         output_keys=["D"]
+     ),
+     "reponse": RegexParser(
+         regex=r"(?:\n)+reponse:\s?(.*)",
+         output_keys=["reponse"]
+     )
+ }
+
+ def get_parsed_value(parser, key, doc):
+     result = parser.parse(doc)
+     value = result.get(key).strip()
+     return {key: value}
+
+ quizz = {}
+ for key, parser in parsers.items():
+     quizz.update(get_parsed_value(parser, key, doc))
+
+ quizz_list = [quizz]
+
+ output_parser = RegexParser(
+     regex=r"question\s?\d?:\s+\n?(.*?)\n\s*CHOICE_A(.*?)\n\s*CHOICE_B(.*?)\n\s*CHOICE_C(.*?)\n\s*CHOICE_D(.*?)(?:\n)+reponse:\s?(.*)",
+     output_keys=["question1", "A_1", "B_1", "C_1", "D_1", "reponse1"]
+ )
+
+ # Use the RegexParser to parse the input string
+ output_dict = transform(output_parser.parse(doc))
+
+ # Print the parsed output
+ print(output_dict)
pdf_to_quizz.py ADDED
@@ -0,0 +1,28 @@
+ from langchain.document_loaders import PyPDFLoader
+ from quizz_generator import generate_quizz
+ from langchain.text_splitter import NLTKTextSplitter
+ import nltk
+ from typing import List
+
+ nltk.download('punkt')
+
+ def pdf_to_quizz(pdf_file_name):
+
+     loader = PyPDFLoader(pdf_file_name)
+
+     # split the PDF into ~700-character chunks on sentence boundaries
+     docs = loader.load_and_split(NLTKTextSplitter(chunk_size=700, chunk_overlap=0))
+     paragraphs = list(map(lambda doc: doc.page_content.replace("\n", " ").strip(), docs))
+
+     # cap the quiz at the first 10 chunks
+     batch_paragraph: List[str] = paragraphs[:10]
+
+     return generate_quizz(batch_paragraph)
+
+ # def process_paragraph(paragraph):
+ #     return generate_quizz(paragraph)
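
A minimal sketch of calling it (assuming sample.pdf exists locally and the 13B model is available):

```python
# Assumed usage sketch: build quiz questions from the first chunks of a PDF.
from pdf_to_quizz import pdf_to_quizz

for q in pdf_to_quizz("sample.pdf"):
    print(q["question"], "->", q["reponse"])
```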
qa_llm.py ADDED
@@ -0,0 +1,64 @@
+ from langchain.llms import HuggingFacePipeline
+ import torch
+ from torch import cuda
+ from transformers import StoppingCriteria, StoppingCriteriaList
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # define custom stopping criteria object
+ class StopOnTokens(StoppingCriteria):
+
+     def __init__(self, tokenizer):
+
+         device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+         # stop sequences: end-of-sequence plus role markers; the last entry
+         # rebuilds a marker from its raw token id (9427) followed by ':'
+         self.stop_token_ids = [
+             tokenizer.convert_tokens_to_ids(x) for x in [
+                 ['</s>'], ['User', ':'], ['system', ':'],
+                 [tokenizer.convert_ids_to_tokens([9427])[0], ':']
+             ]
+         ]
+
+         # We also need to convert these to `LongTensor` objects:
+         self.stop_token_ids = [torch.LongTensor(x).to(device) for x in self.stop_token_ids]
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         # stop as soon as the generated sequence ends with any stop sequence
+         for stop_ids in self.stop_token_ids:
+             if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
+                 return True
+         return False
+
+ class QaLlm():
+
+     def __init__(self) -> None:
+
+         device = 'cuda:0'
+
+         model = AutoModelForCausalLM.from_pretrained(
+             'fbellame/pdf_to_quizz_llama_13B',
+             device_map={"": device},
+             load_in_4bit=True
+         )
+
+         tokenizer = AutoTokenizer.from_pretrained("fbellame/pdf_to_quizz_llama_13B", use_fast=False)
+
+         stopping_criteria = StoppingCriteriaList([StopOnTokens(tokenizer)])
+
+         generate_text = transformers.pipeline(
+             model=model, tokenizer=tokenizer,
+             return_full_text=True,  # langchain expects the full text
+             task='text-generation',
+             device_map={"": device},
+             # we pass model parameters here too
+             stopping_criteria=stopping_criteria,  # without this the model will ramble
+             temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
+             top_p=0.15,  # select from top tokens whose probability adds up to 15%
+             top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
+             max_new_tokens=500,  # max number of tokens to generate in the output
+             repetition_penalty=1.2  # without this the output begins repeating
+         )
+
+         self.llm = HuggingFacePipeline(pipeline=generate_text)
+
+     def get_llm(self):
+         return self.llm
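
A minimal sketch of using it (assuming a CUDA GPU with enough memory for the 4-bit 13B weights):

```python
# Assumed usage sketch: load the pipeline-backed LLM once, then prompt it.
from qa_llm import QaLlm

llm = QaLlm().get_llm()
print(llm("Question: What is the capital of France?\nAnswer:"))
```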
qcm_chain.py ADDED
@@ -0,0 +1,36 @@
+ """LLM Chain specifically for generating examples for QCM (Questions à Choix Multiples) answering."""
+ from __future__ import annotations
+
+ from typing import Any
+
+ from openllm_chain import OpenLlamaChain
+ from langchain.llms.base import BaseLLM
+
+ from langchain.prompts import PromptTemplate
+
+ #instruction = """You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document. Example question:\nQuestion: question here\nCHOICE_A: choice here\nCHOICE_B: choice here\nCHOICE_C: choice here\nCHOICE_D: choice here\Answer: A or B or C or D\<Begin Document>\n{doc}\n<End Document>"""
+ #template = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:"
+
+ template = """<|prompt|>You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document.
+ Example question:
+ Question: question here
+ CHOICE_A: choice here
+ CHOICE_B: choice here
+ CHOICE_C: choice here
+ CHOICE_D: choice here
+ Answer: A or B or C or D
+ <Begin Document>
+ {doc}
+ <End Document></s><|answer|>"""
+
+
+ PROMPT = PromptTemplate(
+     input_variables=["doc"], template=template)
+
+ class QCMGenerateChain(OpenLlamaChain):
+     """LLM Chain specifically for generating examples for QCM answering."""
+
+     @classmethod
+     def from_llm(cls, llm: BaseLLM, **kwargs: Any) -> QCMGenerateChain:
+         """Load QA Generate Chain from LLM."""
+         return cls(llm=llm, prompt=PROMPT, **kwargs)
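
A minimal sketch of the chain in use (assuming the model from qa_llm.py is available):

```python
# Assumed usage sketch: build the QCM chain and generate one raw quiz block.
from qa_llm import QaLlm
from qcm_chain import QCMGenerateChain

chain = QCMGenerateChain.from_llm(QaLlm().get_llm())
out = chain.predict("The Eiffel Tower is in Paris and was completed in 1889.")
print(out["text"])  # raw Question / CHOICE_A..D / answer text, still unparsed
```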
quizz_generator.py ADDED
@@ -0,0 +1,54 @@
+ from qcm_chain import QCMGenerateChain
+ from qa_llm import QaLlm
+ from langchain.output_parsers.regex import RegexParser
+ from typing import List
+
+ # one RegexParser per field to extract from the raw model output
+ parsers = {
+     "question": RegexParser(
+         regex=r"question:\s*(.*?)\s+(?:\n)+",
+         output_keys=["question"]
+     ),
+     "A": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_A:(.*?)\n+",
+         output_keys=["A"]
+     ),
+     "B": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_B:(.*?)\n+",
+         output_keys=["B"]
+     ),
+     "C": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_C:(.*?)\n+",
+         output_keys=["C"]
+     ),
+     "D": RegexParser(
+         regex=r"(?:\n)+\s*CHOICE_D:(.*?)\n+",
+         output_keys=["D"]
+     ),
+     "reponse": RegexParser(
+         regex=r"(?:\n)+reponse:\s?(.*)",
+         output_keys=["reponse"]
+     )
+ }
+
+ qa_llm = QaLlm()
+ qa_chain = QCMGenerateChain.from_llm(qa_llm.get_llm())
+
+ def llm_call(qa_chain: QCMGenerateChain, texts: List[str]):
+
+     print("llm call running...")
+     batch_examples = qa_chain.predict_batch(texts, parsers)
+     print("llm call done.")
+
+     return batch_examples
+
+ def generate_quizz(contents: List[str]):
+     """
+     Generates a quizz from the given contents (one passage per entry).
+     """
+     # predict_batch wraps each passage into {"doc": ...} itself,
+     # so pass the raw strings through
+     return llm_call(qa_chain, contents)
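
A minimal sketch of calling the generator directly (assuming the fine-tuned model is available):

```python
# Assumed usage sketch: one parsed quiz dict per input passage.
from quizz_generator import generate_quizz

passages = ["The Eiffel Tower is in Paris and was completed in 1889."]
for quizz in generate_quizz(passages):
    print(quizz["question"], "->", quizz["reponse"])
```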
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ langchain==0.0.212
+ pypdf==3.7.0
+ streamlit==1.21.0
+ fpdf==1.7.2
+ nltk==3.8.1
+ beautifulsoup4==4.12.2
+ torch==2.0.1
+ torchaudio==2.0.2
+ torchvision==0.15.2
+ transformers==4.30.2
+ accelerate==0.20.3
+ xformers==0.0.20
+ sentencepiece==0.1.99
+ bitsandbytes==0.40.0
text_to_quizz.py ADDED
@@ -0,0 +1,8 @@
+ from quizz_generator import generate_quizz
+
+ def txt_to_quizz(content):
+
+     # generate_quizz expects a list of passages, so wrap the single text
+     quizz = generate_quizz([content])
+
+     return quizz
train_llm.csv ADDED
@@ -0,0 +1,2 @@
+ prompt,answer
+ "You are a teacher preparing questions for a quiz. Given the following document, please generate 1 multiple-choice questions (MCQs) with 4 options and a corresponding answer letter based on the document.\n\nExample question:\n\nQuestion: question here\nCHOICE_A: choice here\nCHOICE_B: choice here\nCHOICE_C: choice here\nCHOICE_D: choice here\nAnswer: A or B or C or D\n\nThese questions should be detailed and solely based on the information provided in the document.\n\n<Begin Document>\nAbstract\nLLM-powered chatbots are becoming widely\nadopted in applications such as healthcare, personal assistants, industry hiring decisions, etc.\nIn many of these cases, chatbots are fed sensitive, personal information in their prompts,\nas samples for in-context learning, retrieved\nrecords from a database or as part of the conversation. The information provided in the\nprompt could directly appear in the output,\nwhich might have privacy ramifications if there\nis sensitive information there. As such, in this\npaper, we aim to understand the input copying\nand regurgitation capabilities of these models during inference and how they can be directly instructed to limit this copying by complying with regulations such as HIPAA and\nGDPR, based on their internal knowledge of\nthem. More specifically, we find that when\nChatGPT is prompted to summarize cover letters of a 100 candidates, it would retain personally identifiable information (PII) verbatim in\n57.4% of cases, and we find this retention to\nbe non-uniform between different subgroups\nof people, based on attributes such as gender\nidentity. We then probe ChatGPT’s perception of privacy-related policies and privatization mechanisms by directly instructing it to\nprovide compliant outputs and observe a significant omission of PII from output.\n<End Document>",Question:\nWhat is one of the concerns mentioned in the document regarding the information provided in the prompts to chatbots?\nA) The use of sensitive information in healthcare applications\nB) The potential retention of personally identifiable information (PII) in the output\nC) The impact of gender identity on chatbot performance\nD) The need for chatbots to comply with regulations such as HIPAA and GDPR\n\nAnswer: B
ui.py ADDED
@@ -0,0 +1,88 @@
+ import streamlit as st
+ from ui_utils import check_password
+ from pdf_to_quizz import pdf_to_quizz
+ from text_to_quizz import txt_to_quizz
+ from generate_pdf import generate_pdf_quiz
+ import json
+
+ import asyncio
+
+ st.title("PDF to Quiz (:-)(-: )")
+
+ def build_question(count, json_question):
+
+     if json_question.get("question") is not None:
+         st.write("Question: ", json_question.get("question", ""))
+         choices = ['A', 'B', 'C', 'D']
+         selected_answer = st.selectbox("Sélectionnez votre réponse:", choices, key=f"select_{count}")
+         for choice in choices:
+             choice_str = json_question.get(choice, "None")
+             st.write(f"{choice} : {choice_str}")
+
+         color = ""
+         if st.button("Soumettre", key=f"button_{count}"):
+             rep = json_question.get("reponse")
+             if selected_answer in rep:
+                 color = ":green"
+                 st.write(f":green[Bonne réponse: {rep}]")
+             else:
+                 color = ":red"
+                 st.write(f":red[Mauvaise réponse. La bonne réponse est {rep}.]")
+
+             st.write(f"{color}[Votre réponse: {selected_answer}]")
+
+         count += 1
+
+     return count
+
+ # Upload PDF file
+ uploaded_file = st.file_uploader(":female-student:", type=["pdf"])
+ txt = st.text_area('Tapez le texte à partir duquel vous voulez générer le quizz')
+
+ if st.button("Générer Quiz", key="button_generer"):
+     if txt:
+         with st.spinner("Génération du quizz..."):
+             st.session_state['questions'] = txt_to_quizz(txt)
+             st.write("Quizz généré avec succès!")
+
+ if uploaded_file is not None:
+     old_file_name = st.session_state.get('uploaded_file_name', None)
+     if old_file_name != uploaded_file.name:
+         # Convert PDF to text
+         with st.spinner("Génération du quizz..."):
+
+             with open(f"data/{uploaded_file.name}", "wb") as f:
+                 f.write(uploaded_file.getvalue())
+
+             # Initialize session state
+             st.session_state['uploaded_file_name'] = uploaded_file.name
+             st.session_state['questions'] = pdf_to_quizz(f"data/{uploaded_file.name}")
+
+             st.write("Quizz généré avec succès!")
+
+ if 'questions' in st.session_state:
+     # Display questions
+     count = 0
+     for json_question in st.session_state['questions']:
+
+         count = build_question(count, json_question)
+
+     # generate pdf quiz
+     if st.button("Générer PDF Quiz", key="button_generer_quiz"):
+         with st.spinner("Génération du quizz en PDF..."):
+             json_questions = st.session_state['questions']
+             # save into a file
+             file_name = uploaded_file.name
+
+             # remove extension .pdf from file name
+             if file_name.endswith(".pdf"):
+                 file_name = file_name[:-4]
+
+             with open(f"data/quiz-{file_name}.json", "w", encoding='latin-1', errors='ignore') as f:
+                 quiz_json = json.dumps(json_questions)
+                 f.write(quiz_json)
+
+             generate_pdf_quiz(f"data/quiz-{file_name}.json", json_questions)
+
+         st.write("PDF Quiz généré avec succès!")
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ def check_password():
4
+ """Returns `True` if the user had the correct password."""
5
+
6
+ def password_entered():
7
+ """Checks whether a password entered by the user is correct."""
8
+ if st.session_state["password"] == st.secrets["password"]:
9
+ st.session_state["password_correct"] = True
10
+ del st.session_state["password"] # don't store password
11
+ else:
12
+ st.session_state["password_correct"] = False
13
+
14
+ if "password_correct" not in st.session_state:
15
+ # First run, show input for password.
16
+ st.text_input(
17
+ "Password", type="password", on_change=password_entered, key="password"
18
+ )
19
+ return False
20
+ elif not st.session_state["password_correct"]:
21
+ # Password not correct, show input + error.
22
+ st.text_input(
23
+ "Password", type="password", on_change=password_entered, key="password"
24
+ )
25
+ st.error("😕 Password incorrect")
26
+ return False
27
+ else:
28
+ # Password correct.
29
+ return True
30
+
31
+ def transform(input_list):
32
+ new_list = []
33
+ for item in input_list:
34
+ for key in item:
35
+ if 'question1' in key or 'question2' in key or 'question3' in key:
36
+ question_dict = {}
37
+ question_num = key[-1]
38
+ question_dict[f'question'] = item[key]
39
+ question_dict[f'A'] = item[f'A_{question_num}']
40
+ question_dict[f'B'] = item[f'B_{question_num}']
41
+ question_dict[f'C'] = item[f'C_{question_num}']
42
+ question_dict[f'D'] = item[f'D_{question_num}']
43
+ question_dict[f'reponse'] = item[f'reponse{question_num}']
44
+ new_list.append(question_dict)
45
+ return new_list
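
check_password compares the typed value against Streamlit's secrets store; a minimal sketch of the .streamlit/secrets.toml layout it assumes (the value is illustrative):

```toml
# .streamlit/secrets.toml - read by st.secrets["password"] in check_password
password = "change-me"
```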