Thomas (Tom) Gardos committed on
Commit 1d7972f · 2 Parent(s): b3bf29c 9a544d2

Merge pull request #22 from DL4DS/text_extraction

code/modules/config/config.yml CHANGED
@@ -34,6 +34,7 @@ llm_params:
   local_llm_params:
     model: 'tiny-llama'
     temperature: 0.7
+  pdf_reader: 'llama' # str [llama, pymupdf]
 
 chat_logging:
   log_chat: False # bool
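
The new `pdf_reader` key selects which PDF extraction path the data loader uses. A minimal sketch of reading this value and branching on it (the file path and print statements are illustrative; the actual wiring is `FileReader(..., kind=config["llm_params"]["pdf_reader"])` in `data_loader.py` below):

```python
import yaml

# Sketch: read config.yml and branch on llm_params.pdf_reader ('llama' or 'pymupdf').
with open("code/modules/config/config.yml", "r") as f:
    config = yaml.safe_load(f)

kind = config["llm_params"]["pdf_reader"]
if kind == "llama":
    print("PDFs will be parsed with LlamaParse (LlamaParser)")
else:
    print("PDFs will be parsed with the local PDFReader loader")
```
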
code/modules/config/constants.py CHANGED
@@ -6,6 +6,7 @@ load_dotenv()
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 LITERAL_API_KEY = os.getenv("LITERAL_API_KEY")
 
@@ -14,7 +15,8 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
 # Prompt Templates
 
 openai_prompt_template = """Use the following pieces of information to answer the user's question.
-If you don't know the answer, just say that you don't know.
+You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs, and explain the parameters and variables in the equations.
+If you don't know the answer, just say that you don't know.
 
 Context: {context}
 Question: {question}
@@ -24,7 +26,10 @@ Helpful answer:
 """
 
 openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
+You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs.
+
 If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
 Use the history to answer the question if you can.
 Chat History:
 {chat_history}
@@ -37,7 +42,7 @@ Helpful answer:
 
 tinyllama_prompt_template = """
 <|im_start|>system
-Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question. Use the history to answer the question if you can.
+Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. When asked for formulas, give a brief description of the formula and output math equations in LaTeX format between $ signs.
 
 Context:
 {context}
@@ -56,7 +61,7 @@ Question: {question}
 
 tinyllama_prompt_template_with_history = """
 <|im_start|>system
-Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question.
+Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. Output math equations in LaTeX format between $ signs. Use the history to answer the question if you can.
 
 Chat History:
 {chat_history}
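
`constants.py` now expects a `LLAMA_CLOUD_API_KEY` entry in the same `.env` file that supplies the other keys. A quick sketch for checking that every key the module reads is actually set (variable names match the diff; the check script itself is illustrative):

```python
import os
from dotenv import load_dotenv

# Sketch: confirm the keys constants.py reads via os.getenv are present in .env.
# os.getenv returns None for a missing key, so absent entries show up as MISSING.
load_dotenv()

for key in ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY", "HUGGINGFACE_TOKEN", "LITERAL_API_KEY"):
    print(f"{key}: {'set' if os.getenv(key) else 'MISSING'}")
```
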
code/modules/dataloader/data_loader.py CHANGED
@@ -20,10 +20,24 @@ from langchain_community.llms import OpenAI
 from langchain import PromptTemplate
 import json
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urljoin
+import html2text
+import bs4
+import tempfile
+import PyPDF2
 
-from modules.dataloader.helpers import get_metadata
+try:
+    from modules.dataloader.helpers import get_metadata
+    from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
 
 
+except:
+    from dataloader.helpers import get_metadata
+    from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+
+logger = logging.getLogger(__name__)
+BASE_DIR = os.getcwd()
+
 class PDFReader:
     def __init__(self):
         pass
@@ -35,11 +49,132 @@ class PDFReader:
     def get_documents(self, loader):
         return loader.load()
 
+class LlamaParser:
+    def __init__(self):
+        self.GPT_API_KEY = OPENAI_API_KEY
+        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
+        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
+        self.headers = {
+            'Accept': 'application/json',
+            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
+        }
+        self.parser = LlamaParse(
+            api_key=LLAMA_CLOUD_API_KEY,
+            result_type="markdown",
+            verbose=True,
+            language="en",
+            gpt4o_mode=False,
+            # gpt4o_api_key=OPENAI_API_KEY,
+            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
+        )
+
+    def parse(self, pdf_path):
+        pdf_name = os.path.basename(pdf_path)
+
+        documents = self.parser.load_data(pdf_path)
+        documents = [document.to_langchain_format() for document in documents]
+
+        os.remove(pdf_path) # cleanup, just in case
+        return documents
+
+    def make_request(self, pdf_url):
+        payload = {
+            "gpt4o_mode": "false",
+            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
+        }
+
+        files = [
+            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
+        ]
+
+        response = requests.request(
+            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
+
+        return response.json()['id'], response.json()['status']
+
+    async def get_result(self, job_id):
+        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+
+        response = requests.request("GET", url, headers=self.headers, data={})
+
+        return response.json()['markdown']
+
+    async def _parse(self, pdf_path):
+        job_id, status = self.make_request(pdf_path)
+
+        while status != "SUCCESS":
+            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
+            response = requests.request("GET", url, headers=self.headers, data={})
+            status = response.json()["status"]
+
+        result = await self.get_result(job_id)
+
+        documents = [
+            Document(
+                page_content=result,
+                metadata={"source": pdf_path}
+            )
+        ]
+
+        return documents
+
+    async def _parse(self, pdf_path):
+        return await self._parse(pdf_path)
+
+class HTMLReader:
+    def __init__(self):
+        pass
+
+    def read_url(self, url):
+        response = requests.get(url)
+        if response.status_code == 200:
+            return response.text
+        else:
+            logger.warning(f"Failed to download HTML from URL: {url}")
+            return None
+
+    def check_links(self, base_url, html_content):
+        soup = bs4.BeautifulSoup(html_content, "html.parser")
+        for link in soup.find_all("a"):
+            href = link.get("href")
+
+            if not href or href.startswith("#"):
+                continue
+            elif not href.startswith("https"):
+                href = href.replace("http", "https")
+
+            absolute_url = urljoin(base_url, href)
+            link['href'] = absolute_url
+
+            resp = requests.head(absolute_url)
+            if resp.status_code != 200:
+                logger.warning(f"Link {absolute_url} is broken")
+                logger.warning(f"Status code: {resp.status_code}")
+
+        return str(soup)
+
+    def html_to_md(self, url, html_content):
+        html_processed = self.check_links(url, html_content)
+        markdown_content = html2text.html2text(html_processed)
+        return markdown_content
+
+    def read_html(self, url):
+        html_content = self.read_url(url)
+        if html_content:
+            return self.html_to_md(url, html_content)
+        else:
+            return None
 
 class FileReader:
-    def __init__(self, logger):
-        self.pdf_reader = PDFReader()
+    def __init__(self, logger, kind):
         self.logger = logger
+        self.kind = kind
+        if kind == "llama":
+            self.pdf_reader = LlamaParser()
+        else:
+            self.pdf_reader = PDFReader()
+        self.web_reader = HTMLReader()
+
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -51,7 +186,8 @@ class FileReader:
                 text += page.extract_text()
         return text
 
-    def download_pdf_from_url(self, pdf_url):
+    @staticmethod
+    def download_pdf_from_url(pdf_url):
         response = requests.get(pdf_url)
        if response.status_code == 200:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
@@ -63,8 +199,11 @@ class FileReader:
             return None
 
     def read_pdf(self, temp_file_path: str):
-        loader = self.pdf_reader.get_loader(temp_file_path)
-        documents = self.pdf_reader.get_documents(loader)
+        if self.kind == "llama":
+            documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
+        else:
+            loader = self.pdf_reader.get_loader(temp_file_path)
+            documents = self.pdf_reader.get_documents(loader)
         return documents
 
     def read_txt(self, temp_file_path: str):
@@ -179,7 +318,6 @@ class ChunkProcessor:
             "https://dl4ds.github.io/sp2024/lectures/",
             "https://dl4ds.github.io/sp2024/schedule/",
         ) # For any additional metadata
-
        with ThreadPoolExecutor() as executor:
             executor.map(
                 self.process_file,
@@ -250,11 +388,17 @@ class ChunkProcessor:
 
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
         file_name = os.path.basename(file_path)
+        storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
+        local_path = os.path.join(storage_dir, file_name)
+
+        if not os.path.exists(local_path):
+            local_path = FileReader.download_pdf_from_url(pdf_url=file_path)
+
         if file_name in self.document_data:
             return
 
         file_type = file_name.split(".")[-1].lower()
-        self.logger.info(f"Reading file {file_index + 1}: {file_path}")
+        self.logger.info(f"Reading file {file_index + 1}: {local_path}")
 
         read_methods = {
             "pdf": file_reader.read_pdf,
@@ -268,9 +412,9 @@ class ChunkProcessor:
             return
 
         try:
-            documents = read_methods[file_type](file_path)
+            documents = read_methods[file_type](local_path)
             self.process_documents(
-                documents, file_path, file_type, "file", addl_metadata
+                documents, local_path, file_type, "file", addl_metadata
             )
         except Exception as e:
             self.logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -330,7 +474,7 @@ class ChunkProcessor:
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(logger=logger)
+        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -348,10 +492,15 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
+    STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
+    uploaded_files = [
+        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
+    ]
+
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            [],
+            uploaded_files,
             ["https://dl4ds.github.io/sp2024/"],
        )
    )
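
`read_pdf` calls the synchronous `LlamaParser.parse`, which goes through the `llama_parse` SDK; the `make_request` / `get_result` methods sketch an alternative upload-poll-fetch flow against the LlamaIndex Cloud parsing REST API. Below is a self-contained sketch of that REST flow. The wrapper name `parse_pdf_via_rest` is an assumption (in the diff both async entry points are named `_parse`), a short sleep is added between polls, and the example URL is a placeholder; the endpoint URLs and JSON fields are the ones used above:

```python
import asyncio
import requests

LLAMA_CLOUD_API_KEY = "..."  # placeholder; the real value comes from .env via constants.py
HEADERS = {
    "Accept": "application/json",
    "Authorization": f"Bearer {LLAMA_CLOUD_API_KEY}",
}
BASE = "https://api.cloud.llamaindex.ai/api/parsing"

async def parse_pdf_via_rest(pdf_url: str) -> str:
    # Upload the PDF (mirrors LlamaParser.make_request).
    files = [("file", ("file", requests.get(pdf_url).content, "application/octet-stream"))]
    upload = requests.post(f"{BASE}/upload", headers=HEADERS,
                           data={"gpt4o_mode": "false"}, files=files)
    job_id, status = upload.json()["id"], upload.json()["status"]

    # Poll the job until it reports SUCCESS, yielding between checks.
    while status != "SUCCESS":
        await asyncio.sleep(1)
        status = requests.get(f"{BASE}/job/{job_id}", headers=HEADERS).json()["status"]

    # Fetch the markdown result (mirrors LlamaParser.get_result).
    result = requests.get(f"{BASE}/job/{job_id}/result/markdown", headers=HEADERS)
    return result.json()["markdown"]

# Example (placeholder URL):
# markdown = asyncio.run(parse_pdf_via_rest("https://example.com/lecture.pdf"))
```
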
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
             )
             for link in unchecked_links:
                 dict_links[link] = "Checked"
-                print(f"Checked: {link}")
             dict_links.update(
                 {
                     link: "Not-checked"