Spaces:
Running
Running
File size: 860 Bytes
372531f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.retrievers import ArxivRetriever
def scrape_pdf_with_pymupdf(url) -> str:
    """Load a PDF through PyMuPDF and return the result in string form.

    Args:
        url (str): Location of the PDF; passed unchanged to ``PyMuPDFLoader``.

    Returns:
        str: ``str()`` applied to whatever ``PyMuPDFLoader.load()`` returns.
            NOTE(review): in LangChain ``load()`` presumably yields a list of
            ``Document`` objects, in which case this is the list's repr
            (content plus metadata) rather than plain concatenated page
            text -- confirm that callers expect this format.
    """
    # Build the loader, load the document(s), and stringify in one step.
    return str(PyMuPDFLoader(url).load())
def scrape_pdf_with_arxiv(query) -> str:
    """Return the text of the first arXiv document retrieved for a query.

    The retriever is created with ``doc_content_chars_max=None`` so the
    document text is not length-capped. An earlier note on this function
    cited 70000 characters (roughly 15 pages) as a bounded alternative.

    Args:
        query (str): The query to search for.

    Returns:
        str: ``page_content`` of the first retrieved document, or an empty
            string when the retriever returns no documents for the query.
    """
    retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
    docs = retriever.get_relevant_documents(query=query)
    if not docs:
        # No hit for this query: report "no text" instead of letting
        # docs[0] raise IndexError.
        return ""
    # Only the top-ranked result is used, even though up to two are loaded.
    return docs[0].page_content