# File size: 2,287 Bytes
# 372531f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import requests
import tempfile
from urllib.parse import urlparse
from langchain_community.document_loaders import PyMuPDFLoader


class PyMuPDFScraper:
    """Load a PDF from a URL or a local file path and return it as a string.

    A remote document is first streamed into a temporary ``.pdf`` file, which
    is then handed to ``PyMuPDFLoader``. The temporary file is removed again
    whether or not the download and the load succeed.
    """

    # Timeout, in seconds, handed to the HTTP request.
    DOWNLOAD_TIMEOUT = 5
    # Chunk size, in bytes, requested while streaming the download to disk.
    DOWNLOAD_CHUNK_SIZE = 8192

    def __init__(self, link: str, session=None):
        """
        Initialize the scraper with a link and an optional session.

        Args:
          link (str): The URL or local file path of the PDF document.
          session (requests.Session, optional): Session used for the HTTP
            download. When omitted, the module-level ``requests`` API is used.
        """
        self.link = link
        self.session = session

    def is_url(self) -> bool:
        """
        Check if the provided `link` is a valid URL.

        Returns:
          bool: True if the link has both a scheme and a network location,
            False otherwise (including when the link cannot be parsed).
        """
        try:
            parsed = urlparse(self.link)
        except Exception:
            # An unparsable link is simply "not a URL"; the caller then
            # treats it as a local path.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def _load_remote(self):
        """
        Download `link` into a temporary PDF file and load it.

        Returns:
          The value returned by ``PyMuPDFLoader(...).load()``.

        Raises:
          Whatever the HTTP request, the file write or the loader raises; the
          temporary file is removed before the exception propagates.
        """
        # Honour the caller's session (headers, cookies, proxies) when given.
        http = self.session if self.session is not None else requests
        temp_filename = None
        try:
            # Using the response as a context manager releases the streamed
            # connection even if the download is interrupted.
            with http.get(
                self.link, timeout=self.DOWNLOAD_TIMEOUT, stream=True
            ) as response:
                response.raise_for_status()
                # delete=False: the loader reopens the file by name after
                # this handle has been closed.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_filename = temp_file.name
                    for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                        if chunk:  # skip empty keep-alive chunks
                            temp_file.write(chunk)
            return PyMuPDFLoader(temp_filename).load()
        finally:
            if temp_filename is not None:
                try:
                    os.remove(temp_filename)
                except OSError:
                    # Best-effort cleanup: never mask the original error.
                    pass

    def scrape(self) -> str:
        """
        Load the document behind `link` (URL or local file) with PyMuPDFLoader
        and return it as a string.

        Errors are reported with ``print`` instead of being raised.

        Returns:
          str: ``str()`` of the loaded document list, or ``None`` when the
            download or the load failed.
        """
        try:
            if self.is_url():
                doc = self._load_remote()
            else:
                doc = PyMuPDFLoader(self.link).load()
            return str(doc)
        except requests.exceptions.Timeout:
            print(f"Download timed out. Please check the link : {self.link}")
        except Exception as e:
            print(f"Error loading PDF : {self.link} {e}")
        # Explicit failure value, kept for backward compatibility with
        # callers that check for a falsy result.
        return None