Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

5790c53

verified ·

1 Parent(s): 3a20b56

Update patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +44 -5

patent_downloader.py CHANGED Viewed

@@ -11,8 +11,12 @@ from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
 import chromedriver_autoinstaller
 class PatentDownloader:
     url = "https://patents.google.com"
@@ -54,7 +58,7 @@ class PatentDownloader:
         return chrome_path
     def download(self, patent: Union[str, List[str]], output_path: str = "./",
-                 waiting_time: int = 6, remove_kind_codes: Optional[List[str]] = None) -> None:
         """
         Download patent document(s) as PDF.
         """
@@ -63,7 +67,7 @@ class PatentDownloader:
         else:
             self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
-    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 6,
                 remove_kind_codes: Optional[List[str]] = None) -> None:
         """
         Download a single patent PDF.
@@ -85,19 +89,35 @@ class PatentDownloader:
         # Initialize Selenium WebDriver
         service = Service()
         driver = webdriver.Chrome(service=service, options=chrome_options)
-        driver.get(self.url)
         try:
-            # Search for the patent
-            element = driver.find_element("css selector", "input[type='search']")
             element.send_keys(patent)
             element.send_keys(Keys.RETURN)
             time.sleep(waiting_time)
             # Parse HTML and get the PDF link
             soup = BeautifulSoup(driver.page_source, "html.parser")
             pdf_link = self.get_pdf_link(soup, patent)
         finally:
             driver.quit()
@@ -111,6 +131,25 @@ class PatentDownloader:
         else:
             print(f"Error: PDF link for patent {patent} not found!")
     @staticmethod
     def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
         """

 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 import chromedriver_autoinstaller
 class PatentDownloader:
     url = "https://patents.google.com"
         return chrome_path
     def download(self, patent: Union[str, List[str]], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
         """
         Download patent document(s) as PDF.
         """
         else:
             self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
+    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 10,
                 remove_kind_codes: Optional[List[str]] = None) -> None:
         """
         Download a single patent PDF.
         # Initialize Selenium WebDriver
         service = Service()
         driver = webdriver.Chrome(service=service, options=chrome_options)
+        pdf_link = None  # Ensure pdf_link is defined
         try:
+            driver.get(self.url)
+            # Wait for the search input field and interact with it
+            print("Waiting for the search input field...")
+            search_input_xpath = "//input[@aria-label='Search patents']"
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.XPATH, search_input_xpath))
+            )
+            element = driver.find_element(By.XPATH, search_input_xpath)
+            print("Search input field located.")
             element.send_keys(patent)
             element.send_keys(Keys.RETURN)
+            # Wait for search results to load
+            print("Waiting for search results to load...")
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
             time.sleep(waiting_time)
             # Parse HTML and get the PDF link
             soup = BeautifulSoup(driver.page_source, "html.parser")
             pdf_link = self.get_pdf_link(soup, patent)
+        except Exception as e:
+            print(f"Error occurred: {e}")
         finally:
             driver.quit()
         else:
             print(f"Error: PDF link for patent {patent} not found!")
+    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
+        """
+        Download multiple patent PDFs from a list or file.
+        """
+        if isinstance(patents, str):
+            if patents.lower().endswith('csv'):
+                df_patents = pd.read_csv(patents)
+                patents = df_patents['patent_number'].to_list()
+            elif patents.lower().endswith('txt'):
+                with open(patents, 'r') as txt_file:
+                    patents = txt_file.read().splitlines()
+            else:
+                raise NotImplementedError(f'Unsupported file type: {patents}')
+        for i, patent in enumerate(patents):
+            print(len(patents) - i, "patent(s) remaining.")
+            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
     @staticmethod
     def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
         """