DrishtiSharma committed on
Commit
5790c53
·
verified ·
1 Parent(s): 3a20b56

Update patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +44 -5
patent_downloader.py CHANGED
@@ -11,8 +11,12 @@ from selenium.webdriver.common.keys import Keys
11
  from selenium.webdriver.chrome.service import Service
12
  from selenium.webdriver.chrome.options import Options
13
  from bs4 import BeautifulSoup
 
 
 
14
  import chromedriver_autoinstaller
15
 
 
16
  class PatentDownloader:
17
  url = "https://patents.google.com"
18
 
@@ -54,7 +58,7 @@ class PatentDownloader:
54
  return chrome_path
55
 
56
  def download(self, patent: Union[str, List[str]], output_path: str = "./",
57
- waiting_time: int = 6, remove_kind_codes: Optional[List[str]] = None) -> None:
58
  """
59
  Download patent document(s) as PDF.
60
  """
@@ -63,7 +67,7 @@ class PatentDownloader:
63
  else:
64
  self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
65
 
66
- def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 6,
67
  remove_kind_codes: Optional[List[str]] = None) -> None:
68
  """
69
  Download a single patent PDF.
@@ -85,19 +89,35 @@ class PatentDownloader:
85
  # Initialize Selenium WebDriver
86
  service = Service()
87
  driver = webdriver.Chrome(service=service, options=chrome_options)
88
- driver.get(self.url)
89
 
90
  try:
91
- # Search for the patent
92
- element = driver.find_element("css selector", "input[type='search']")
 
 
 
 
 
 
 
 
93
 
94
  element.send_keys(patent)
95
  element.send_keys(Keys.RETURN)
 
 
 
 
 
 
96
  time.sleep(waiting_time)
97
 
98
  # Parse HTML and get the PDF link
99
  soup = BeautifulSoup(driver.page_source, "html.parser")
100
  pdf_link = self.get_pdf_link(soup, patent)
 
 
101
  finally:
102
  driver.quit()
103
 
@@ -111,6 +131,25 @@ class PatentDownloader:
111
  else:
112
  print(f"Error: PDF link for patent {patent} not found!")
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  @staticmethod
115
  def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
116
  """
 
11
  from selenium.webdriver.chrome.service import Service
12
  from selenium.webdriver.chrome.options import Options
13
  from bs4 import BeautifulSoup
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.webdriver.support.ui import WebDriverWait
16
+ from selenium.webdriver.support import expected_conditions as EC
17
  import chromedriver_autoinstaller
18
 
19
+
20
  class PatentDownloader:
21
  url = "https://patents.google.com"
22
 
 
58
  return chrome_path
59
 
60
  def download(self, patent: Union[str, List[str]], output_path: str = "./",
61
+ waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
62
  """
63
  Download patent document(s) as PDF.
64
  """
 
67
  else:
68
  self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
69
 
70
+ def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 10,
71
  remove_kind_codes: Optional[List[str]] = None) -> None:
72
  """
73
  Download a single patent PDF.
 
89
  # Initialize Selenium WebDriver
90
  service = Service()
91
  driver = webdriver.Chrome(service=service, options=chrome_options)
92
+ pdf_link = None # Ensure pdf_link is defined
93
 
94
  try:
95
+ driver.get(self.url)
96
+
97
+ # Wait for the search input field and interact with it
98
+ print("Waiting for the search input field...")
99
+ search_input_xpath = "//input[@aria-label='Search patents']"
100
+ WebDriverWait(driver, 20).until(
101
+ EC.presence_of_element_located((By.XPATH, search_input_xpath))
102
+ )
103
+ element = driver.find_element(By.XPATH, search_input_xpath)
104
+ print("Search input field located.")
105
 
106
  element.send_keys(patent)
107
  element.send_keys(Keys.RETURN)
108
+
109
+ # Wait for search results to load
110
+ print("Waiting for search results to load...")
111
+ WebDriverWait(driver, 20).until(
112
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
113
+ )
114
  time.sleep(waiting_time)
115
 
116
  # Parse HTML and get the PDF link
117
  soup = BeautifulSoup(driver.page_source, "html.parser")
118
  pdf_link = self.get_pdf_link(soup, patent)
119
+ except Exception as e:
120
+ print(f"Error occurred: {e}")
121
  finally:
122
  driver.quit()
123
 
 
131
  else:
132
  print(f"Error: PDF link for patent {patent} not found!")
133
 
134
+ def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
135
+ waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
136
+ """
137
+ Download multiple patent PDFs from a list or file.
138
+ """
139
+ if isinstance(patents, str):
140
+ if patents.lower().endswith('csv'):
141
+ df_patents = pd.read_csv(patents)
142
+ patents = df_patents['patent_number'].to_list()
143
+ elif patents.lower().endswith('txt'):
144
+ with open(patents, 'r') as txt_file:
145
+ patents = txt_file.read().splitlines()
146
+ else:
147
+ raise NotImplementedError(f'Unsupported file type: {patents}')
148
+
149
+ for i, patent in enumerate(patents):
150
+ print(len(patents) - i, "patent(s) remaining.")
151
+ self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
152
+
153
  @staticmethod
154
  def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
155
  """