DrishtiSharma committed on
Commit
a417f74
·
verified ·
1 Parent(s): 130d692

Create patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +131 -0
patent_downloader.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Optional
2
+ import os
3
+ import requests
4
+ import re
5
+ import time
6
+ import shutil
7
+ import subprocess
8
+ import pandas as pd
9
+ from selenium import webdriver
10
+ from selenium.webdriver.common.keys import Keys
11
+ from selenium.webdriver.chrome.service import Service
12
+ from selenium.webdriver.chrome.options import Options
13
+ from bs4 import BeautifulSoup
14
+ import chromedriver_autoinstaller
15
+
16
class PatentDownloader:
    """Download patent documents as PDFs from Google Patents.

    Drives a headless Chrome instance (via Selenium) to search for a patent
    number, scrapes the result page for a PDF link with BeautifulSoup, and
    saves the PDF to disk.
    """

    # Base URL of the Google Patents search page.
    url = "https://patents.google.com"

    def __init__(self, verbose: bool = False):
        """
        Parameters
        ----------
        verbose : bool
            Print additional debug information.
        """
        self.verbose = verbose
        # Make sure a Chrome binary is available before any download attempt.
        self.chrome_path = self.install_chrome()

    def install_chrome(self) -> str:
        """
        Download and install Google Chrome if it is not already present.

        Returns
        -------
        str
            Path to the Chrome binary.

        Raises
        ------
        ValueError
            If Chrome is still missing after the install attempt.
        """
        chrome_path = "/usr/bin/google-chrome"

        if not shutil.which("google-chrome"):
            print("Downloading and installing Google Chrome...")
            # NOTE(review): needs root and a Debian-based system (apt-get).
            # Argument lists instead of shell=True strings: no shell parsing
            # of the URL/paths; the original "a && b" is two checked runs.
            subprocess.run(
                [
                    "wget",
                    "https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
                    "-O",
                    "chrome.deb",
                ],
                check=True,
            )
            subprocess.run(["apt-get", "update"], check=True)
            subprocess.run(["apt-get", "install", "-y", "./chrome.deb"], check=True)
            os.remove("chrome.deb")

        if not shutil.which("google-chrome"):
            raise ValueError("Google Chrome installation failed!")
        return chrome_path

    def download(self, patent: Union[str, List[str]], output_path: str = "./",
                 waiting_time: int = 6, remove_kind_codes: Optional[List[str]] = None) -> None:
        """
        Download one or several patent documents as PDF.

        Parameters
        ----------
        patent : str or list of str
            A single patent number, a list of patent numbers, or the path to
            a text file containing one patent number per line.
        output_path : str
            Directory the PDFs are written to (created if missing).
        waiting_time : int
            Seconds to wait for the search-results page to render.
        remove_kind_codes : list of str, optional
            Kind codes (e.g. "B2") stripped from the end of each number
            before searching.
        """
        # BUG FIX: the original dispatched list/file inputs to
        # self.get_pdfs(), which was never defined and raised
        # AttributeError. Normalize every input shape to a list of patent
        # numbers and reuse get_pdf() for each one.
        if isinstance(patent, list):
            patents = patent
        elif os.path.isfile(patent):
            # Assumes one patent number per non-empty line — TODO confirm
            # the expected file format with callers.
            with open(patent, encoding="utf-8") as file:
                patents = [line.strip() for line in file if line.strip()]
        else:
            patents = [patent]

        for number in patents:
            self.get_pdf(number, output_path, waiting_time, remove_kind_codes)

    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 6,
                remove_kind_codes: Optional[List[str]] = None) -> None:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent : str
            Patent number to search for.
        output_path : str
            Directory the PDF is written to (created if missing).
        waiting_time : int
            Seconds to wait for the search-results page to render.
        remove_kind_codes : list of str, optional
            Kind codes stripped from the end of the patent number.
        """
        if remove_kind_codes:
            for kind_code in remove_kind_codes:
                # Anchor at the end so only a trailing kind code is removed.
                patent = re.sub(kind_code + "$", "", patent)

        # Automatically install a ChromeDriver matching the Chrome binary.
        chromedriver_autoinstaller.install()

        # Headless Chrome options suitable for containers/CI.
        chrome_options = Options()
        chrome_options.binary_location = self.chrome_path
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize Selenium WebDriver.
        service = Service()
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(self.url)

        try:
            # Search for the patent via the page's search box.
            element = driver.find_element("css selector", "input[type='search']")
            element.send_keys(patent)
            element.send_keys(Keys.RETURN)
            # Fixed sleep; the page is rendered client-side and needs time.
            time.sleep(waiting_time)

            # Parse the rendered HTML and locate the PDF link.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            pdf_link = self.get_pdf_link(soup, patent)
        finally:
            # Always release the browser, even if scraping fails.
            driver.quit()

        # Download the PDF.
        if pdf_link:
            validate_directory(output_path)
            # BUG FIX: requests.get() without a timeout can block forever;
            # 60 s is generous for a single PDF.
            pdf_content = requests.get(pdf_link, timeout=60).content
            with open(os.path.join(output_path, f"{patent}.pdf"), "wb") as file:
                file.write(pdf_content)
            print(f">>> Patent {patent} successfully downloaded <<<")
        else:
            print(f"Error: PDF link for patent {patent} not found!")

    @staticmethod
    def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
        """
        Extract the PDF link for *patent* from parsed HTML.

        Returns the first anchor href ending in "pdf" whose URL contains the
        patent number (case-insensitive), or None if no such link exists.
        """
        pdf_links = [link['href'] for link in soup.find_all('a', href=True)
                     if link['href'].lower().endswith("pdf")]
        for link in pdf_links:
            if patent.lower() in link.lower():
                return link
        return None
124
+
125
+
126
def validate_directory(directory: str) -> None:
    """
    Ensure the output directory exists, creating parents as needed.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the original
    check-then-create pair, which could raise ``FileExistsError`` if the
    directory appeared between the ``os.path.exists`` check and the call
    (TOCTOU race). Idempotent: calling it on an existing directory is a
    no-op.
    """
    os.makedirs(directory, exist_ok=True)