rithvik213 committed
Commit f286746 · Parent(s): 6804125

added image scraper

Files changed (1): image_scraper.py (+221, -0)
image_scraper.py ADDED
import requests
from bs4 import BeautifulSoup
import os
import logging
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse, quote_plus


class DigitalCommonwealthScraper:
    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
        """
        Initialize the scraper with base URL and logging.

        :param base_url: Base URL for Digital Commonwealth
        """
        self.base_url = base_url
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Headers to mimic a browser request (some servers reject the
        # default python-requests User-Agent)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self, url: str) -> Optional[requests.Response]:
        """
        Fetch webpage content with error handling.

        :param url: URL to fetch
        :return: Response object, or None if the request failed
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata for a record by appending .json to its URL.

        :param url: URL of the page
        :return: Dictionary of metadata (empty on failure)
        """
        json_url = f"{url}.json"
        response = self.fetch_page(json_url)

        if response:
            try:
                return response.json()
            except ValueError:
                # ValueError covers json.JSONDecodeError, including the
                # subclass raised by newer versions of requests
                self.logger.error(f"Could not parse JSON from {json_url}")
                return {}
        return {}
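
    # NOTE: the exact shape of the .json payload is an assumption inferred
    # from the keys read in extract_images() below; it appears to follow a
    # JSON:API-style layout, roughly:
    #   {"data": {"attributes": {"title_info_primary_tsi": "...",
    #                            "abstract_tsi": "...",
    #                            "subject_geographic_sim": [...]}}}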

    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images from the page.

        :param url: URL of the page to scrape
        :return: List of image dictionaries
        """
        # Fetch page content
        response = self.fetch_page(url)
        if not response:
            return []

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract JSON metadata
        metadata = self.extract_json_metadata(url)

        # List to store images
        images = []

        # Strategy 1: prefer images inside known viewer containers
        containers = [
            soup.find('div', class_='viewer-container'),
            soup.find('div', class_='image-viewer'),
            soup.find('div', id='image-container'),
        ]
        containers = [c for c in containers if c]

        # Strategy 2: fall back to every <img> tag on the page (this will
        # also pick up site chrome such as logos and icons)
        if containers:
            img_tags = [img for container in containers
                        for img in container.find_all('img')]
        else:
            img_tags = soup.find_all('img')

        for img in img_tags:
            # Get image source
            src = img.get('src')
            if not src:
                continue

            # Resolve relative URLs against the page URL
            full_src = urljoin(url, src)

            # Use alt text when present, otherwise fall back to the filename
            alt = img.get('alt', os.path.basename(urlparse(full_src).path))

            # Create image dictionary
            image_info = {
                'url': full_src,
                'alt': alt,
                'source_page': url
            }

            # Attach record metadata when the JSON endpoint returned any
            if metadata:
                try:
                    attributes = metadata.get('data', {}).get('attributes', {})
                    image_info['metadata'] = {
                        'title': attributes.get('title_info_primary_tsi'),
                        'description': attributes.get('abstract_tsi'),
                        'subject': attributes.get('subject_geographic_sim')
                    }
                except Exception as e:
                    self.logger.warning(f"Error extracting metadata: {e}")

            images.append(image_info)

        return images

    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to a local directory.

        :param images: List of image dictionaries
        :param output_dir: Directory to save images
        :return: List of downloaded file paths
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []

        for i, image in enumerate(images):
            try:
                response = requests.get(image['url'], headers=self.headers, timeout=60)
                response.raise_for_status()

                # Derive the extension from the URL, defaulting to .jpg
                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')

                with open(filename, 'wb') as f:
                    f.write(response.content)

                downloaded_files.append(filename)
                self.logger.info(f"Downloaded: {filename}")

            except Exception as e:
                self.logger.error(f"Error downloading {image['url']}: {e}")

        return downloaded_files

    def search_query(self, query: str, limit: int = 20) -> List[str]:
        """
        Search Digital Commonwealth with a query.

        :param query: Search query
        :param limit: Maximum number of item IDs to return
        :return: List of item IDs
        """
        # Construct the search URL; quote_plus escapes spaces and other
        # special characters, unlike a bare str.replace
        url = f"{self.base_url}/search?q={quote_plus(query)}&format=json"

        # Fetch search results
        response = self.fetch_page(url)
        if not response:
            return []

        try:
            search_data = response.json()
            item_ids = []

            # Results are expected under the top-level "data" key
            for item in search_data.get("data", []):
                if len(item_ids) >= limit:
                    break
                item_id = item.get("id")
                if item_id:
                    item_ids.append(item_id)

            return item_ids

        except Exception as e:
            self.logger.error(f"Error processing search data: {e}")
            return []
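
    # For reference, the URL built above for the query "boston historic" is:
    #   https://www.digitalcommonwealth.org/search?q=boston+historic&format=json
    # Digital Commonwealth appears to run Blacklight, so format=json is
    # assumed to return the search results as JSON; if the server answers
    # with HTML instead, response.json() raises and search_query returns [].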

# Example usage
if __name__ == "__main__":
    scraper = DigitalCommonwealthScraper()

    # Search for items
    query = "boston historic"
    item_ids = scraper.search_query(query, limit=5)

    if item_ids:
        print(f"Found {len(item_ids)} items for query: {query}")

        # Process the first item
        item_url = f"{scraper.base_url}/search/{item_ids[0]}"
        print(f"Processing item: {item_url}")

        # Extract images
        images = scraper.extract_images(item_url)
        print(f"Found {len(images)} images")

        # Download the first image if available
        if images:
            downloaded = scraper.download_images([images[0]], "sample_images")
            print(f"Downloaded: {downloaded}")
    else:
        print(f"No items found for query: {query}")
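
A minimal batch-download sketch built on the class above; the query, limit, and per-item folder layout are illustrative, and it assumes the file ships as image_scraper.py (as added in this commit) so the class is importable:

from image_scraper import DigitalCommonwealthScraper

scraper = DigitalCommonwealthScraper()

# Mirror the images of a few search hits into one folder per item.
for item_id in scraper.search_query("boston harbor", limit=3):
    item_url = f"{scraper.base_url}/search/{item_id}"
    images = scraper.extract_images(item_url)
    scraper.download_images(images, output_dir=f"images/{item_id}")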