gewei20 committed
Commit bd255d8 · verified · 1 Parent(s): c235b03

Add 3 files

Files changed (3):
  1. README.md +6 -4
  2. index.html +415 -19
  3. prompts.txt +1 -0
README.md CHANGED
@@ -1,10 +1,12 @@
 ---
-title: Crawler Ui
-emoji: 🦀
-colorFrom: indigo
+title: crawler-ui
+emoji: 🐳
+colorFrom: pink
 colorTo: yellow
 sdk: static
 pinned: false
+tags:
+  - deepsite
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
index.html CHANGED
@@ -1,19 +1,415 @@
-<!doctype html>
-<html>
-  <head>
-    <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width" />
-    <title>My static Space</title>
-    <link rel="stylesheet" href="style.css" />
-  </head>
-  <body>
-    <div class="card">
-      <h1>Welcome to your static Space!</h1>
-      <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-      <p>
-        Also don't forget to check the
-        <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-      </p>
-    </div>
-  </body>
-</html>
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Python Code Crawler</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
+    <style>
+        .gradient-bg {
+            background: linear-gradient(135deg, #6e8efb, #a777e3);
+        }
+        .code-block {
+            font-family: 'Courier New', monospace;
+            background-color: #2d3748;
+            color: #f7fafc;
+            border-radius: 0.5rem;
+            transition: all 0.3s ease;
+        }
+        .code-block:hover {
+            transform: translateY(-2px);
+            box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
+        }
+        .fade-in {
+            animation: fadeIn 0.5s ease-in-out;
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .progress-bar {
+            transition: width 0.3s ease;
+        }
+    </style>
+</head>
+<body class="min-h-screen bg-gray-100">
+    <div class="gradient-bg text-white py-12 px-4 sm:px-6 lg:px-8">
+        <div class="max-w-4xl mx-auto text-center">
+            <div class="flex justify-center mb-6">
+                <div class="bg-white bg-opacity-20 p-4 rounded-full">
+                    <i class="fas fa-spider text-4xl"></i>
+                </div>
+            </div>
+            <h1 class="text-4xl font-bold mb-4">Python Code Crawler</h1>
+            <p class="text-xl opacity-90 mb-8">Discover and extract Python code from websites automatically</p>
+
+            <div class="bg-white bg-opacity-20 backdrop-blur-sm rounded-xl p-6 shadow-lg">
+                <div class="flex flex-col sm:flex-row gap-4">
+                    <input type="text" id="targetUrl" placeholder="Enter website URL (e.g., https://example.com)"
+                        class="flex-grow px-4 py-3 rounded-lg bg-white bg-opacity-90 text-gray-800 focus:outline-none focus:ring-2 focus:ring-purple-300">
+                    <button id="startCrawl" class="px-6 py-3 bg-white text-purple-600 font-semibold rounded-lg hover:bg-opacity-90 transition flex items-center justify-center gap-2">
+                        <i class="fas fa-play"></i> Start Crawling
+                    </button>
+                </div>
+
+                <div class="mt-4 text-left">
+                    <label class="inline-flex items-center">
+                        <input type="checkbox" id="usePattern" class="form-checkbox h-5 w-5 text-purple-600">
+                        <span class="ml-2">Use URL pattern (e.g., https://example.com/docs/*)</span>
+                    </label>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-12">
+        <div id="resultsContainer" class="space-y-8">
+            <!-- Results will be added here dynamically -->
+        </div>
+
+        <div id="statusPanel" class="hidden bg-white rounded-xl shadow-md p-6 mt-8">
+            <div class="flex justify-between items-center mb-4">
+                <h3 class="text-xl font-semibold text-gray-800">Crawling Progress</h3>
+                <button id="stopCrawl" class="px-4 py-2 bg-red-500 text-white rounded-lg hover:bg-red-600 transition flex items-center gap-2">
+                    <i class="fas fa-stop"></i> Stop
+                </button>
+            </div>
+
+            <div class="mb-4">
+                <div class="flex justify-between text-sm text-gray-600 mb-1">
+                    <span>Visited: <span id="visitedCount">0</span> pages</span>
+                    <span>Found: <span id="codeBlocksCount">0</span> code blocks</span>
+                </div>
+                <div class="w-full bg-gray-200 rounded-full h-2.5">
+                    <div id="progressBar" class="progress-bar bg-purple-600 h-2.5 rounded-full" style="width: 0%"></div>
+                </div>
+            </div>
+
+            <div class="bg-gray-50 rounded-lg p-4 max-h-60 overflow-y-auto">
+                <h4 class="font-medium text-gray-700 mb-2">Activity Log:</h4>
+                <div id="activityLog" class="space-y-2 text-sm">
+                    <!-- Log messages will appear here -->
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <div id="loadingOverlay" class="fixed inset-0 bg-black bg-opacity-50 flex items-center justify-center z-50 hidden">
+        <div class="bg-white rounded-xl p-8 max-w-md w-full mx-4 text-center">
+            <div class="animate-spin rounded-full h-16 w-16 border-t-2 border-b-2 border-purple-500 mx-auto mb-4"></div>
+            <h3 class="text-xl font-semibold mb-2">Processing Website</h3>
+            <p class="text-gray-600 mb-4" id="loadingMessage">Initializing crawler...</p>
+            <div class="w-full bg-gray-200 rounded-full h-2">
+                <div id="loadingProgress" class="bg-purple-600 h-2 rounded-full" style="width: 0%"></div>
+            </div>
+        </div>
+    </div>
+
+    <script>
+        document.addEventListener('DOMContentLoaded', function() {
+            const startCrawlBtn = document.getElementById('startCrawl');
+            const stopCrawlBtn = document.getElementById('stopCrawl');
+            const targetUrlInput = document.getElementById('targetUrl');
+            const usePatternCheckbox = document.getElementById('usePattern');
+            const resultsContainer = document.getElementById('resultsContainer');
+            const statusPanel = document.getElementById('statusPanel');
+            const loadingOverlay = document.getElementById('loadingOverlay');
+            const loadingMessage = document.getElementById('loadingMessage');
+            const loadingProgress = document.getElementById('loadingProgress');
+            const visitedCount = document.getElementById('visitedCount');
+            const codeBlocksCount = document.getElementById('codeBlocksCount');
+            const progressBar = document.getElementById('progressBar');
+            const activityLog = document.getElementById('activityLog');
+
+            let isCrawling = false;
+            let visitedPages = 0;
+            let foundCodeBlocks = 0;
+
+            // Simulate crawling (in a real app, this would connect to a backend)
+            startCrawlBtn.addEventListener('click', async function() {
+                const url = targetUrlInput.value.trim();
+                if (!url) {
+                    showAlert('Please enter a valid URL', 'error');
+                    return;
+                }
+
+                isCrawling = true;
+                visitedPages = 0;
+                foundCodeBlocks = 0;
+                resultsContainer.innerHTML = '';
+                activityLog.innerHTML = '';
+                updateStatus();
+
+                // Show loading overlay
+                loadingOverlay.classList.remove('hidden');
+                statusPanel.classList.remove('hidden');
+
+                // Simulate initialization
+                loadingMessage.textContent = 'Initializing crawler...';
+                updateLoadingProgress(10);
+                await delay(800);
+
+                // Validate URL
+                loadingMessage.textContent = 'Validating URL...';
+                updateLoadingProgress(20);
+                await delay(600);
+
+                if (!isValidUrl(url)) {
+                    showAlert('Please enter a valid URL starting with http:// or https://', 'error');
+                    loadingOverlay.classList.add('hidden');
+                    isCrawling = false;
+                    return;
+                }
+
+                // Start crawling simulation
+                loadingMessage.textContent = 'Starting crawl process...';
+                updateLoadingProgress(30);
+                await delay(500);
+
+                // Hide loading overlay after some time
+                setTimeout(() => {
+                    loadingOverlay.classList.add('hidden');
+                }, 2000);
+
+                // Simulate crawling pages
+                simulateCrawling(url);
+            });
+
+            stopCrawlBtn.addEventListener('click', function() {
+                if (isCrawling) {
+                    isCrawling = false;
+                    addLogMessage('🚨 Crawling stopped by user. Saving collected data...');
+                    setTimeout(() => {
+                        showAlert('Crawling stopped. Collected data has been saved.', 'info');
+                    }, 1000);
+                }
+            });
+
+            function simulateCrawling(baseUrl) {
+                // This is just a simulation - in a real app, this would be API calls to a backend
+                const pagesToSimulate = 8;
+                const codeBlocksPerPage = [0, 1, 2, 0, 3, 1, 0, 2]; // Varying number of code blocks
+
+                let currentPage = 0;
+
+                const crawlInterval = setInterval(() => {
+                    if (!isCrawling || currentPage >= pagesToSimulate) {
+                        clearInterval(crawlInterval);
+                        if (currentPage >= pagesToSimulate) {
+                            addLogMessage('✅ Crawling completed successfully!');
+                            showAlert('Crawling completed! Results are displayed below.', 'success');
+                        }
+                        return;
+                    }
+
+                    currentPage++;
+                    visitedPages++;
+
+                    // Simulate finding code blocks
+                    const codeBlocksFound = codeBlocksPerPage[currentPage % codeBlocksPerPage.length];
+                    foundCodeBlocks += codeBlocksFound;
+
+                    // Update UI
+                    updateStatus();
+                    addLogMessage(`🌐 Crawling page ${currentPage}/${pagesToSimulate}: ${baseUrl}/page${currentPage}`);
+
+                    if (codeBlocksFound > 0) {
+                        addLogMessage(`✅ Found ${codeBlocksFound} Python code blocks`);
+
+                        // Add simulated results
+                        for (let i = 0; i < codeBlocksFound; i++) {
+                            const result = {
+                                url: `${baseUrl}/page${currentPage}#code-${i}`,
+                                code: generateRandomPythonCode(),
+                                context: `Example context for code block ${i+1}`
+                            };
+                            addResultToUI(result);
+                        }
+                    }
+
+                    // Update progress
+                    const progress = Math.min(100, (currentPage / pagesToSimulate) * 100);
+                    progressBar.style.width = `${progress}%`;
+
+                }, 1500);
+            }
+
+            function addResultToUI(result) {
+                const resultElement = document.createElement('div');
+                resultElement.className = 'fade-in bg-white rounded-xl shadow-md overflow-hidden';
+                resultElement.innerHTML = `
+                    <div class="p-6">
+                        <div class="flex justify-between items-start mb-4">
+                            <div>
+                                <h3 class="text-lg font-semibold text-gray-800 mb-1">${result.url}</h3>
+                                <p class="text-sm text-gray-500">${result.context}</p>
+                            </div>
+                            <button class="copy-code-btn px-3 py-1 bg-gray-100 text-gray-700 rounded-lg text-sm hover:bg-gray-200 transition flex items-center gap-1">
+                                <i class="far fa-copy"></i> Copy
+                            </button>
+                        </div>
+                        <div class="code-block p-4 overflow-x-auto">
+                            <pre class="text-sm">${escapeHtml(result.code)}</pre>
+                        </div>
+                    </div>
+                `;
+
+                resultsContainer.appendChild(resultElement);
+
+                // Add copy functionality
+                const copyBtn = resultElement.querySelector('.copy-code-btn');
+                copyBtn.addEventListener('click', function() {
+                    navigator.clipboard.writeText(result.code).then(() => {
+                        const originalText = copyBtn.innerHTML;
+                        copyBtn.innerHTML = '<i class="fas fa-check"></i> Copied!';
+                        setTimeout(() => {
+                            copyBtn.innerHTML = originalText;
+                        }, 2000);
+                    });
+                });
+            }
+
+            function addLogMessage(message) {
+                const now = new Date();
+                const timeString = now.toLocaleTimeString();
+                const logEntry = document.createElement('div');
+                logEntry.className = 'flex items-start gap-2';
+                logEntry.innerHTML = `
+                    <span class="text-gray-500 text-xs mt-0.5">${timeString}</span>
+                    <span class="flex-1">${message}</span>
+                `;
+                activityLog.appendChild(logEntry);
+                activityLog.scrollTop = activityLog.scrollHeight;
+            }
+
+            function updateStatus() {
+                visitedCount.textContent = visitedPages;
+                codeBlocksCount.textContent = foundCodeBlocks;
+            }
+
+            function updateLoadingProgress(percent) {
+                loadingProgress.style.width = `${percent}%`;
+            }
+
+            function showAlert(message, type) {
+                const alertTypes = {
+                    error: 'bg-red-100 border-red-400 text-red-700',
+                    success: 'bg-green-100 border-green-400 text-green-700',
+                    info: 'bg-blue-100 border-blue-400 text-blue-700'
+                };
+
+                const alertDiv = document.createElement('div');
+                alertDiv.className = `fixed top-4 right-4 border-l-4 p-4 rounded shadow-lg ${alertTypes[type] || alertTypes.info} max-w-md z-50 fade-in`;
+                alertDiv.innerHTML = `
+                    <div class="flex items-center">
+                        <div class="flex-shrink-0">
+                            ${type === 'error' ? '<i class="fas fa-exclamation-circle"></i>' : ''}
+                            ${type === 'success' ? '<i class="fas fa-check-circle"></i>' : ''}
+                            ${type === 'info' ? '<i class="fas fa-info-circle"></i>' : ''}
+                        </div>
+                        <div class="ml-3">
+                            <p class="text-sm">${message}</p>
+                        </div>
+                        <button class="ml-auto -mx-1.5 -my-1.5 rounded-lg p-1.5 inline-flex h-8 w-8 focus:outline-none ${type === 'error' ? 'bg-red-100 text-red-500 hover:bg-red-200' : ''} ${type === 'success' ? 'bg-green-100 text-green-500 hover:bg-green-200' : ''} ${type === 'info' ? 'bg-blue-100 text-blue-500 hover:bg-blue-200' : ''}">
+                            <span class="sr-only">Close</span>
+                            <i class="fas fa-times"></i>
+                        </button>
+                    </div>
+                `;
+
+                document.body.appendChild(alertDiv);
+
+                // Auto-remove after 5 seconds
+                setTimeout(() => {
+                    alertDiv.classList.remove('fade-in');
+                    alertDiv.classList.add('opacity-0', 'transition-opacity', 'duration-300');
+                    setTimeout(() => {
+                        alertDiv.remove();
+                    }, 300);
+                }, 5000);
+
+                // Manual close
+                alertDiv.querySelector('button').addEventListener('click', function() {
+                    alertDiv.remove();
+                });
+            }
+
+            // Helper functions
+            function isValidUrl(url) {
+                try {
+                    new URL(url);
+                    return true;
+                } catch (e) {
+                    return false;
+                }
+            }
+
+            function delay(ms) {
+                return new Promise(resolve => setTimeout(resolve, ms));
+            }
+
+            function escapeHtml(unsafe) {
+                return unsafe
+                    .replace(/&/g, "&amp;")
+                    .replace(/</g, "&lt;")
+                    .replace(/>/g, "&gt;")
+                    .replace(/"/g, "&quot;")
+                    .replace(/'/g, "&#039;");
+            }
+
+            function generateRandomPythonCode() {
+                const codeSnippets = [
+                    `def calculate_factorial(n):
+    if n == 0:
+        return 1
+    else:
+        return n * calculate_factorial(n-1)
+
+print(calculate_factorial(5))`,
+                    `class Animal:
+    def __init__(self, name, species):
+        self.name = name
+        self.species = species
+
+    def speak(self):
+        return "Some generic sound"
+
+class Dog(Animal):
+    def speak(self):
+        return "Woof!"`,
+                    `import requests
+
+def fetch_data(url):
+    try:
+        response = requests.get(url, timeout=5)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching data: {e}")
+        return None`,
+                    `async def process_data(data):
+    results = []
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_item(session, item) for item in data]
+        results = await asyncio.gather(*tasks)
+    return [r for r in results if r is not None]`,
+                    `@app.route('/api/users', methods=['GET'])
+def get_users():
+    users = User.query.all()
+    return jsonify([user.to_dict() for user in users])`,
+                    `def fibonacci(n):
+    a, b = 0, 1
+    for _ in range(n):
+        yield a
+        a, b = b, a + b
+
+print(list(fibonacci(10)))`
+                ];
+
+                return codeSnippets[Math.floor(Math.random() * codeSnippets.length)];
+            }
+        });
+    </script>
+    <p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=gewei20/crawler-ui" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>
+</html>
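
The page above only simulates a crawl client-side; its own comments note that a real app would call a backend. Below is a minimal backend sketch, assuming FastAPI; it is not part of this commit. The /api/crawl route, the CrawlRequest model, and the `crawler` module name (holding the prompts.txt script that follows) are all illustrative assumptions.

import re

from fastapi import FastAPI
from pydantic import BaseModel

from crawler import crawl_website  # hypothetical module for the script below

app = FastAPI()

class CrawlRequest(BaseModel):
    url: str  # e.g. "https://example.com" or "https://example.com/docs/*"

@app.post("/api/crawl")
async def start_crawl(req: CrawlRequest):
    # Mirror the wildcard handling in the script's __main__ block: the part
    # before "*" seeds the crawl, and the full string (with "*" treated as a
    # regex wildcard) restricts which links are followed.
    if "*" in req.url:
        base_url = req.url.split("*")[0]
        patterns = [re.escape(req.url).replace(r"\*", ".*")]
        await crawl_website(base_url, patterns)
    else:
        await crawl_website(req.url)
    return {"status": "done"}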
prompts.txt ADDED
@@ -0,0 +1 @@
+import os
+import json
+import asyncio
+import random
+import re
+import signal
+from urllib.parse import urlparse, urljoin
+
+from bs4 import BeautifulSoup
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+import nest_asyncio
+
+nest_asyncio.apply()
+
+# Constants
+OUTPUT_FILE_BASE = "python_code_results.json"
+MAX_CONCURRENT = 5
+REQUEST_DELAY = (0.5, 1.5)
+
+# CrawlerState class
+class CrawlerState:
+    def __init__(self):
+        self.visited = set()
+        self.queue = asyncio.Queue()
+        self.sem = asyncio.Semaphore(MAX_CONCURRENT)
+        self.results = []
+        self.active_tasks = 0
+        self.lock = asyncio.Lock()
+        self.interrupted = False  # Add an interruption flag
+
+    def handle_interrupt(self):
+        self.interrupted = True  # Set the flag
+        print("\n🚨 Crawl interrupted; saving the content collected so far...")
+
+# Build a unique output filename
+def get_unique_filename():
+    counter = 1
+    while True:
+        filename = f"{os.path.splitext(OUTPUT_FILE_BASE)[0]}_{counter}.json"
+        if not os.path.exists(filename):
+            return filename
+        counter += 1
+
+# Check whether a link points to the same site
+def is_internal_link(base_url, href):
+    base_parts = urlparse(base_url)
+    href_parts = urlparse(href)
+    if not href_parts.netloc:
+        return True
+    return href_parts.netloc == base_parts.netloc
+
+# Fetch a page. Concurrency is limited by the caller, which already holds
+# state.sem; re-acquiring the semaphore here would deadlock once all
+# permits are taken.
+async def fetch_page(url, crawler):
+    try:
+        await asyncio.sleep(random.uniform(*REQUEST_DELAY))
+        run_cfg = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            page_timeout=30000  # 30-second timeout (in milliseconds)
+        )
+        result = await crawler.arun(url, config=run_cfg)
+        return result.cleaned_html if result.success else None
+    except (asyncio.TimeoutError, ConnectionError) as e:
+        print(f"Request failed for {url}: {str(e)}")
+        return None
+
+# Extract in-site links
+def extract_links(html, base_url, allowed_patterns=None):
+    soup = BeautifulSoup(html, 'html.parser')
+    links = set()
+    for tag in soup.find_all(['a', 'link'], href=True):
+        href = tag['href']
+        full_url = urljoin(base_url, href)
+        if is_internal_link(base_url, full_url):
+            if allowed_patterns:
+                for pattern in allowed_patterns:
+                    if re.match(pattern, full_url):
+                        links.add(full_url)
+                        break  # Only add if it matches some pattern
+            else:
+                links.add(full_url)
+    return links
+
+# Heuristic check for Python code
+def detect_python_code(text):
+    keywords = {'def', 'class', 'import', 'from', 'try', 'except', 'with', 'async'}
+    return any(kw in line for line in text.split('\n') for kw in keywords)
+
+# Process one URL
+async def process_url(url, crawler, state, allowed_patterns=None):
+    async with state.sem:
+        async with state.lock:
+            if url in state.visited or state.interrupted:  # Check interrupt flag
+                return
+            state.visited.add(url)
+            state.active_tasks += 1
+        try:
+            print(f"Crawling: {url}")
+            html = await fetch_page(url, crawler)
+            if not html:
+                return
+            code_blocks = extract_code_blocks(html)
+            if code_blocks:
+                async with state.lock:
+                    state.results.append({
+                        'url': url,
+                        'code_blocks': code_blocks
+                    })
+                print(f"✅ Found {len(code_blocks)} code blocks")
+            new_links = extract_links(html, url, allowed_patterns)
+            for link in new_links:
+                async with state.lock:
+                    if link not in state.visited and not state.interrupted:  # Check interrupt flag
+                        await state.queue.put(link)
+        except Exception as e:
+            print(f"Error while processing {url}: {e}")
+        finally:
+            async with state.lock:
+                state.active_tasks -= 1
+
+# Extract code blocks from <pre> and <code> tags
+def extract_code_blocks(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    blocks = []
+    for pre in soup.find_all('pre'):
+        code = pre.get_text().strip()
+        if detect_python_code(code):
+            blocks.append({
+                'code': code,
+                'context': get_code_context(pre)
+            })
+    for code_tag in soup.find_all('code'):
+        code = code_tag.get_text().strip()
+        if code and detect_python_code(code):
+            blocks.append({
+                'code': code,
+                'context': get_code_context(code_tag)
+            })
+    return blocks
+
+# Capture surrounding context: nearest heading and enclosing container id
+def get_code_context(element):
+    context = {}
+    for h in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+        header = element.find_previous(h)
+        if header:
+            context['header'] = header.get_text().strip()
+            break
+    parent = element.find_parent(['div', 'section', 'article'])
+    if parent and 'id' in parent.attrs:
+        context['container_id'] = parent['id']
+    return context
+
+# Worker task: drain the queue until it stays empty and no task is active
+async def worker(crawler, state, allowed_patterns=None):
+    while True:
+        try:
+            url = await asyncio.wait_for(state.queue.get(), timeout=5)
+            await process_url(url, crawler, state, allowed_patterns)
+            state.queue.task_done()
+        except (asyncio.TimeoutError, asyncio.CancelledError):
+            if state.active_tasks == 0 and state.queue.empty():
+                break
+
+# Crawl a website
+async def crawl_website(base_url, allowed_patterns=None):
+    state = CrawlerState()
+    # Register a Ctrl+C signal handler
+    signal.signal(signal.SIGINT, lambda signum, frame: state.handle_interrupt())
+    await state.queue.put(base_url)
+    # timeout parameter removed here
+    browser_cfg = BrowserConfig(headless=True)
+    try:
+        async with AsyncWebCrawler(config=browser_cfg) as crawler:
+            workers = [asyncio.create_task(worker(crawler, state, allowed_patterns))
+                       for _ in range(MAX_CONCURRENT)]
+            await state.queue.join()
+            for task in workers:
+                task.cancel()
+            await asyncio.gather(*workers, return_exceptions=True)
+    except asyncio.CancelledError:
+        print("Crawl task was cancelled")
+    except Exception as e:
+        print(f"Error during crawl: {e}")
+    finally:
+        output_file = get_unique_filename()
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(state.results, f, indent=2, ensure_ascii=False)
+        print(f"\n✅ Crawl complete! Pages visited: {len(state.visited)}")
+        print(f"📁 Results saved to: {os.path.abspath(output_file)}")
+
+# Entry point
+if __name__ == "__main__":
+    target_url = input("Enter the target website URL: ")
+    # Handle a trailing wildcard, e.g. https://example.com/docs/*
+    if "*" in target_url:
+        base_url = target_url.split("*")[0]
+        # Escape first, then turn the escaped "*" back into a regex wildcard
+        allowed_patterns = [re.escape(target_url).replace(r"\*", ".*")]
+        asyncio.run(crawl_website(base_url, allowed_patterns))
+    else:
+        asyncio.run(crawl_website(target_url))
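
For reference, each python_code_results_*.json file the script writes is a dump of state.results: a list of {"url": ..., "code_blocks": [{"code": ..., "context": {...}}, ...]} records. A short sketch of loading and summarizing one such file; the filename is just an example of what get_unique_filename() produces.

import json

# Load one results file produced by the crawler above
with open("python_code_results_1.json", encoding="utf-8") as f:
    results = json.load(f)

# Print a per-page summary of the extracted code blocks
for page in results:
    print(f"{page['url']} -> {len(page['code_blocks'])} code blocks")
    for block in page["code_blocks"]:
        print("  section:", block["context"].get("header", "(no header)"))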