Update app.py
app.py CHANGED
@@ -1,76 +1,53 @@
-import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+import gradio as gr

 def scrape_naver_blog(url):
+    # Debugging: check the incoming URL
+    print(f"Scraping URL: {url}")
+
+    # Convert to the mobile form of the URL
+    if not url.startswith("https://m.blog.naver.com"):
+        url = url.replace("https://blog.naver.com", "https://m.blog.naver.com")
+        print(f"Converted to mobile URL: {url}")
+
+    # Send the request
     try:
-        headers = {
-            ...
-        }
-        response = requests.get(url, headers=headers)
-
-        # Step 2: check whether the HTTP request succeeded
-        print(f"[DEBUG] Step 2: HTTP Response Code: {response.status_code}")
-        if response.status_code != 200:
-            debug_message = f"HTTP 요청 실패. 상태 코드: {response.status_code}"
-            print(debug_message)
-            return debug_message
-
-        # Step 3: parse the HTML with BeautifulSoup
-        print("[DEBUG] Step 3: Parsing HTML with BeautifulSoup")
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Step 4: crawl the title
-        print("[DEBUG] Step 4: Crawling Title")
-        try:
-            title_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div"
-            )
-            title = title_element.get_text(strip=True) if title_element else "제목을 찾을 수 없습니다."
-            print(f"[DEBUG] Title: {title}")
-        except Exception as e:
-            debug_message = f"제목 크롤링 중 오류 발생: {e}"
-            print(debug_message)
-            title = debug_message
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        return f"Error while fetching the page: {e}"

-        ...
-        try:
-            content_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
-            )
-            content = content_element.get_text(strip=True) if content_element else "내용을 찾을 수 없습니다."
-            print(f"[DEBUG] Content: {content}")
-        except Exception as e:
-            debug_message = f"내용 크롤링 중 오류 발생: {e}"
-            print(debug_message)
-            content = debug_message
+    # Parse with BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')

-        ...
-        return {"제목": title, "내용": content}
-
-    except Exception as e:
-        debug_message = ...
-        print(debug_message)
-        return debug_message
-
-def gradio_interface(url):
-    print(f"[DEBUG] Gradio Input URL: {url}")
-    result = scrape_naver_blog(url)
-    print(f"[DEBUG] Crawling Result: {result}")
-    return f"제목: {result['제목']}\n내용: {result['내용']}"
+    # Scrape the title
+    try:
+        title = soup.select_one('div.se-fs-.se-ff-').text.strip()
+        print(f"Scraped Title: {title}")
+    except AttributeError:
+        title = "Title not found"
+        print("Failed to scrape the title")

-# ...
-...
+    # Scrape the content
+    try:
+        content_elements = soup.select('div.se-component-content > div.se-text-paragraph > span')
+        content = "\n".join([element.text.strip() for element in content_elements if element.text.strip()])
+        print(f"Scraped Content: {content[:100]}...")  # print the first part of the content
+    except AttributeError:
+        content = "Content not found"
+        print("Failed to scrape the content")
+
+    return f"제목: {title}\n\n내용: {content}"
+
+# Set up the Gradio interface
+interface = gr.Interface(
+    fn=scrape_naver_blog,
+    inputs=gr.Textbox(label="네이버 블로그 URL"),
+    outputs=gr.Textbox(label="스크래핑 결과"),
+    title="네이버 블로그 스크래핑",
+    description="네이버 블로그에서 제목과 내용을 스크래핑합니다. 모바일 URL을 입력하세요."
 )

 if __name__ == "__main__":
-    ...
+    interface.launch()
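
The rewrite drops the desktop page's brittle nth-of-type selector chains and manual status-code check in favor of the mobile page's stable se-* class names and response.raise_for_status(). For local debugging it can help to exercise the new scrape_naver_blog outside the Gradio UI; below is a minimal sketch, not part of the commit, assuming app.py is importable from the working directory. The blog URL is a hypothetical placeholder.

# check_scraper.py -- illustrative sketch, not part of this commit.
# Assumes app.py (above) is importable; the post URL is a placeholder.
from app import scrape_naver_blog

if __name__ == "__main__":
    # A desktop URL is fine here: scrape_naver_blog rewrites it to the
    # m.blog.naver.com form before sending the request.
    test_url = "https://blog.naver.com/example_user/223000000000"
    print(scrape_naver_blog(test_url))

Note that the new version calls requests.get(url) with no headers or timeout, while the old code sent a custom headers dict; if Naver starts rejecting the default python-requests User-Agent, reinstating those headers is the first fix to try.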