AIRider committed
Commit 80d51fc · verified · 1 Parent(s): 9ce0218

Update app.py

Files changed (1)
  1. app.py +42 -65
app.py CHANGED
@@ -1,76 +1,53 @@
- import gradio as gr
  import requests
  from bs4 import BeautifulSoup

  def scrape_naver_blog(url):
      try:
-         # Step 1: set the User-Agent
-         print("[DEBUG] Step 1: Setting User-Agent")
-         headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
-         }
-         response = requests.get(url, headers=headers)
-
-         # Step 2: check whether the HTTP request succeeded
-         print(f"[DEBUG] Step 2: HTTP Response Code: {response.status_code}")
-         if response.status_code != 200:
-             debug_message = f"HTTP 요청 실패. 상태 코드: {response.status_code}"
-             print(debug_message)
-             return debug_message
-
-         # Step 3: parse the HTML with BeautifulSoup
-         print("[DEBUG] Step 3: Parsing HTML with BeautifulSoup")
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         # Step 4: crawl the title
-         print("[DEBUG] Step 4: Crawling Title")
-         try:
-             title_element = soup.select_one(
-                 "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div"
-             )
-             title = title_element.get_text(strip=True) if title_element else "제목을 찾을 수 없습니다."
-             print(f"[DEBUG] Title: {title}")
-         except Exception as e:
-             debug_message = f"제목 크롤링 중 오류 발생: {e}"
-             print(debug_message)
-             title = debug_message

-         # Step 5: crawl the content
-         print("[DEBUG] Step 5: Crawling Content")
-         try:
-             content_element = soup.select_one(
-                 "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
-             )
-             content = content_element.get_text(strip=True) if content_element else "내용을 찾을 수 없습니다."
-             print(f"[DEBUG] Content: {content}")
-         except Exception as e:
-             debug_message = f"내용 크롤링 중 오류 발생: {e}"
-             print(debug_message)
-             content = debug_message

-         # Step 6: return the results
-         print("[DEBUG] Step 6: Returning Results")
-         return {"제목": title, "내용": content}
-
-     except Exception as e:
-         debug_message = f"전체 크롤링 중 오류 발생: {e}"
-         print(debug_message)
-         return debug_message
-
- def gradio_interface(url):
-     print(f"[DEBUG] Gradio Input URL: {url}")
-     result = scrape_naver_blog(url)
-     print(f"[DEBUG] Crawling Result: {result}")
-     return f"제목: {result['제목']}\n내용: {result['내용']}"

- # Build the Gradio interface
- iface = gr.Interface(
-     fn=gradio_interface,
-     inputs=gr.Textbox(label="네이버 블로그 URL 입력"),
-     outputs=gr.Textbox(label="크롤링 결과"),
-     title="네이버 블로그 크롤러",
-     description="네이버 블로그의 제목과 내용을 크롤링하여 출력합니다."
  )

  if __name__ == "__main__":
-     iface.launch()
  import requests
  from bs4 import BeautifulSoup
+ import gradio as gr

  def scrape_naver_blog(url):
+     # Debugging: log the URL being scraped
+     print(f"Scraping URL: {url}")
+
+     # Convert to the mobile URL form
+     if not url.startswith("https://m.blog.naver.com"):
+         url = url.replace("https://blog.naver.com", "https://m.blog.naver.com")
+         print(f"Converted to mobile URL: {url}")
+
+     # Send the request
      try:
+         response = requests.get(url)
+         response.raise_for_status()
+     except requests.RequestException as e:
+         return f"Error while fetching the page: {e}"

+     # Parse with BeautifulSoup
+     soup = BeautifulSoup(response.text, 'html.parser')

+     # Scrape the title
+     try:
+         title = soup.select_one('div.se-fs-.se-ff-').text.strip()
+         print(f"Scraped Title: {title}")
+     except AttributeError:
+         title = "Title not found"
+         print("Failed to scrape the title")

+     # Scrape the content
+     try:
+         content_elements = soup.select('div.se-component-content > div.se-text-paragraph > span')
+         content = "\n".join([element.text.strip() for element in content_elements if element.text.strip()])
+         print(f"Scraped Content: {content[:100]}...")  # print a snippet of the content
+     except AttributeError:
+         content = "Content not found"
+         print("Failed to scrape the content")
+
+     return f"제목: {title}\n\n내용: {content}"
+
+ # Set up the Gradio interface
+ interface = gr.Interface(
+     fn=scrape_naver_blog,
+     inputs=gr.Textbox(label="네이버 블로그 URL"),
+     outputs=gr.Textbox(label="스크래핑 결과"),
+     title="네이버 블로그 스크래핑",
+     description="네이버 블로그에서 제목과 내용을 스크래핑합니다. 모바일 URL을 입력하세요."
  )

  if __name__ == "__main__":
+     interface.launch()
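
The rewrite swaps the brittle nth-of-type selector chains for class-based selectors, which assume the mobile page renders posts with Naver's SmartEditor ONE markup (title in a div carrying se-fs-/se-ff- classes, body text in se-text-paragraph spans). Below is a minimal sketch for sanity-checking those selectors outside Gradio; the post URL is a hypothetical placeholder, while the selectors and 'html.parser' mirror the commit.

    import requests
    from bs4 import BeautifulSoup

    # Hypothetical post URL for illustration; substitute a real m.blog.naver.com link.
    url = "https://m.blog.naver.com/example_blog/223000000000"

    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # The same selectors the commit introduces.
    title_element = soup.select_one("div.se-fs-.se-ff-")
    paragraph_spans = soup.select("div.se-component-content > div.se-text-paragraph > span")

    print("Title:", title_element.text.strip() if title_element else "not found")
    print("Paragraph spans found:", len(paragraph_spans))

If both come back empty on a post that renders fine in a browser, the post likely predates SmartEditor ONE, and these class-based selectors would need a fallback.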