Update app.py
app.py CHANGED
@@ -1,76 +1,53 @@
-import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+import gradio as gr

 def scrape_naver_blog(url):
+    # Debugging: check the incoming URL
+    print(f"Scraping URL: {url}")
+
+    # Convert to the mobile form of the URL
+    if not url.startswith("https://m.blog.naver.com"):
+        url = url.replace("https://blog.naver.com", "https://m.blog.naver.com")
+        print(f"Converted to mobile URL: {url}")
+
+    # Send the request
     try:
-        headers = {
-            ...
-        }
-        response = requests.get(url, headers=headers)
-
-        # Step 2: check whether the HTTP request succeeded
-        print(f"[DEBUG] Step 2: HTTP Response Code: {response.status_code}")
-        if response.status_code != 200:
-            debug_message = f"HTTP 요청 실패. 상태 코드: {response.status_code}"
-            print(debug_message)
-            return debug_message
-
-        # Step 3: parse the HTML with BeautifulSoup
-        print("[DEBUG] Step 3: Parsing HTML with BeautifulSoup")
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Step 4: crawl the title
-        print("[DEBUG] Step 4: Crawling Title")
-        try:
-            title_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div"
-            )
-            title = title_element.get_text(strip=True) if title_element else "제목을 찾을 수 없습니다."
-            print(f"[DEBUG] Title: {title}")
-        except Exception as e:
-            debug_message = f"제목 크롤링 중 오류 발생: {e}"
-            print(debug_message)
-            title = debug_message
+        response = requests.get(url)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        return f"Error while fetching the page: {e}"

-        ...
-        try:
-            content_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
-            )
-            content = content_element.get_text(strip=True) if content_element else "내용을 찾을 수 없습니다."
-            print(f"[DEBUG] Content: {content}")
-        except Exception as e:
-            debug_message = f"내용 크롤링 중 오류 발생: {e}"
-            print(debug_message)
-            content = debug_message
+    # Parse with BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')

-        ...
-        return {"제목": title, "내용": content}
-
-    except Exception as e:
-        debug_message = ...
-        print(debug_message)
-        return debug_message
-
-def gradio_interface(url):
-    print(f"[DEBUG] Gradio Input URL: {url}")
-    result = scrape_naver_blog(url)
-    print(f"[DEBUG] Crawling Result: {result}")
-    return f"제목: {result['제목']}\n내용: {result['내용']}"
+    # Scrape the title
+    try:
+        title = soup.select_one('div.se-fs-.se-ff-').text.strip()
+        print(f"Scraped Title: {title}")
+    except AttributeError:
+        title = "Title not found"
+        print("Failed to scrape the title")

-# ...
-...
+    # Scrape the content
+    try:
+        content_elements = soup.select('div.se-component-content > div.se-text-paragraph > span')
+        content = "\n".join([element.text.strip() for element in content_elements if element.text.strip()])
+        print(f"Scraped Content: {content[:100]}...")  # print the first part of the content
+    except AttributeError:
+        content = "Content not found"
+        print("Failed to scrape the content")
+
+    return f"제목: {title}\n\n내용: {content}"
+
+# Set up the Gradio interface
+interface = gr.Interface(
+    fn=scrape_naver_blog,
+    inputs=gr.Textbox(label="네이버 블로그 URL"),
+    outputs=gr.Textbox(label="스크래핑 결과"),
+    title="네이버 블로그 스크래핑",
+    description="네이버 블로그에서 제목과 내용을 스크래핑합니다. 모바일 URL을 입력하세요."
 )

 if __name__ == "__main__":
-    ...
+    interface.launch()
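
The rewrite drops the desktop page's brittle nth-of-type selector chains and manual status-code check in favor of the mobile page's stable se-* class names and response.raise_for_status(). For local debugging it can help to exercise the new scrape_naver_blog outside the Gradio UI; below is a minimal sketch, not part of the commit, assuming app.py is importable from the working directory. The blog URL is a hypothetical placeholder.

# check_scraper.py -- illustrative sketch, not part of this commit.
# Assumes app.py (above) is importable; the post URL is a placeholder.
from app import scrape_naver_blog

if __name__ == "__main__":
    # A desktop URL is fine here: scrape_naver_blog rewrites it to the
    # m.blog.naver.com form before sending the request.
    test_url = "https://blog.naver.com/example_user/223000000000"
    print(scrape_naver_blog(test_url))

Note that the new version calls requests.get(url) with no headers or timeout, while the old code sent a custom headers dict; if Naver starts rejecting the default python-requests User-Agent, reinstating those headers is the first fix to try.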