ainews-db / app.py
ginipick's picture
Update app.py
c8695fd verified
raw
history blame
40.6 kB
import gradio as gr
import requests
import json
import os
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from openai import OpenAI
from bs4 import BeautifulSoup
# Hugging Face token, read from the environment; it is also the API key
# handed to the OpenAI-compatible client below.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")
if not ACCESS_TOKEN:
    raise ValueError("HF_TOKEN environment variable is not set")

# OpenAI-compatible client pointed at the Hugging Face inference base URL.
client = OpenAI(
    api_key=ACCESS_TOKEN,
    base_url="https://api-inference.huggingface.co/v1/",
)

MAX_COUNTRY_RESULTS = 100  # maximum number of results per country
MAX_GLOBAL_RESULTS = 1000  # maximum number of results for the worldwide tab
def create_article_components(max_results):
    """Create `max_results` hidden article slots in the active Gradio context.

    Each slot is a `gr.Group` (initially invisible) holding a title Markdown,
    a 200x150 image, a snippet Markdown and an info Markdown.

    Returns:
        A list of dicts with the keys 'group', 'title', 'image', 'snippet',
        'info' and 'index' (the slot's zero-based position).
    """

    def _build_slot(position):
        # Components instantiated inside the `with` become children of the group.
        with gr.Group(visible=False) as container:
            heading = gr.Markdown()
            picture = gr.Image(width=200, height=150)
            summary = gr.Markdown()
            details = gr.Markdown()
        return dict(
            group=container,
            title=heading,
            image=picture,
            snippet=summary,
            info=details,
            index=position,
        )

    return [_build_slot(position) for position in range(max_results)]
# SERPHouse API key, read from the environment (None when unset).
API_KEY = os.getenv("SERPHOUSE_API_KEY")

# Country name -> language code used for translation and for the search request.
COUNTRY_LANGUAGES = {
    "United States": "en", "United Kingdom": "en", "Taiwan": "zh-TW",
    "Canada": "en", "Australia": "en", "Germany": "de",
    "France": "fr", "Japan": "ja", "China": "zh",
    "India": "hi", "Brazil": "pt", "Mexico": "es",
    "Russia": "ru", "Italy": "it", "Spain": "es",
    "Netherlands": "nl", "Singapore": "en", "Hong Kong": "zh-HK",
    "Indonesia": "id", "Malaysia": "ms", "Philippines": "tl",
    "Thailand": "th", "Vietnam": "vi", "Belgium": "nl",
    "Denmark": "da", "Finland": "fi", "Ireland": "en",
    "Norway": "no", "Poland": "pl", "Sweden": "sv",
    "Switzerland": "de", "Austria": "de", "Czech Republic": "cs",
    "Greece": "el", "Hungary": "hu", "Portugal": "pt",
    "Romania": "ro", "Turkey": "tr", "Israel": "he",
    "Saudi Arabia": "ar", "United Arab Emirates": "ar", "South Africa": "en",
    "Argentina": "es", "Chile": "es", "Colombia": "es",
    "Peru": "es", "Venezuela": "es", "New Zealand": "en",
    "Bangladesh": "bn", "Pakistan": "ur", "Egypt": "ar",
    "Morocco": "ar", "Nigeria": "en", "Kenya": "sw",
    "Ukraine": "uk", "Croatia": "hr", "Slovakia": "sk",
    "Bulgaria": "bg", "Serbia": "sr", "Estonia": "et",
    "Latvia": "lv", "Lithuania": "lt", "Slovenia": "sl",
    "Luxembourg": "fr", "Malta": "mt", "Cyprus": "el",
    "Iceland": "is",
}

# Country name -> location string sent to the search API. Every country maps
# to its own name, so the table is derived from COUNTRY_LANGUAGES (same keys,
# same order).
COUNTRY_LOCATIONS = {name: name for name in COUNTRY_LANGUAGES}
# Region definitions.
# Each region has a language table (country -> language code) and a location
# table (country -> location string). Every location equals the country name,
# so the location tables are derived from the language tables.

# East Asia
COUNTRY_LANGUAGES_EAST_ASIA = {
    "Taiwan": "zh-TW", "Japan": "ja", "China": "zh", "Hong Kong": "zh-HK",
}
COUNTRY_LOCATIONS_EAST_ASIA = {name: name for name in COUNTRY_LANGUAGES_EAST_ASIA}

# Southeast Asia / Oceania
# NOTE(review): "Papua New Guinea" is listed here but is not a key of the
# global COUNTRY_LANGUAGES / COUNTRY_LOCATIONS tables defined earlier in the file.
COUNTRY_LANGUAGES_SOUTHEAST_ASIA_OCEANIA = {
    "Indonesia": "id", "Malaysia": "ms", "Philippines": "tl",
    "Thailand": "th", "Vietnam": "vi", "Singapore": "en",
    "Papua New Guinea": "en", "Australia": "en", "New Zealand": "en",
}
COUNTRY_LOCATIONS_SOUTHEAST_ASIA_OCEANIA = {
    name: name for name in COUNTRY_LANGUAGES_SOUTHEAST_ASIA_OCEANIA
}

# Eastern Europe
COUNTRY_LANGUAGES_EAST_EUROPE = {
    "Poland": "pl", "Czech Republic": "cs", "Greece": "el",
    "Hungary": "hu", "Romania": "ro", "Ukraine": "uk",
    "Croatia": "hr", "Slovakia": "sk", "Bulgaria": "bg",
    "Serbia": "sr", "Estonia": "et", "Latvia": "lv",
    "Lithuania": "lt", "Slovenia": "sl", "Malta": "mt",
    "Cyprus": "el", "Iceland": "is", "Russia": "ru",
}
COUNTRY_LOCATIONS_EAST_EUROPE = {name: name for name in COUNTRY_LANGUAGES_EAST_EUROPE}

# Western Europe
COUNTRY_LANGUAGES_WEST_EUROPE = {
    "Germany": "de", "France": "fr", "Italy": "it",
    "Spain": "es", "Netherlands": "nl", "Belgium": "nl",
    "Ireland": "en", "Sweden": "sv", "Switzerland": "de",
    "Austria": "de", "Portugal": "pt", "Luxembourg": "fr",
    "United Kingdom": "en",
}
COUNTRY_LOCATIONS_WEST_EUROPE = {name: name for name in COUNTRY_LANGUAGES_WEST_EUROPE}

# Middle East / Africa
COUNTRY_LANGUAGES_ARAB_AFRICA = {
    "South Africa": "en", "Nigeria": "en", "Kenya": "sw",
    "Egypt": "ar", "Morocco": "ar", "Saudi Arabia": "ar",
    "United Arab Emirates": "ar", "Israel": "he",
}
COUNTRY_LOCATIONS_ARAB_AFRICA = {name: name for name in COUNTRY_LANGUAGES_ARAB_AFRICA}

# Americas
COUNTRY_LANGUAGES_AMERICA = {
    "United States": "en", "Canada": "en", "Mexico": "es",
    "Brazil": "pt", "Argentina": "es", "Chile": "es",
    "Colombia": "es", "Peru": "es", "Venezuela": "es",
}
COUNTRY_LOCATIONS_AMERICA = {name: name for name in COUNTRY_LANGUAGES_AMERICA}

# Region choices offered in the UI, in display order.
REGIONS = [
    "동아시아",
    "동남아시아/오세아니아",
    "동유럽",
    "서유럽",
    "중동/아프리카",
    "아메리카",
]
@lru_cache(maxsize=100)
def translate_query(query, country):
    """Translate a search query into the language mapped to `country`.

    ASCII-only queries (per `is_english`) and countries missing from
    `COUNTRY_LANGUAGES` are returned unchanged. Otherwise the query is sent to
    the Google Translate `translate_a/single` endpoint with automatic source
    detection.

    Args:
        query: The search text typed by the user.
        country: A country name; looked up in `COUNTRY_LANGUAGES`.

    Returns:
        The translated query, or `query` itself when no translation is needed
        or any error occurs (the error is printed, never raised).

    Note:
        Results are memoised by `lru_cache`, and that includes the fallback
        value returned after a failed request.
    """
    try:
        if is_english(query):
            return query
        if country not in COUNTRY_LANGUAGES:
            return query
        if country == "South Korea":
            return query

        params = {
            "client": "gtx",
            "sl": "auto",
            "tl": COUNTRY_LANGUAGES[country],
            "dt": "t",
            "q": query,
        }
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            retries = Retry(total=3, backoff_factor=0.5)
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(
                "https://translate.googleapis.com/translate_a/single",
                params=params,
                timeout=(5, 10),
            )
            response.raise_for_status()
            segments = response.json()[0]

        # The endpoint splits longer input into several segments; join them
        # all instead of keeping only the first one.
        translated_text = "".join(seg[0] for seg in segments if seg and seg[0])
        return translated_text or query
    except Exception as e:
        print(f"번역 오류: {str(e)}")
        return query
@lru_cache(maxsize=200)
def translate_to_korean(text):
    """Translate `text` into Korean via the Google Translate `translate_a/single` endpoint.

    Args:
        text: Text to translate; the source language is auto-detected.

    Returns:
        The Korean translation. `text` is returned unchanged when it is empty
        or whitespace-only, or when any error occurs (the error is printed,
        never raised).

    Note:
        Results are memoised by `lru_cache`, and that includes the fallback
        value returned after a failed request.
    """
    # Nothing to translate: skip the network round trip entirely.
    if not text or not text.strip():
        return text
    try:
        params = {
            "client": "gtx",
            "sl": "auto",
            "tl": "ko",
            "dt": "t",
            "q": text,
        }
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            retries = Retry(total=3, backoff_factor=0.5)
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(
                "https://translate.googleapis.com/translate_a/single",
                params=params,
                timeout=(5, 10),
            )
            response.raise_for_status()
            segments = response.json()[0]

        # Multi-sentence input comes back as several segments; join them all
        # so the summary is not cut off after the first sentence.
        translated_text = "".join(seg[0] for seg in segments if seg and seg[0])
        return translated_text or text
    except Exception as e:
        print(f"한글 번역 오류: {str(e)}")
        return text
def is_english(text):
    """Return True when every character of `text` is ASCII (code point < 128).

    Spaces, hyphens and underscores are ASCII themselves, so they never affect
    the result; an empty string counts as English.
    """
    return text.isascii()
def is_korean(text):
    """Return True when `text` contains at least one character in U+AC00..U+D7A3."""
    for ch in text:
        if '\uAC00' <= ch <= '\uD7A3':
            return True
    return False
def search_serphouse(query, country, page=1, num_result=10):
    """Run a SERPHouse live news search for `query` restricted to the last day.

    The query is first passed through `translate_query`, then posted to
    `https://api.serphouse.com/serp/live` with a date range of yesterday..today
    (UTC), sorted by date.

    Args:
        query: Search text as typed by the user.
        country: Country name; selects the location and language, defaulting
            to "United States" / "en" for unknown names.
        page: Accepted for interface compatibility. NOTE(review): the request
            always sends page "1"; this argument is not used.
        num_result: Accepted for interface compatibility. NOTE(review): the
            request always sends num "100"; this argument is not used.

    Returns:
        On success: `{"results": <decoded JSON>, "translated_query": <str>}`.
        On failure: `{"error": <message>, "translated_query": <str>}`; request
        and decoding errors are reported this way rather than raised.
    """
    # Function-scope import: the file-level import only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    url = "https://api.serphouse.com/serp/live"
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    now = datetime.now(timezone.utc)
    yesterday = now - timedelta(days=1)
    date_range = f"{yesterday.strftime('%Y-%m-%d')},{now.strftime('%Y-%m-%d')}"
    translated_query = translate_query(query, country)

    payload = {
        "data": {
            "q": translated_query,
            "domain": "google.com",
            "loc": COUNTRY_LOCATIONS.get(country, "United States"),
            "lang": COUNTRY_LANGUAGES.get(country, "en"),
            "device": "desktop",
            "serp_type": "news",
            "page": "1",
            "num": "100",
            "date_range": date_range,
            "sort_by": "date",
        }
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {API_KEY}",
    }

    def _failure(message):
        # Error payloads carry the query that was actually sent to the API.
        return {"error": message, "translated_query": translated_query}

    try:
        # Retry transient server errors and rate limiting; POST must be
        # allowed explicitly because it is not retried by default.
        retries = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504, 429],
            allowed_methods=["POST"],
        )
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.post(
                url,
                json=payload,
                headers=headers,
                timeout=(30, 30),  # (connect timeout, read timeout) in seconds
            )
            response.raise_for_status()
            return {"results": response.json(), "translated_query": translated_query}
    except requests.exceptions.Timeout:
        return _failure("검색 시간이 초과되었습니다. 잠시 후 다시 시도해주세요.")
    except requests.exceptions.RequestException as e:
        return _failure(f"검색 중 오류가 발생했습니다: {str(e)}")
    except Exception as e:
        return _failure(f"예기치 않은 오류가 발생했습니다: {str(e)}")
def format_results_from_raw(response_data):
    """Turn a `search_serphouse` payload into `(error_message, articles)`.

    Articles whose URL or channel contains a Korean-domain marker, or whose
    title contains a Korea-related keyword, are dropped.

    Args:
        response_data: Either `{"error": ...}` or
            `{"results": <API JSON>, "translated_query": <str>}`.

    Returns:
        A tuple `(error_message, articles)`. `error_message` is "" on success.
        Each article is a dict with the keys 'index' (1-based position in the
        unfiltered result list), 'title', 'link', 'snippet', 'channel', 'time',
        'image_url' and 'translated_query'. Any exception while processing is
        converted into an error message with an empty article list.
    """
    if "error" in response_data:
        return "Error: " + response_data["error"], []

    try:
        results = response_data["results"]
        translated_query = response_data["translated_query"]
        news_results = results.get('results', {}).get('results', {}).get('news', [])
        if not news_results:
            return "검색 결과가 없습니다.", []

        # Markers used to filter out Korean domains and Korea-related content.
        korean_domains = ('.kr', 'korea', 'korean', 'yonhap', 'hankyung', 'chosun',
                          'donga', 'joins', 'hani', 'koreatimes', 'koreaherald')
        korean_keywords = ('korea', 'korean', 'seoul', 'busan', 'incheon', 'daegu',
                           'gwangju', 'daejeon', 'ulsan', 'sejong')

        filtered_articles = []
        for idx, result in enumerate(news_results, 1):
            # `or ""` guards against keys that are present but None.
            link = result.get("url", result.get("link", "")) or ""
            channel = result.get("channel") or result.get("source") or ""

            # Lower-cased copies are for matching only; the original link is
            # returned because URL paths and queries are case-sensitive.
            link_lc = link.lower()
            channel_lc = channel.lower()
            title_lc = (result.get("title", "") or "").lower()

            is_korean_content = (
                any(marker in link_lc or marker in channel_lc for marker in korean_domains)
                or any(keyword in title_lc for keyword in korean_keywords)
            )
            if is_korean_content:
                continue

            filtered_articles.append({
                "index": idx,
                "title": result.get("title", "제목 없음"),
                "link": link,
                "snippet": result.get("snippet", "내용 없음"),
                "channel": channel or "알 수 없음",
                "time": result.get("time", result.get("date", "알 수 없는 시간")),
                "image_url": result.get("img", result.get("thumbnail", "")),
                "translated_query": translated_query,
            })
        return "", filtered_articles
    except Exception as e:
        return f"결과 처리 중 오류 발생: {str(e)}", []
def serphouse_search(query, country):
    """Search news for `query` in `country` and return `(error_message, articles)`.

    Thin wrapper: the raw `search_serphouse` payload is handed straight to
    `format_results_from_raw`.
    """
    return format_results_from_raw(search_serphouse(query, country))
def search_and_display(query, country, articles_state, progress=gr.Progress()):
    """Search one country and build the Gradio updates for the per-country tab.

    Args:
        query: Search text typed by the user.
        country: Country selected in the dropdown.
        articles_state: Previous article list held in `gr.State`; it is
            replaced by this call's result.
        progress: Gradio progress tracker.

    Returns:
        A list of updates: status message, translated-query display, error
        message, then five updates (group, title, image, snippet, info) for
        every entry of `article_components`, and finally the new article list.
    """

    def _hidden_slot():
        # Updates for one unused slot: hide the group, leave its children as-is.
        return [gr.update(visible=False), gr.update(), gr.update(),
                gr.update(), gr.update()]

    with ThreadPoolExecutor(max_workers=3) as executor:
        progress(0, desc="검색어 번역 중...")
        translated_query = translate_query(query, country)
        if translated_query != query:
            translated_display = f"**원본 검색어:** {query}\n**번역된 검색어:** {translated_query}"
        else:
            translated_display = f"**검색어:** {query}"

        progress(0.3, desc="검색 중...")
        response_data = search_serphouse(query, country)

        progress(0.6, desc="결과 처리 중...")
        error_message, articles = format_results_from_raw(response_data)

        outputs = [
            gr.update(value="검색을 진행중입니다...", visible=True),
            gr.update(value=translated_display, visible=True),
        ]

        if error_message:
            outputs.append(gr.update(value=error_message, visible=True))
            for _ in article_components:
                outputs.extend(_hidden_slot())
            articles_state = []
        else:
            outputs.append(gr.update(value="", visible=False))

            if articles:
                # Translate all snippets concurrently, then collect in order.
                futures = [
                    (article, executor.submit(translate_to_korean, article['snippet']))
                    for article in articles
                ]
                progress(0.8, desc="번역 처리 중...")
                for article, future in futures:
                    article['korean_summary'] = future.result()

            total_articles = len(articles)
            for idx, _ in enumerate(article_components):
                if idx >= total_articles:
                    outputs.extend(_hidden_slot())
                    continue

                # Progress is reported only for real articles, so the fraction
                # stays within (0, 1] and there is no division by zero when
                # every result was filtered out.
                progress((idx + 1) / total_articles,
                         desc=f"결과 표시 중... {idx + 1}/{total_articles}")
                article = articles[idx]
                image_url = article['image_url']
                if image_url and not image_url.startswith('data:image'):
                    image_update = gr.update(value=image_url, visible=True)
                else:
                    image_update = gr.update(value=None, visible=False)
                outputs.extend([
                    gr.update(visible=True),
                    gr.update(value=f"### [{article['title']}]({article['link']})"),
                    image_update,
                    gr.update(value=f"**요약:** {article['snippet']}\n\n**한글 요약:** {article['korean_summary']}"),
                    gr.update(value=f"**출처:** {article['channel']} | **시간:** {article['time']}"),
                ])
            articles_state = articles

        progress(1.0, desc="완료!")
        outputs.append(articles_state)
        # The search is finished: clear and hide the "in progress" status.
        outputs[0] = gr.update(value="", visible=False)
    return outputs
def get_region_countries(region):
    """Return the `(locations, languages)` tables for the selected region.

    Unknown region names yield a pair of empty dicts.
    """
    region_tables = {
        "동아시아": (COUNTRY_LOCATIONS_EAST_ASIA, COUNTRY_LANGUAGES_EAST_ASIA),
        "동남아시아/오세아니아": (
            COUNTRY_LOCATIONS_SOUTHEAST_ASIA_OCEANIA,
            COUNTRY_LANGUAGES_SOUTHEAST_ASIA_OCEANIA,
        ),
        "동유럽": (COUNTRY_LOCATIONS_EAST_EUROPE, COUNTRY_LANGUAGES_EAST_EUROPE),
        "서유럽": (COUNTRY_LOCATIONS_WEST_EUROPE, COUNTRY_LANGUAGES_WEST_EUROPE),
        "중동/아프리카": (COUNTRY_LOCATIONS_ARAB_AFRICA, COUNTRY_LANGUAGES_ARAB_AFRICA),
        "아메리카": (COUNTRY_LOCATIONS_AMERICA, COUNTRY_LANGUAGES_AMERICA),
    }
    if region in region_tables:
        return region_tables[region]
    return {}, {}
def search_global(query, region, articles_state_global):
    """Search every country of `region` and stream Gradio updates as results arrive.

    This is a generator: it yields once to reset the UI, once per country
    before searching it, once after each country that produced articles, and
    once more with the final status.

    Args:
        query: Search text typed by the user.
        region: One of the region names understood by `get_region_countries`.
        articles_state_global: Previous article list held in `gr.State`
            (not read; the state is replaced by the yielded outputs).

    Yields:
        A list of updates: status message, query display, five updates (group,
        title, image, snippet, info) for every entry of
        `global_article_components`, and finally the accumulated article list.
    """

    def _hidden_slot():
        # Updates for one unused slot: hide the group, leave its children as-is.
        return [gr.update(visible=False), gr.update(), gr.update(),
                gr.update(), gr.update()]

    status_msg = f"{region} 지역 검색을 시작합니다..."
    all_results = []
    # Defined up front so the final status works even when no country
    # returned any article.
    unique_results = []

    outputs = [
        gr.update(value=status_msg, visible=True),
        gr.update(value=f"**검색어:** {query}", visible=True),
    ]
    for _ in global_article_components:
        outputs.extend(_hidden_slot())
    outputs.append([])
    yield outputs

    # Countries belonging to the selected region.
    locations, _languages = get_region_countries(region)
    total_countries = len(locations)

    for country_no, country in enumerate(locations, 1):
        try:
            status_msg = f"{region} - {country} 검색 중... ({country_no}/{total_countries} 국가)"
            outputs[0] = gr.update(value=status_msg, visible=True)
            yield outputs

            error_message, articles = serphouse_search(query, country)
            if error_message or not articles:
                continue

            for article in articles:
                article['source_country'] = country
                article['region'] = region
            all_results.extend(articles)

            # NOTE(review): 'time' is compared as a plain string here; confirm
            # the API's time format sorts meaningfully this way.
            sorted_results = sorted(all_results, key=lambda x: x.get('time', ''), reverse=True)

            # Keep the first article seen for each link.
            seen_urls = set()
            unique_results = []
            for article in sorted_results:
                link = article.get('link', '')
                if link not in seen_urls:
                    seen_urls.add(link)
                    unique_results.append(article)
            unique_results = unique_results[:MAX_GLOBAL_RESULTS]

            outputs = [
                gr.update(value=f"{region} - {country_no}/{total_countries} 국가 검색 완료\n현재까지 발견된 뉴스: {len(unique_results)}건", visible=True),
                gr.update(value=f"**검색어:** {query} | **지역:** {region}", visible=True),
            ]
            for slot_no, _ in enumerate(global_article_components):
                if slot_no >= len(unique_results):
                    outputs.extend(_hidden_slot())
                    continue

                article = unique_results[slot_no]
                image_url = article.get('image_url', '')
                if image_url and not image_url.startswith('data:image'):
                    image_update = gr.update(value=image_url, visible=True)
                else:
                    image_update = gr.update(value=None, visible=False)

                # Remember the translation on the article so later refreshes
                # do not translate the same snippet again.
                if 'korean_summary' not in article:
                    article['korean_summary'] = translate_to_korean(article['snippet'])
                korean_summary = article['korean_summary']

                outputs.extend([
                    gr.update(visible=True),
                    gr.update(value=f"### [{article['title']}]({article['link']})"),
                    image_update,
                    gr.update(value=f"**요약:** {article['snippet']}\n\n**한글 요약:** {korean_summary}"),
                    gr.update(value=f"**출처:** {article['channel']} | **국가:** {article['source_country']} | **지역:** {article['region']} | **시간:** {article['time']}"),
                ])
            outputs.append(unique_results)
            yield outputs
        except Exception as e:
            # Best effort: a failure in one country must not stop the others.
            print(f"Error searching {country}: {str(e)}")
            continue

    final_status = f"{region} 검색 완료! 총 {len(unique_results)}개의 뉴스가 발견되었습니다."
    outputs[0] = gr.update(value=final_status, visible=True)
    yield outputs
# Custom stylesheet for the app; it is passed to `gr.Blocks(css=css, ...)`
# where the interface is built. It hides the footer and styles the status and
# results areas, tabs, article groups, buttons, text inputs, a fixed progress
# bar with its label, loading state, and the examples table.
css = """
/* 전역 스타일 */
footer {visibility: hidden;}
/* 레이아웃 컨테이너 */
#status_area {
background: rgba(255, 255, 255, 0.9);
padding: 15px;
border-bottom: 1px solid #ddd;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
#results_area {
padding: 10px;
margin-top: 10px;
}
/* 탭 스타일 */
.tabs {
border-bottom: 2px solid #ddd !important;
margin-bottom: 20px !important;
}
.tab-nav {
border-bottom: none !important;
margin-bottom: 0 !important;
}
.tab-nav button {
font-weight: bold !important;
padding: 10px 20px !important;
}
.tab-nav button.selected {
border-bottom: 2px solid #1f77b4 !important;
color: #1f77b4 !important;
}
/* 상태 메시지 */
#status_area .markdown-text {
font-size: 1.1em;
color: #2c3e50;
padding: 10px 0;
}
/* 기본 컨테이너 */
.group {
border: 1px solid #eee;
padding: 15px;
margin-bottom: 15px;
border-radius: 5px;
background: white;
}
/* 버튼 스타일 */
.primary-btn {
background: #1f77b4 !important;
border: none !important;
}
/* 입력 필드 */
.textbox {
border: 1px solid #ddd !important;
border-radius: 4px !important;
}
/* 프로그레스바 컨테이너 */
.progress-container {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 6px;
background: #e0e0e0;
z-index: 1000;
}
/* 프로그레스바 */
.progress-bar {
height: 100%;
background: linear-gradient(90deg, #2196F3, #00BCD4);
box-shadow: 0 0 10px rgba(33, 150, 243, 0.5);
transition: width 0.3s ease;
animation: progress-glow 1.5s ease-in-out infinite;
}
/* 프로그레스 텍스트 */
.progress-text {
position: fixed;
top: 8px;
left: 50%;
transform: translateX(-50%);
background: #333;
color: white;
padding: 4px 12px;
border-radius: 15px;
font-size: 14px;
z-index: 1001;
box-shadow: 0 2px 5px rgba(0,0,0,0.2);
}
/* 프로그레스바 애니메이션 */
@keyframes progress-glow {
0% {
box-shadow: 0 0 5px rgba(33, 150, 243, 0.5);
}
50% {
box-shadow: 0 0 20px rgba(33, 150, 243, 0.8);
}
100% {
box-shadow: 0 0 5px rgba(33, 150, 243, 0.5);
}
}
/* 반응형 디자인 */
@media (max-width: 768px) {
.group {
padding: 10px;
margin-bottom: 15px;
}
.progress-text {
font-size: 12px;
padding: 3px 10px;
}
}
/* 로딩 상태 표시 개선 */
.loading {
opacity: 0.7;
pointer-events: none;
transition: opacity 0.3s ease;
}
/* 결과 컨테이너 애니메이션 */
.group {
transition: all 0.3s ease;
opacity: 0;
transform: translateY(20px);
}
.group.visible {
opacity: 1;
transform: translateY(0);
}
/* Examples 스타일링 */
.examples-table {
margin-top: 10px !important;
margin-bottom: 20px !important;
}
.examples-table button {
background-color: #f0f0f0 !important;
border: 1px solid #ddd !important;
border-radius: 4px !important;
padding: 5px 10px !important;
margin: 2px !important;
transition: all 0.3s ease !important;
}
.examples-table button:hover {
background-color: #e0e0e0 !important;
transform: translateY(-1px) !important;
box-shadow: 0 2px 5px rgba(0,0,0,0.1) !important;
}
.examples-table .label {
font-weight: bold !important;
color: #444 !important;
margin-bottom: 5px !important;
}
"""
def get_article_content(url):
    """Download `url` and return the article text extracted from its `<p>` tags.

    The first match among `<article>`, `div.article-body`, `div.content` and
    `div#article-body` is used as the article container; when none is found,
    every `<p>` on the page is used instead.

    Args:
        url: Address of the news page to fetch.

    Returns:
        The paragraph texts joined with single spaces. On any failure
        (including a timeout or an HTTP error status) the string
        "Error crawling content: <reason>" is returned instead of raising.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # A timeout keeps an unresponsive site from blocking the request forever.
        response = requests.get(url, headers=headers, timeout=(5, 15))
        # Do not treat an HTTP error page as article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the usual article-body containers, most specific tag first.
        article_body = (
            soup.find('article')
            or soup.find('div', class_='article-body')
            or soup.find('div', class_='content')
            or soup.find('div', {'id': 'article-body'})
        )

        if article_body:
            # Remove elements that are not part of the article text.
            for tag in article_body.find_all(['script', 'style', 'nav', 'header', 'footer']):
                tag.decompose()
            paragraphs = article_body.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        texts = (p.get_text().strip() for p in paragraphs)
        return ' '.join(text for text in texts if text)
    except Exception as e:
        return f"Error crawling content: {str(e)}"
def respond(
    url,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Fetch a news article and stream a Korean translation plus rewritten article.

    This is a generator used as a Gradio event handler: every yielded value is
    the full chat history, whose last entry is updated as tokens arrive.

    Args:
        url: Address of the article; must start with "http".
        history: Chat history as (user, assistant) tuples; mutated in place.
        system_message: System prompt for the model. When blank, a built-in
            Korean two-step (translate, then write) prompt is used.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The updated history list. Errors are appended to the history as an
        assistant message instead of being raised.
    """
    if history is None:
        history = []
    url = (url or "").strip()

    if not url.startswith('http'):
        history.append((url, "올바른 URL을 입력해주세요."))
        # A generator must yield for the message to reach the UI.
        yield history
        return

    try:
        # Extract the article text.
        article_content = get_article_content(url)
        if not article_content.strip() or article_content.startswith("Error crawling content:"):
            # Do not send a crawl failure to the model as if it were an article.
            history.append((url, f"기사 내용을 가져오지 못했습니다: {article_content}"))
            yield history
            return

        # Prompt describing the two-step process (translate, then write).
        translation_prompt = f"""다음 작업을 순차적으로 수행하세요:
1단계: 번역
아래 영문 기사를 한국어로 정확하게 번역하세요.
구분선: ===번역 시작===
{article_content}
구분선: ===번역 끝===
2단계: 기사 작성
위의 번역된 내용을 바탕으로 새로운 한국어 기사를 작성하세요.
다음 형식을 반드시 준수하세요:
- 제목: [헤드라인]
- 부제: [서브헤드라인]
- 본문: [기사 내용]
- 작성 규칙:
* 문장은 '다.'로 끝나야 함
* 신문 기사 형식 준수
* 단락 구분을 명확히 할 것
* 핵심 정보를 앞부분에 배치
* 인용구는 따옴표로 처리
각 단계는 '===번역===', '===기사==='로 구분하여 출력하세요.
"""
        default_system_prompt = """당신은 전문 번역가이자 기자입니다.
모든 작업은 반드시 다음 두 단계로 진행하고, 각 단계를 명확히 구분하여 출력해야 합니다:
1. 원문 번역: ===번역=== 표시 후 정확한 한국어 번역 제공
2. 기사 작성: ===기사=== 표시 후 번역본을 기반으로 한국어 뉴스 기사 작성
두 단계를 건너뛰거나 통합하지 말고 반드시 순차적으로 진행하세요."""
        # Honour the caller-supplied system prompt; fall back to the default
        # only when it is missing or blank.
        if system_message and system_message.strip():
            system_prompt = system_message
        else:
            system_prompt = default_system_prompt

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": translation_prompt},
        ]

        history.append((url, "번역 및 기사 작성을 시작합니다..."))
        yield history

        full_response = ""
        stream = client.chat.completions.create(
            model="CohereForAI/c4ai-command-r-plus-08-2024",
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            messages=messages,
        )
        for chunk in stream:
            # Skip chunks that carry no choices or no text.
            if not chunk.choices:
                continue
            token = getattr(chunk.choices[0].delta, 'content', None)
            if not token:
                continue

            full_response += token
            # Put a blank line after a section marker when one arrives whole.
            if "===번역===" in token or "===기사===" in token:
                full_response += "\n\n"
            history[-1] = (url, full_response)
            yield history
    except Exception as e:
        error_message = f"처리 중 오류가 발생했습니다: {str(e)}"
        history.append((url, error_message))
        yield history
# Build the three-tab interface (per-country search, regional search,
# AI article generation) and wire its events.
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="NewsAI 서비스") as iface:
    with gr.Tabs():
        # Per-country tab
        with gr.Tab("국가별"):
            gr.Markdown("검색어를 입력하고 원하는 국가(한국 제외)를 선택하면, 검색어와 일치하는 24시간 이내 뉴스를 최대 100개 출력합니다.")
            gr.Markdown("국가 선택후 검색어에 '한글'을 입력하면 현지 언어로 번역되어 검색합니다. 예: 'Taiwan' 국가 선택후 '삼성' 입력시 '三星'으로 자동 검색")

            with gr.Column():
                with gr.Row():
                    query = gr.Textbox(label="검색어")
                    country = gr.Dropdown(
                        choices=sorted(list(COUNTRY_LOCATIONS.keys())),
                        label="국가",
                        value="United States"
                    )

                # Frequently used example queries
                gr.Examples(
                    examples=[
                        "artificial intelligence",
                        "NVIDIA",
                        "OPENAI",
                        "META LLAMA",
                        "black forest labs",
                        "GOOGLE gemini",
                        "anthropic Claude",
                        "X.AI",
                        "HUGGINGFACE",
                        "HYNIX",
                        "Large Language model",
                        "CHATGPT",
                        "StabilityAI",
                        "MISTRALAI",
                        "QWEN",
                        "MIDJOURNEY",
                        "GPU"
                    ],
                    inputs=query,
                    label="자주 사용되는 검색어"
                )

                status_message = gr.Markdown("", visible=True)
                translated_query_display = gr.Markdown(visible=False)
                search_button = gr.Button("검색", variant="primary")
                progress = gr.Progress()
                articles_state = gr.State([])

                # One hidden slot per possible result; the shared helper
                # replaces a hand-written copy of the same loop.
                article_components = create_article_components(MAX_COUNTRY_RESULTS)

        # Worldwide (regional) tab
        with gr.Tab("전세계"):
            gr.Markdown("대륙별로 24시간 이내 뉴스를 검색합니다.")

            with gr.Column():
                with gr.Column(elem_id="status_area"):
                    with gr.Row():
                        query_global = gr.Textbox(label="검색어")
                        region_select = gr.Dropdown(
                            choices=REGIONS,
                            label="지역 선택",
                            value="동아시아"
                        )
                        search_button_global = gr.Button("검색", variant="primary")
                    status_message_global = gr.Markdown("")
                    translated_query_display_global = gr.Markdown("")

                with gr.Column(elem_id="results_area"):
                    articles_state_global = gr.State([])
                    global_article_components = create_article_components(MAX_GLOBAL_RESULTS)

        # AI article-generation tab
        with gr.Tab("AI 기사 생성"):
            gr.Markdown("뉴스 URL을 입력하면 AI가 한국어로 번역하여 기사 형식으로 작성합니다.")

            with gr.Column():
                chatbot = gr.Chatbot(height=600)

                with gr.Row():
                    url_input = gr.Textbox(
                        label="뉴스 URL",
                        placeholder="https://..."
                    )

                with gr.Accordion("고급 설정", open=False):
                    system_message = gr.Textbox(
                        value="""You are a professional translator and journalist. Follow these steps strictly:
1. TRANSLATION
- Start with ===번역=== marker
- Provide accurate Korean translation
- Maintain original meaning and context
2. ARTICLE WRITING
- Start with ===기사=== marker
- Write a new Korean news article based on the translation
- Follow newspaper article format
- Use formal news writing style
- End sentences with '다.'
- Include headline and subheadline
- Organize paragraphs clearly
- Put key information first
- Use quotes appropriately
IMPORTANT:
- Must complete both steps in order
- Clearly separate each section with markers
- Never skip or combine steps""",
                        label="System message"
                    )

                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=7800,
                        value=7624,
                        step=1,
                        label="Max new tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-P"
                    )

                translate_button = gr.Button("기사 생성", variant="primary")

            # Event wiring for the article-generation tab
            translate_button.click(
                fn=respond,
                inputs=[
                    url_input,
                    chatbot,
                    system_message,
                    max_tokens,
                    temperature,
                    top_p,
                ],
                outputs=chatbot
            )

    # Event wiring for the search tabs.
    # Per-country tab: status, translated query, error message, then five
    # components per article slot, then the state. This order must match the
    # list returned by `search_and_display`.
    search_outputs = [status_message, translated_query_display, gr.Markdown(visible=False)]
    for comp in article_components:
        search_outputs.extend([
            comp['group'], comp['title'], comp['image'],
            comp['snippet'], comp['info']
        ])
    search_outputs.append(articles_state)

    search_button.click(
        fn=search_and_display,
        inputs=[query, country, articles_state],
        outputs=search_outputs,
        show_progress=True
    )

    # Worldwide tab: status, query display, five components per article slot,
    # then the state. This order must match what `search_global` yields.
    global_search_outputs = [status_message_global, translated_query_display_global]
    for comp in global_article_components:
        global_search_outputs.extend([
            comp['group'], comp['title'], comp['image'],
            comp['snippet'], comp['info']
        ])
    global_search_outputs.append(articles_state_global)

    search_button_global.click(
        fn=search_global,
        inputs=[query_global, region_select, articles_state_global],
        outputs=global_search_outputs,
        show_progress=True
    )
# Start the web server on all interfaces, port 7860, behind basic auth.
# The credentials can be overridden through the environment so that they do
# not have to live in source control; the defaults keep the existing login.
# NOTE(review): `share=True` and `ssl_verify=False` are kept as-is; confirm
# both are still wanted for this deployment.
iface.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
    auth=(
        os.getenv("APP_AUTH_USERNAME", "ai"),
        os.getenv("APP_AUTH_PASSWORD", "news"),
    ),
    ssl_verify=False,
    show_error=True
)