Upload 4 files
- Dockerfile +21 -0
- app.py +205 -0
- crawler.py +135 -0
- requirements.txt +21 -0
Dockerfile
ADDED
@@ -0,0 +1,21 @@
FROM python:3.11.11-bookworm

# Set working directory
WORKDIR /app

# Copy only requirements first to leverage Docker layer caching
COPY requirements.txt /app/

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . /app

# Switch to a non-root user (optional)
# RUN useradd -m appuser && chown -R appuser /app
# USER appuser

# Default command (replace "app:app" if your FastAPI entry point differs)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py
ADDED
@@ -0,0 +1,205 @@
from openai import OpenAI
import google.generativeai as genai
from crawler import extract_data
import time
import os
from dotenv import load_dotenv
import gradio as gr
# from together import Together
# from transformers import AutoModel, AutoTokenizer
# from sklearn.metrics.pairwise import cosine_similarity
# import torch
#
# load_dotenv("../.env")
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# together_client = Together(
#     api_key=os.getenv("TOGETHER_API_KEY"),
# )

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

gemini_query = genai.GenerativeModel('gemini-2.0-flash-exp')

gemini_summarizer = genai.GenerativeModel('gemini-1.5-flash')

perplexity_client = OpenAI(api_key=os.getenv("PERPLEXITY_API_KEY"), base_url="https://api.perplexity.ai")
# gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# with torch.no_grad():
#     model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
#     tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta')

# def cal_score(input_data):
#     similarity_scores = []
#     # Initialize model and tokenizer inside the function
#     with torch.no_grad():
#         inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
#         outputs = model.get_input_embeddings()(inputs["input_ids"])

#     for ind in range(1, outputs.size(0)):
#         a, b = outputs[0], outputs[ind]

#         a = a.reshape(1, -1)
#         b = b.reshape(1, -1)

#         a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
#         b_norm = torch.nn.functional.normalize(b, p=2, dim=1)

#         similarity_scores.append(cosine_similarity(a_norm, b_norm))  # Scalar value
#     return similarity_scores

def get_answers(query: str):
    context = extract_data(query, 1)
    # if len(context) > 1:
    #     scores = cal_score([query] + [answer['questionDetails'] for answer in context])
    #     context = [context for _, context in sorted(zip(scores, context), key=lambda x: x[0], reverse=True)]
    #     mean_score = sum(scores) / len(scores)
    #     context = [ctx for score, ctx in zip(scores, context) if score >= mean_score]
    return context

def get_gemini_query(message: str):
    print(">>> Starting gemini query generation...")

    response = gemini_query.generate_content(message)

    print("Finished gemini query generation: ", response.text)
    return response.text

def get_naver_answers(message: str):
    print(">>> Starting naver extraction...")
    print("Question: ", message)

    # Long questions are first condensed by Gemini into a short Korean title
    # to use as the Naver search query.
    if len(message) > 300:
        message = get_gemini_query(f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요. 한국어로만 답변해주세요!!!")

    print("Query: ", message)

    context = get_answers(message)

    # Format each scraped Q&A pair as "질문:" (question) / "답변:" (answers).
    sorted_answers = [
        f"{index}. 질문: {answer['questionDetails']}" + '\n' + f" 답변: {'. '.join(answer['answers'])} " + '\n'
        for (index, answer) in enumerate(context)
    ]

    document = '\n'.join(sorted_answers)
    return document

def get_perplexity_answer(message: str):
    print(">>> Starting perplexity extraction...")
    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "engage in a helpful, CONCISE, polite question-answer conversation with a user."
            ),
        },
        {
            "role": "user",
            "content": message,
        },
    ]
    response = perplexity_client.chat.completions.create(
        model="llama-3.1-sonar-small-128k-online",
        messages=messages
    )
    return response.choices[0].message.content


def chatFunction(history):
    # MAX_TOKEN_LIMIT = 58000
    start_time = time.time()
    message = history[-1][0]
    # content = f' 질문과 답변으로 구성된 문서를 드리겠습니다. \
    #     아래에 제공된 질문에 답하기 위해 중요한 정보를 추출하세요. \
    #     한국어로만 답변하세요. 구체적이지만 간결하게 작성하세요. \
    #     실제 보험상담사가 답변을 하듯이 친절한 답변을 해 주세요. \n 질문: {message}\n 문서: '

    # Prompt (Korean): answer in the first person as a friendly insurance
    # consultant, replacing third-party mentions such as "KB손해보험 설계사
    # OOO입니다", so the questioner is encouraged to ask follow-ups.
    content = f' 보험설계사가 답을 줘서, 더 많은 질문이나 합당한 보험에 가입할 수 있도록 답변을 하려고 합니다. \
        문서에 있는 제3자 언급을 1인칭으로 바꾸세요. 예를 들어 "KB손해보험 설계사 OOO입니다" 등 제3자가 언급된 경우 "보험기관입니다"로 대체합니다. \
        이러한 답변을 통해서 질문자가 이 답변을 보고 보험설계사에게 더 신뢰를 갖고 추가 질문이 있으면 물어볼 수 있도록 하려고 합니다. \
        실제 보험상담사가 답변을 하듯이 친절한 답변을 해 주세요. \n 질문: {message}\n 문서: '
    naver_docs = get_naver_answers(message)
    print(len(naver_docs))

    # if len(naver_docs) > MAX_TOKEN_LIMIT:
    #     print("HERE")
    #     start_tmp = time.time()
    #     overlap = 200
    #     answers = []
    #     split_len = len(naver_docs) // ( ( len(naver_docs) - MAX_TOKEN_LIMIT ) // MAX_TOKEN_LIMIT + 2 ) + 1
    #     print(len(naver_docs) // split_len)
    #     for i in range( len(naver_docs) // split_len ):
    #         print("HERE: ", i)
    #         if i == 0:
    #             split = naver_docs[:split_len]
    #         else:
    #             split = naver_docs[i * split_len - overlap: (i + 1) * split_len]
    #         answer, _ = get_qwen_small_answer(f"Summarize important points in a paragraph, given the information below, using only Korean language. Give me only the summary!!! \n {split}")
    #         answers.append(answer)
    #     print("Answers: ", answers)
    #     naver_docs = '\n'.join(answers)
    #     naver_time_taken += time.time() - start_tmp

    # print("Post chunking length: ", len(naver_docs) )
    content += "\n Naver 문서: " + naver_docs


    ### Extracting from Perplexity ###

    perplexity_resp = get_perplexity_answer(message)
    content += "\n Perplexity 문서: " + perplexity_resp

    print(">>> Starting Gemini summarization...")

    response = gemini_summarizer.generate_content(content, stream=True)

    history[-1][1] = ''
    ans = ""
    for chunk in response:
        ans += chunk.text.replace("*", "")
        # Yield the accumulated answer so far, so each chunk supersedes the last.
        yield ans.strip() + "\n"
        time.sleep(0.05)

    print("Finished Gemini summarization")
    print("Time taken: ", time.time() - start_time)

def set_user_response(message: str, history: list):
    history.append([message, None])
    return '', history

### Server-side code ###

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def root():
    return {"message": "Hello World"}

class Message(BaseModel):
    message: str

@app.post("/chat")
async def chat(message: Message):
    history = [[message.message, None]]
    return StreamingResponse(
        chatFunction(history),
        media_type='text/event-stream'
    )
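For reference, the /chat endpoint streams plain-text chunks, and each chunk is a progressively longer snapshot of the answer (chatFunction yields the accumulated text on every Gemini chunk). A minimal client sketch, assuming the server is reachable on localhost:7860 and using an illustrative question:

import requests

URL = "http://localhost:7860/chat"  # hypothetical local address; on Spaces, use the public endpoint

with requests.post(URL, json={"message": "실비보험 청구는 어떻게 하나요?"}, stream=True) as resp:
    resp.raise_for_status()
    latest = ""
    for chunk in resp.iter_content(chunk_size=None):
        # Each yield from chatFunction is the answer so far, so the most
        # recently received chunk is the most complete one.
        latest = chunk.decode("utf-8", errors="replace")
    print(latest)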
crawler.py
ADDED
@@ -0,0 +1,135 @@
from bs4 import BeautifulSoup
import re
import requests as r
from html2text import html2text
import tqdm
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import multiprocessing


# def from_desktop_to_mobile_version(url):
#     """Convert a desktop URL to its mobile version."""
#     return url.replace("https://kin.naver.com", "https://m.kin.naver.com")

def initialize_webdriver():
    """Initialize and return a WebDriver instance with headless options."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(options=options, service=service)

def process_url(url):
    driver = initialize_webdriver()
    try:
        print("Processing URL:", url)
        driver.get(url)
        # Dismiss the promotional popup if it appears.
        closeBtn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".layer_promotion_choice_inner > .ico_close_layer")),
            message="Close button not found."
        )
        if closeBtn:
            print("Closing the popup")
            closeBtn.click()
            time.sleep(0.2)
            print("CLOSED")

        # Expand paginated answers before scraping the page source.
        expandBtn = driver.find_element(By.ID, 'nextPageButton')
        print("Expand button: ", expandBtn)
        if expandBtn.is_displayed():
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(expandBtn),
                message="Expand button wasn't loaded in time."
            )
            expandBtn.click()
            print("Clicked the expand button")
            time.sleep(0.5)

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, "html.parser")

        answers = soup.find_all('div', {'class': 'answerDetail'})
        answers = [html2text(str(answer.prettify())) for answer in answers]

        title = soup.find('div', {'class': 'endTitleSection'}).text.strip()

        questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()

        # Drop the "질문" ("question") label from the title.
        title = title.replace("질문", '').strip()

        print("Answers extracted from: \n", url)
        print(len(answers))
        print('-'*60)

        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers
        }
    except Exception as e:
        print(f"Error processing URL {url} \n\n\n{e}")
        # Append so earlier failures aren't overwritten.
        with open('error_urls.txt', 'a') as f:
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": ''}
    finally:
        driver.quit()


def get_answers(results_a_elements, query):
    """Fetch answers for all the extracted result links."""
    if not results_a_elements:
        print("No results found.")
        return []

    print("Result links extracted: ", len(results_a_elements))

    # Limit the number of parallel processes for better resource management
    max_processes = 4

    with multiprocessing.Pool(processes=max_processes) as pool:
        results = pool.map(process_url, results_a_elements)

    # results = []
    # # answer_count = 0
    # for url in tqdm.tqdm(results_a_elements):
    #     res = process_url(url)
    #     results.append(res)
    #     answer_count += len(res['answers'])
    return results

def get_search_results(query, num_pages):
    """Fetch search results for the given query from Naver 지식in."""
    results = []
    for page in range(1, num_pages + 1):
        url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
        print("Starting the scraping process for:\n", url)

        try:
            response = r.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            results_a_elements = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
            results_a_elements = [a.get('href') for a in results_a_elements if a.get("href")]
            results += results_a_elements
        except Exception as e:
            print(f"Error while fetching search results: {e}")
    return results

def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    results_a_elements = get_search_results(query, num_pages)
    print(results_a_elements)
    answers = get_answers(results_a_elements, query)
    print("Total answers collected:", len(answers))
    return answers


# if __name__ == "__main__":
#     process_url("https://kin.naver.com/qna/detail.naver?d1id=4&dirId=401030203&docId=478845808&qb=67O07ZeYIOyImOyIoOu5hA==&enc=utf8&section=kin.qna_ency&rank=1&search_sort=0&spq=0")
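For reference, a minimal sketch of driving the crawler directly. The query string is illustrative, Chrome must be installed locally (webdriver-manager downloads a matching chromedriver), and the __main__ guard matters because get_answers spawns a multiprocessing pool:

from crawler import extract_data

if __name__ == "__main__":
    # One search page keeps the run small; each result URL is opened in its
    # own headless Chrome instance inside the worker pool.
    results = extract_data("실비보험 청구", num_pages=1)
    for item in results:
        print(item["title"], "|", item["url"])
        print(f"  {len(item['answers'])} answer(s) extracted")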
requirements.txt
ADDED
@@ -0,0 +1,21 @@
beautifulsoup4
selenium
webdriver-manager
fastapi[standard]
# uvicorn[standard]
html2text
# transformers
openai
google-genai
# transformers[torch]
# torch
# torchvision
# torchaudio
gradio
# scikit-learn
together
python-dotenv
openpyxl
# tonic-validate
google-generativeai
uvicorn