anisrashidov commited on
Commit
ee46c3b
·
verified ·
1 Parent(s): 81afb88

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +21 -0
  2. app.py +205 -0
  3. crawler.py +135 -0
  4. requirements.txt +21 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11.11-bookworm
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Copy only requirements first to leverage Docker layer caching
7
+ COPY requirements.txt /app/
8
+
9
+
10
+ # Install dependencies
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy the rest of the application code
14
+ COPY . /app
15
+
16
+ # Switch to a non-root user (optional)
17
+ # RUN useradd -m appuser && chown -R appuser /app
18
+ # USER appuser
19
+
20
+ # Default command: serve the FastAPI app defined in app.py (app:app) with uvicorn on port 7860
21
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import google.generativeai as genai
3
+ from crawler import extract_data
4
+ import time
5
+ import os
6
+ from dotenv import load_dotenv
7
+ import gradio as gr
8
+ # from together import Together
9
+ # from transformers import AutoModel, AutoTokenizer
10
+ # from sklearn.metrics.pairwise import cosine_similarity
11
+ # import torch
12
+ #
13
+ # load_dotenv("../.env")
14
+ # os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
+
16
+ # together_client = Together(
17
+ # api_key=os.getenv("TOGETHER_API_KEY"),
18
+ # )
19
+
20
# Gemini: one model condenses long questions into a search query,
# the other writes the final streamed answer.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_query = genai.GenerativeModel('gemini-2.0-flash-exp')
gemini_summarizer = genai.GenerativeModel('gemini-1.5-flash')

# Perplexity is reached through the OpenAI client pointed at Perplexity's base URL.
perplexity_client = OpenAI(
    api_key=os.getenv("PERPLEXITY_API_KEY"),
    base_url="https://api.perplexity.ai",
)
27
+ # gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
28
+
29
+ # with torch.no_grad():
30
+ # model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
31
+ # tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta')
32
+
33
+ # def cal_score(input_data):
34
+ # similarity_scores = []
35
+ # # Initialize model and tokenizer inside the function
36
+ # with torch.no_grad():
37
+ # inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
38
+ # outputs = model.get_input_embeddings()(inputs["input_ids"])
39
+
40
+ # for ind in range(1, outputs.size(0)):
41
+ # a, b = outputs[0], outputs[ind]
42
+
43
+ # a = a.reshape(1, -1)
44
+ # b = b.reshape(1, -1)
45
+
46
+ # a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
47
+ # b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
48
+
49
+ # similarity_scores.append(cosine_similarity(a_norm, b_norm)) # Scalar value
50
+ # return similarity_scores
51
+
52
def get_answers(query: str):
    """Return the Q&A entries crawled from Naver for ``query``.

    Only the first search-result page is crawled (``extract_data(query, 1)``).
    Similarity-based re-ranking of the entries is currently disabled, so the
    entries are returned in the order the crawler produced them.
    """
    crawled_entries = extract_data(query, 1)
    return crawled_entries
60
+
61
def get_gemini_query(message: str):
    """Send ``message`` to the Gemini query model and return the reply text."""
    print(">>> Starting gemini query generation...")

    generated_text = gemini_query.generate_content(message).text

    print("Finished gemini query generation: ", generated_text)
    return generated_text
68
+
69
def get_naver_answers(message: str):
    """Build a numbered question/answer document from Naver results for ``message``.

    Messages longer than 300 characters are first condensed into a short
    Korean title by Gemini, and that title is used as the search query.

    Args:
        message: The user's question.

    Returns:
        A single string with one numbered "질문/답변" entry per crawled page
        that yielded at least one answer; an empty string when nothing usable
        was found.
    """
    print(">>> Starting naver extraction...")
    print("Question: ", message)

    if len(message) > 300:
        message = get_gemini_query(f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요. 한국어로만 답변해주세요!!!")

    print("Query: ", message)

    context = get_answers(message)

    # Pages the crawler failed on come back with empty fields; rendering them
    # would only add blank, numbered entries to the LLM context.
    usable_entries = [entry for entry in context if entry['answers']]

    sorted_answers = [
        f"{index}. 질문: {answer['questionDetails']}" + '\n' + f" 답변: {'. '.join(answer['answers'])} " + '\n'
        for index, answer in enumerate(usable_entries)
    ]

    return '\n'.join(sorted_answers)
87
+
88
def get_perplexity_answer(message: str):
    """Ask the Perplexity chat model ``message`` and return the reply text."""
    print(">>> Starting perplexity extraction...")

    system_prompt = (
        "You are an artificial intelligence assistant and you need to "
        "engage in a helpful, CONCISE, polite question-answer conversation with a user."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]

    completion = perplexity_client.chat.completions.create(
        model="llama-3.1-sonar-small-128k-online",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
110
+
111
+
112
def chatFunction(history):
    """Stream an insurance-advisor style answer for the latest user message.

    The question is taken from ``history[-1][0]``. Naver Q&A documents and a
    Perplexity answer are gathered as context, appended to a Korean instruction
    prompt, and passed to the Gemini summarizer in streaming mode.

    Args:
        history: List of ``[user_message, answer]`` pairs; the last pair is the
            turn being answered. Its answer slot is reset to ``''``.

    Yields:
        The cumulative answer so far (asterisks removed, stripped, terminated
        by a newline) each time a new chunk of text arrives.
    """
    start_time = time.time()
    message = history[-1][0]

    # Built by concatenation rather than backslash continuations so that
    # source indentation does not leak into the prompt.
    content = (
        ' 보험설계사가 답을 줘서, 더 많은 질문이나 합당한 보험에 가입할 수 있도록 답변을 하려고 합니다. '
        '문서에 있는 제3자 언급을 1인칭으로 바꾸세요. 예를 들어 "KB손해보험 설계사 OOO입니다" 등 제3자가 언급된 경우 "보험기관입니다"로 대체합니다. '
        '이러한 답변을 통해서 질문자가 이 답변을 보고 보험설계사에게 더 신뢰를 갖고 추가 질문이 있으면 물어볼 수 있도록 하려고 합니다. '
        '실제 보험상담사가 답변을 하듯이 친절한 답변을 해 주세요. '
        f'\n 질문: {message}\n 문서: '
    )

    naver_docs = get_naver_answers(message)
    print(len(naver_docs))
    content += "\n Naver 문서: " + naver_docs

    ### Extracting from Perplexity ###

    perplexity_resp = get_perplexity_answer(message)
    content += "\n Perplexity 문서: " + perplexity_resp

    print(">>> Starting Gemini summarization...")

    response = gemini_summarizer.generate_content(content, stream=True)

    history[-1][1] = ''
    ans = ""
    for chunk in response:
        try:
            piece = chunk.text
        except ValueError:
            # The `.text` accessor raises ValueError for a chunk that carries
            # no text part; skip it instead of aborting the whole stream.
            continue
        # Asterisks are stripped so markdown emphasis does not reach the client.
        ans += piece.replace("*", "")
        yield ans.strip() + "\n"
        # Short pause between chunks to pace the streamed output.
        time.sleep(0.05)

    print("Finished Gemini summarization")
    print("Time taken: ", time.time() - start_time)
169
+
170
def set_user_response(message: str, history: list):
    """Append a new user turn with a pending answer to ``history``.

    The list is mutated in place. Returns a tuple of an empty string and the
    same history list.
    """
    pending_turn = [message, None]
    history.append(pending_turn)
    return '', history
173
+
174
+ ### Server-side code ###
175
+
176
+ from fastapi import FastAPI
177
+ from fastapi.responses import StreamingResponse
178
+ from pydantic import BaseModel
179
+ from fastapi.middleware.cors import CORSMiddleware
180
+
181
app = FastAPI()

# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive — confirm this is intended before exposing the API publicly.
_cors_settings = {
    "allow_origins": ['*'],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
189
+
190
+
191
# Index route: answers GET / with a fixed greeting payload.
@app.get("/")
async def root():
    greeting = {"message": "Hello World"}
    return greeting
194
+
195
# Request body accepted by POST /chat.
class Message(BaseModel):
    # The user's question; it becomes the single user turn passed to chatFunction.
    message: str
197
+
198
# Chat route: wraps the posted message in a one-turn history and streams
# whatever chatFunction yields back to the client.
@app.post("/chat")
async def chat(message: Message):
    single_turn_history = [[message.message, None]]
    answer_stream = chatFunction(single_turn_history)
    return StreamingResponse(answer_stream, media_type='text/event-stream')
205
+
crawler.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import re
3
+ import requests as r
4
+ from html2text import html2text
5
+ import tqdm
6
+ import time
7
+
8
+ from selenium import webdriver
9
+ from selenium.webdriver.common.by import By
10
+ from selenium.webdriver.chrome.service import Service
11
+ from selenium.webdriver.support.wait import WebDriverWait
12
+ from selenium.webdriver.support import expected_conditions as EC
13
+ from webdriver_manager.chrome import ChromeDriverManager
14
+ import multiprocessing
15
+
16
+
17
+ # def from_desktop_to_mobile_version(url):
18
+ # """Convert a desktop URL to its mobile version."""
19
+ # return url.replace("https://kin.naver.com", "https://m.kin.naver.com")
20
+
21
def initialize_webdriver():
    """Initialize and return a headless Chrome WebDriver instance.

    The chromedriver binary is resolved through ``ChromeDriverManager``.

    Returns:
        A ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    # The Dockerfile leaves the container running as root (its USER line is
    # commented out), and Chrome will not start as root with the sandbox on.
    options.add_argument("--no-sandbox")
    # Containers commonly ship a small /dev/shm; let Chrome use /tmp instead.
    options.add_argument("--disable-dev-shm-usage")

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(options=options, service=service)
29
+
30
def process_url(url):
    """Scrape one Naver question page and return its question and answers.

    A fresh headless browser is started for the page and always shut down
    afterwards. The promotion popup and the "expand" button are both treated
    as optional: when either is absent the page is scraped as-is.

    Args:
        url: Address of the question detail page.

    Returns:
        A dict with keys ``title``, ``questionDetails``, ``url`` and
        ``answers`` (list of answer texts). On failure the text fields are
        empty, ``answers`` is an empty list, and the URL is appended to
        ``error_urls.txt``.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from selenium.common.exceptions import TimeoutException

    driver = initialize_webdriver()
    try:
        print("Processing URL:", url)
        driver.get(url)

        # The popup is not guaranteed to appear; a timeout here just means
        # there is nothing to close, not that the page is unusable.
        try:
            close_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".layer_promotion_choice_inner > .ico_close_layer")),
                message="Close button not found."
            )
        except TimeoutException:
            close_btn = None

        if close_btn is not None:
            print("Closing the popup")
            close_btn.click()
            time.sleep(0.2)
            print("CLOSED")

        # find_elements returns an empty list instead of raising when the
        # button does not exist on this page.
        expand_buttons = driver.find_elements(By.ID, 'nextPageButton')
        if expand_buttons and expand_buttons[0].is_displayed():
            expand_btn = expand_buttons[0]
            print("Expand button: ", expand_btn)
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(expand_btn),
                message="Expand button wasn't loaded in time."
            )
            expand_btn.click()
            print("Clicked the expand button")
            time.sleep(0.5)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        answers = [
            html2text(str(answer.prettify()))
            for answer in soup.find_all('div', {'class': 'answerDetail'})
        ]

        # Missing title/question nodes should not discard answers already found.
        title_node = soup.find('div', {'class': 'endTitleSection'})
        title = title_node.text.strip() if title_node is not None else ''
        title = title.replace("질문", '').strip()

        question_node = soup.find('div', {'class': 'questionDetail'})
        questionDetails = question_node.text.strip() if question_node is not None else ''

        print("Answers extracted from: \n", url)
        print(len(answers))
        print('-'*60)

        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers
        }
    except Exception as e:
        # Worker boundary: one bad page must not abort the whole crawl.
        print(f"Error processing URL {url} \n\n\n{e}")
        # Append so that earlier failures are not overwritten.
        with open('error_urls.txt', 'a', encoding='utf-8') as f:
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": []}
    finally:
        driver.quit()
85
+
86
+
87
def get_answers(results_a_elements, query):
    """Fetch answers for all the extracted result links.

    Args:
        results_a_elements: List of question-page URLs to scrape.
        query: The search query; not used here, kept for interface
            compatibility.

    Returns:
        One result dict per URL, in input order, as produced by
        ``process_url``; an empty list when there are no URLs.
    """
    if not results_a_elements:
        print("No results found.")
        return []

    print("Result links extracted: ", len(results_a_elements))

    # Limit the number of parallel processes for better resource management,
    # and never start more workers than there are URLs to process.
    max_processes = 4
    worker_count = min(max_processes, len(results_a_elements))

    with multiprocessing.Pool(processes=worker_count) as pool:
        results = pool.map(process_url, results_a_elements)

    return results
108
+
109
def get_search_results(query, num_pages):
    """Fetch search results for the given query from Naver 지식in.

    Args:
        query: Free-text search query; it is URL-encoded by ``requests``.
        num_pages: Number of result pages to fetch, starting at page 1.

    Returns:
        List of result-link URLs collected across the pages. Pages that fail
        to load or contain no result list are skipped.
    """
    search_url = "https://kin.naver.com/search/list.naver"
    results = []
    for page in range(1, num_pages + 1):
        print("Starting the scraping process for:\n", f"{search_url}?query={query}&page={page}")

        try:
            # Passing the query via params escapes characters such as '&' or
            # '#' that would otherwise corrupt the URL; the timeout keeps a
            # stalled connection from hanging the request indefinitely.
            response = r.get(search_url, params={"query": query, "page": page}, timeout=10)
            response.raise_for_status()
        except r.RequestException as e:
            print(f"Error while fetching search results: {e}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        result_list = soup.find("ul", {"class": "basic1"})
        if result_list is None:
            print("Error while fetching search results: result list not found on page", page)
            continue

        anchors = result_list.find_all("a", {"class": "_searchListTitleAnchor"})
        results += [a.get('href') for a in anchors if a.get("href")]
    return results
125
+
126
def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    """Search Naver for ``query`` and scrape every result page found.

    Collects result links from ``num_pages`` search pages, then fetches the
    question and answers behind each link. Returns the list of per-page
    result dicts.
    """
    result_links = get_search_results(query, num_pages)
    print(result_links)

    collected = get_answers(result_links, query)
    print("Total answers collected:", len(collected))
    return collected
132
+
133
+
134
+ # if __name__ == "__main__":
135
+ # process_url("https://kin.naver.com/qna/detail.naver?d1id=4&dirId=401030203&docId=478845808&qb=67O07ZeYIOyImOyIoOu5hA==&enc=utf8&section=kin.qna_ency&rank=1&search_sort=0&spq=0")
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
beautifulsoup4
selenium
webdriver-manager
fastapi[standard]
# uvicorn[standard]
html2text
# transformers
openai
google-genai
# transformers[torch]
# torch
# torchvision
# torchaudio
gradio
# scikit-learn
together
python-dotenv
openpyxl
# tonic-validate
google-generativeai
uvicorn
# Imported directly by crawler.py; declared explicitly instead of relying on transitive installs
requests
tqdm