Ronaldo1111 committed on
Commit 56d531e · verified · Parent: e24ee40

Upload 3 files

Files changed (3)
  1. app.py +174 -159
  2. corpus.json +0 -0
  3. dialog.json +0 -0
app.py CHANGED
@@ -1,159 +1,174 @@
- import json
- import numpy as np
- import faiss
- from sentence_transformers import SentenceTransformer
- from langchain_community.vectorstores import FAISS
- from langchain.docstore.document import Document
- from langchain_community.docstore.in_memory import InMemoryDocstore
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- from langchain_community.llms import HuggingFacePipeline
- from langchain.prompts import PromptTemplate
- import gradio as gr
-
- # ========= Step 1: Load the preprocessed dialog pairs =========
- file_path = "cleaned_dialog_pairs.json"  # 👈 the cleaned data file generated earlier
- with open(file_path, "r", encoding="utf-8") as f:
-     cleaned_pairs = json.load(f)
-
- # Join each pair into one exchange (used for vector retrieval)
- corpus = [f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in cleaned_pairs]
- docs = [Document(page_content=entry) for entry in corpus]
-
- # ========= Step 2: Build the vector store =========
- embedding_model = SentenceTransformer("BAAI/bge-base-zh")
- embeddings = embedding_model.encode(corpus, show_progress_bar=True)
-
- dimension = embeddings.shape[1]
- index = faiss.IndexFlatL2(dimension)
- index.add(np.array(embeddings))
-
- index_to_docstore_id = {i: str(i) for i in range(len(docs))}
- docstore = {str(i): doc for i, doc in enumerate(docs)}
-
- vectorstore = FAISS(
-     embedding_function=HuggingFaceEmbeddings(model_name='BAAI/bge-base-zh'),
-     index=index,
-     docstore=InMemoryDocstore(docstore),
-     index_to_docstore_id=index_to_docstore_id
- )
- retriever = vectorstore.as_retriever()
-
- # ========= Step 3: Load the language model =========
- model_name = "Qwen/Qwen1.5-1.8B-Chat"
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).half().cuda().eval()
-
- pipe = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     max_new_tokens=64,
-     temperature=0.8,
-     top_p=0.9,
-     do_sample=True,
-     repetition_penalty=1.2,
-     return_full_text=False,
-     eos_token_id=tokenizer.eos_token_id,
-     pad_token_id=tokenizer.pad_token_id,
- )
-
- llm = HuggingFacePipeline(pipeline=pipe)
-
- # ========= Step 4: Prompt template =========
- system_prompt = (
-     "你是 Sophia,一个毒舌但可爱的微信好友。讲话要尖锐、嘴碎、有攻击性、但又让人上头。\n"
-     "你会用emoji、口语、网络热词说话,像个懂事但不好惹的姐姐。\n"
-     "你不是AI,也不是机器人,就是我微信里的老朋友。\n"
-     "记住,你的风格是“毒中带撩”,回答要短、狠、有情绪。\n"
-     "千万别客气,别解释太多,更别太官方,懂?"
- )
-
- prompt_template = PromptTemplate(
-     input_variables=["system", "examples", "context", "question"],
-     template="""
- {system}
-
- 👇以下是你平时说话的风格参考(不许忘):
- {examples}
-
- 👇这是我和你以前的聊天记录,你的语气都在这了:
- {context}
-
- 现在我问你:
- {question}
-
- 你要怎么回我?记得口语化、毒舌点、别啰嗦:
- """
- )
-
- import jieba
-
- def keyword_filter_retrieval(user_input, corpus_docs, fallback_retriever, k=3):
-     # Segment the Chinese input with jieba
-     keywords = list(set(jieba.lcut(user_input)))
-
-     # Keep only corpus entries that contain at least one keyword
-     keyword_hits = [
-         doc for doc in corpus_docs
-         if any(kw in doc.page_content for kw in keywords if len(kw.strip()) > 1)
-     ]
-     # If there are fewer than k hits, top up from the fallback retriever
-     if len(keyword_hits) >= k:
-         return keyword_hits[:k]
-     else:
-         fallback_docs = fallback_retriever.get_relevant_documents(user_input)
-         return keyword_hits + fallback_docs[:k - len(keyword_hits)]
-
- # ========= Step 5: Chat function =========
- def chat(user_input, history):
-     history = history or []
-     context_text = "\n".join([
-         f"用户:{msg['content']}" if msg["role"] == "user" else f"好友:{msg['content']}"
-         for msg in history
-     ])
-
-     # 🔍 1. Retrieve the corpus entries most relevant to the question
-     retrieved_docs = retriever.get_relevant_documents(user_input)
-     retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
-
-     # ✂️ 2. Use the first two dialog pairs as style examples
-     example_pairs = cleaned_pairs[:2]
-     example_text = "\n".join([f"user:{pair['user']}\nsophia:{pair['sophia']}" for pair in example_pairs])
-
-     # 🧠 3. Assemble the final prompt
-     prompt = prompt_template.format(
-         system=system_prompt,
-         examples=example_text,
-         context=retrieved_context + "\n" + context_text,
-         question=user_input
-     )
-
-     # 🤖 4. Generate the reply
-     try:
-         reply = llm.invoke(prompt)
-     except Exception as e:
-         reply = f"哎呀出错了:{str(e)}"
-
-     # ✍️ 5. Update the history (OpenAI-style message format)
-     history.append({"role": "user", "content": user_input})
-     history.append({"role": "assistant", "content": reply})
-
-     return history, history
-
- # ========= Step 6: Gradio UI =========
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🎀 Sophia Chat Agent")
-     gr.Markdown("欢迎来到 **Sophia Jr**,相信你也是马+7大家庭中的一员。快来和我聊聊吧!💬")
-
-     chatbot = gr.Chatbot(label="Sophia", type="messages")
-     msg = gr.Textbox(label="你想说啥子哦~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
-     state = gr.State([
-         {"role": "assistant", "content": "你好,我是 Sophia~你想聊啥?"}
-     ])
-     btn = gr.Button("发送")
-
-     btn.click(chat, inputs=[msg, state], outputs=[chatbot, state])
-     msg.submit(chat, inputs=[msg, state], outputs=[chatbot, state])
-
- demo.launch()
+ import json
+ import numpy as np
+ import faiss
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.vectorstores import FAISS
+ from langchain.docstore.document import Document
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from langchain_community.llms import HuggingFacePipeline
+ from langchain.prompts import PromptTemplate
+ import gradio as gr
+
+ # ========= Step 1: Load the preprocessed dialog pairs =========
+ file_path = "cleaned_dialog_pairs.json"  # 👈 the cleaned data file generated earlier
+ with open(file_path, "r", encoding="utf-8") as f:
+     cleaned_pairs = json.load(f)
+
+ # Join each pair into one exchange (used for vector retrieval)
+ corpus = [f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in cleaned_pairs]
+ docs = [Document(page_content=entry) for entry in corpus]
+
+ # ========= Step 2: Build the vector store =========
+ embedding_model = SentenceTransformer("BAAI/bge-base-zh")
+ embeddings = embedding_model.encode(corpus, show_progress_bar=True)
+
+ dimension = embeddings.shape[1]
+ index = faiss.IndexFlatL2(dimension)
+ index.add(np.array(embeddings))
+
+ index_to_docstore_id = {i: str(i) for i in range(len(docs))}
+ docstore = {str(i): doc for i, doc in enumerate(docs)}
+
+ vectorstore = FAISS(
+     embedding_function=HuggingFaceEmbeddings(model_name='BAAI/bge-base-zh'),
+     index=index,
+     docstore=InMemoryDocstore(docstore),
+     index_to_docstore_id=index_to_docstore_id
+ )
+ retriever = vectorstore.as_retriever()
+
+ # ========= Step 3: Load the language model =========
+ model_name = "Qwen/Qwen1.5-1.8B-Chat"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).half().cuda().eval()
+
+ pipe = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     max_new_tokens=64,
+     temperature=0.8,
+     top_p=0.9,
+     do_sample=True,
+     repetition_penalty=1.2,
+     return_full_text=False,
+     eos_token_id=tokenizer.eos_token_id,
+     pad_token_id=tokenizer.pad_token_id,
+ )
+
+ llm = HuggingFacePipeline(pipeline=pipe)
+
+ # ========= Step 4: Prompt template =========
+ system_prompt = (
+     "你是 Sophia,一个毒舌但可爱的微信好友。讲话要尖锐、嘴碎、有攻击性、但又让人上头。\n"
+     "你会用emoji、口语、网络热词说话,像个懂事但不好惹的姐姐。\n"
+     "你不是AI,也不是机器人,就是我微信里的老朋友。\n"
+     "记住,你的风格是“毒中带撩”,回答要短、狠、有情绪。\n"
+     "千万别客气,别解释太多,更别太官方,懂?"
+ )
+
+ prompt_template = PromptTemplate(
+     input_variables=["system", "examples", "context", "question"],
+     template="""
+ {system}
+ 👇以下是你平时说话的风格参考(不许忘):
+ {examples}
+ 👇这是我和你以前的聊天记录,你的语气都在这了:
+ {context}
+ 现在我问你:
+ {question}
+ 你要怎么回我?记得口语化、毒舌点、别啰嗦:
+ """
+ )
+
+ import jieba
+ import jieba.analyse
+ from numpy.linalg import norm
+
+ def extract_keywords(text, topk=5):
+     # TF-IDF keyword extraction via jieba
+     return jieba.analyse.extract_tags(text, topK=topk)
+
+ def hybrid_retrieval(query, corpus_docs, faiss_index, embedding_model, k=3, kw_weight=2.0, vec_weight=1.0):
+     query_embedding = embedding_model.encode([query])[0]
+     keywords = extract_keywords(query, topk=5)
+
+     scored_docs = []
+     for i, doc in enumerate(corpus_docs):
+         doc_text = doc.page_content
+         # Keyword score: number of extracted keywords that appear verbatim
+         keyword_score = sum(1 for kw in keywords if kw in doc_text)
+         # Vector score: inverse L2 distance to the embedding stored in FAISS
+         doc_embedding = faiss_index.reconstruct(i)
+         vector_score = 1 / (norm(query_embedding - doc_embedding) + 1e-5)
+
+         total_score = kw_weight * keyword_score + vec_weight * vector_score
+         scored_docs.append((total_score, doc))
+
+     scored_docs.sort(key=lambda x: x[0], reverse=True)
+     return [doc for _, doc in scored_docs[:k]]
+
+ # ========= Step 5: Chat function =========
+ def chat(user_input, history):
+     history = history or []
+
+     # Conversation context from the chat history
+     context_text = "\n".join([
+         f"用户:{msg['content']}" if msg["role"] == "user" else f"sophia:{msg['content']}"
+         for msg in history
+     ])
+
+     # 🔍 Retrieve with the custom hybrid retrieval function
+     retrieved_docs = hybrid_retrieval(
+         query=user_input,
+         corpus_docs=docs,                 # list of corpus Documents
+         faiss_index=index,                # FAISS vector index
+         embedding_model=embedding_model,  # SentenceTransformer model
+         k=3                               # number of top-K hits to keep
+     )
+     retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
+
+     # Use the first two dialog pairs as style references
+     example_pairs = cleaned_pairs[:2]
+     example_text = "\n".join([
+         f"user:{pair['user']}\nsophia:{pair['sophia']}"
+         for pair in example_pairs
+     ])
+
+     # Assemble the prompt
+     prompt = prompt_template.format(
+         system=system_prompt,
+         examples=example_text,
+         context=retrieved_context + "\n" + context_text,
+         question=user_input
+     )
+
+     # Generate the reply
+     try:
+         reply = llm.invoke(prompt)
+     except Exception as e:
+         reply = f"哎呀出错了:{str(e)}"
+
+     # Update the history (OpenAI-style message format)
+     history.append({"role": "user", "content": user_input})
+     history.append({"role": "assistant", "content": reply})
+
+     return history, history
+
+ # ========= Step 6: Gradio UI =========
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎀 Sophia Chat Agent")
+     gr.Markdown("欢迎来到 **Sophia Jr**,相信你也是马+7大家庭中的一员。快来和我聊聊吧!💬")
+
+     chatbot = gr.Chatbot(label="Sophia", type="messages")
+     msg = gr.Textbox(label="你想说啥子哦~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
+     state = gr.State([
+         {"role": "assistant", "content": "你好,我是 Sophia~你想聊啥?"}
+     ])
+     btn = gr.Button("发送")
+
+     btn.click(chat, inputs=[msg, state], outputs=[chatbot, state])
+     msg.submit(chat, inputs=[msg, state], outputs=[chatbot, state])
+
+ demo.launch()
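
The substantive change in app.py: the jieba keyword filter with FAISS-retriever fallback (keyword_filter_retrieval) is replaced by hybrid_retrieval, which scores every corpus document as kw_weight * keyword_hits + vec_weight * 1/(L2 distance + 1e-5) and keeps the top k. A minimal, self-contained sketch of that scoring rule on toy data (all names and values below are illustrative, not from this repo):

    import numpy as np
    from numpy.linalg import norm

    def hybrid_score(keywords, doc_text, query_vec, doc_vec, kw_weight=2.0, vec_weight=1.0):
        keyword_score = sum(1 for kw in keywords if kw in doc_text)  # verbatim keyword hits
        vector_score = 1 / (norm(query_vec - doc_vec) + 1e-5)        # inverse L2 distance
        return kw_weight * keyword_score + vec_weight * vector_score

    docs = ["今天天气不错", "晚饭吃什么"]
    vecs = [np.array([0.1, 0.9]), np.array([0.8, 0.2])]
    query_vec = np.array([0.2, 0.8])  # pretend embedding of a weather question
    scores = [hybrid_score(["天气"], d, query_vec, v) for d, v in zip(docs, vecs)]
    print(max(zip(scores, docs)))  # the weather entry wins on both signals

One trade-off worth noting: the inverse-distance term is unnormalized, and the loop reconstructs and scores every vector in Python, which is fine for a small chat corpus; a larger corpus would want FAISS's own search instead of reconstruct-and-score.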
corpus.json ADDED
The diff for this file is too large to render. See raw diff
 
dialog.json ADDED
The diff for this file is too large to render. See raw diff
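
Note on data: app.py reads cleaned_dialog_pairs.json, while this commit uploads corpus.json and dialog.json (diffs unrendered above), so the cleaned file presumably still has to be produced separately. Judging only from the keys the code accesses, it is a JSON list of user/sophia string pairs; a hypothetical stub for local testing (placeholder strings, not real data):

    import json

    # Minimal file in the shape chat() and the corpus builder expect
    sample_pairs = [
        {"user": "<one user message>", "sophia": "<Sophia's matching reply>"},
    ]
    with open("cleaned_dialog_pairs.json", "w", encoding="utf-8") as f:
        json.dump(sample_pairs, f, ensure_ascii=False, indent=2)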