Ronaldo1111 committed
Commit 0cf9446 · verified · 1 Parent(s): 5d2e82f

Upload 3 files

Files changed (3):
  1. app.py +39 -34
  2. cleaned_dialog.json +0 -0
  3. requirements.txt +9 -7
app.py CHANGED
```diff
@@ -11,17 +11,18 @@ from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
 import gradio as gr
 
-# Step 1: Load the WeChat corpus
-file_path = "wxid_818dcjgh2rie12_0_7235.json"
+# ========= Step 1: Load the preprocessed dialog pairs =========
+file_path = "cleaned_dialog_pairs.json"  # 👈 the cleaned data file generated earlier
 with open(file_path, "r", encoding="utf-8") as f:
-    raw_data = json.load(f)
+    cleaned_pairs = json.load(f)
 
-chunks = [item["msg"] for item in raw_data if item.get("is_sender") == 0 and item.get("msg")]
-docs = [Document(page_content=chunk) for chunk in chunks]
+# Join each pair into one full exchange (used for vector retrieval)
+corpus = [f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in cleaned_pairs]
+docs = [Document(page_content=entry) for entry in corpus]
 
-# Step 2: Build the vector store
+# ========= Step 2: Build the vector store =========
 embedding_model = SentenceTransformer("BAAI/bge-base-zh")
-embeddings = embedding_model.encode([doc.page_content for doc in docs], show_progress_bar=True)
+embeddings = embedding_model.encode(corpus, show_progress_bar=True)
 
 dimension = embeddings.shape[1]
 index = faiss.IndexFlatL2(dimension)
@@ -36,9 +37,9 @@ vectorstore = FAISS(
     docstore=InMemoryDocstore(docstore),
     index_to_docstore_id=index_to_docstore_id
 )
-retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
 
-# Step 3: Load the language model
+# ========= Step 3: Load the language model =========
 model_name = "Qwen/Qwen1.5-1.8B-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).half().cuda().eval()
@@ -47,11 +48,11 @@ pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
-    max_new_tokens=64,  # keep generations short!
+    max_new_tokens=64,
     temperature=0.8,
     top_p=0.9,
     do_sample=True,
-    repetition_penalty=1.2,  # curb repetition and rambling
+    repetition_penalty=1.2,
     return_full_text=False,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.pad_token_id,
@@ -59,68 +60,72 @@ pipe = pipeline(
 
 llm = HuggingFacePipeline(pipeline=pipe)
 
-# Step 4: Prompt template + chat function
+# ========= Step 4: Prompt template =========
 system_prompt = (
     "你是一个可爱的微信好友,语气要俏皮、有点可爱、适度调侃,不要太正式。"
-    "请根据下面的对话上下文回答问题,切记不要重复指导语。"
+    "请模仿下面的风格回答用户的问题。"
 )
 
 prompt_template = PromptTemplate(
-    input_variables=["system", "context", "question"],
-    template="""
-{system}
+    input_variables=["system", "examples", "context", "question"],
+    template="""{system}
 
-模仿风格示例:
-用户:你今天干嘛啦?
-好友:在想你呀😚干嘛问我咩~
+风格参考对话:
+{examples}
 
-用户:你吃饭了吗?
-好友:刚吃完,还差你一口哈哈哈🍚
-
-以下是之前的微信聊天片段:
+相关聊天语料片段:
 {context}
 
 现在我说:
 {question}
 
-请用微信好友的风格简短回答,不超过两句话:
+你该怎么回复我?请用微信口语风格,最多两句话:
 """
 )
 
-
+# ========= Step 5: Chat function =========
 def chat(user_input, history):
    history = history or []
    context_text = "\n".join([
-        f"用户:{msg['content']}" if msg['role'] == "user" else f"好友:{msg['content']}"
+        f"用户:{msg['content']}" if msg["role"] == "user" else f"好友:{msg['content']}"
        for msg in history
    ])
 
-    relevant_docs = retriever.get_relevant_documents(user_input)
-    doc_snippets = "\n".join([doc.page_content for doc in relevant_docs])
+    # 🔍 1. Retrieve the corpus entries most relevant to the user's message
+    retrieved_docs = retriever.get_relevant_documents(user_input)
+    retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
+
+    # 📚 2. Take style examples from the head of the cleaned data (count is adjustable)
+    example_pairs = cleaned_pairs[:3]
+    example_text = "\n".join([f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in example_pairs])
 
-    final_prompt = prompt_template.format(
+    # 🧠 3. Assemble the final prompt
+    prompt = prompt_template.format(
        system=system_prompt,
-        context=(doc_snippets + "\n" + context_text),
+        examples=example_text,
+        context=retrieved_context + "\n" + context_text,
        question=user_input
    )
 
+    # 🤖 4. Generate the reply
    try:
-        reply = llm.invoke(final_prompt)
+        reply = llm.invoke(prompt)
    except Exception as e:
        reply = f"哎呀出错了:{str(e)}"
 
+    # ✍️ 5. Update the history (OpenAI-style message format)
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": reply})
 
    return history, history
 
-# Step 5: Gradio page
+# ========= Step 6: Gradio page =========
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎀 Sophia Chat Agent")
-    gr.Markdown("欢迎来到 **Sophia Jr**,一个能模仿微信俏皮风格的 AI 好友。快来跟她聊聊吧!💬")
+    gr.Markdown("欢迎来到 **Sophia Jr**,相信你也是马+7大家庭中的一员。快来和我聊聊吧!💬")
 
    chatbot = gr.Chatbot(label="Sophia", type="messages")
-    msg = gr.Textbox(label="你想说啥~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
+    msg = gr.Textbox(label="你想说啥子哦~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
    state = gr.State([
        {"role": "assistant", "content": "你好,我是 Sophia~你想聊啥?"}
    ])
```
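The hunks above skip the middle of Step 2 (old lines ~28-35), where the embeddings are added to the FAISS index and the `docstore` / `index_to_docstore_id` mapping visible in the second hunk is built. A minimal sketch of what that elided block presumably looks like, reusing the names from the surrounding diff context (`index`, `embeddings`, `docs`, `embedding_model`); only the three keyword arguments shown in the hunk are confirmed, the rest is an assumption:

```python
import numpy as np
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Fill the L2 index with the sentence embeddings computed in Step 2.
index.add(np.asarray(embeddings, dtype="float32"))

# Map FAISS row positions to docstore ids, and ids to Documents.
docstore = {str(i): doc for i, doc in enumerate(docs)}
index_to_docstore_id = {i: str(i) for i in range(len(docs))}

vectorstore = FAISS(
    embedding_function=embedding_model.encode,  # assumed; not shown in the diff
    index=index,
    docstore=InMemoryDocstore(docstore),
    index_to_docstore_id=index_to_docstore_id,
)
```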
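The diff also cuts off before the Blocks event wiring and launch call. Assuming the standard Gradio pattern, and given that `chat` returns `(history, history)` which matches the `(chatbot, state)` outputs, the closing lines presumably resemble:

```python
    # Still inside `with gr.Blocks(...) as demo:`. This wiring is an
    # assumption; the actual closing lines are not shown in the diff.
    msg.submit(chat, inputs=[msg, state], outputs=[chatbot, state])

demo.launch()
```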
cleaned_dialog.json ADDED
The diff for this file is too large to render. See raw diff
 
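Note that the commit adds `cleaned_dialog.json` while `app.py` reads `cleaned_dialog_pairs.json`; assuming both follow the schema implied by the `pair['user']` / `pair['sophia']` accesses in `app.py`, the file is a JSON list of two-key objects. A hypothetical sample writer (the messages are the style examples removed from the old prompt, not real data from the commit):

```python
import json

# Hypothetical pairs matching the schema app.py expects.
sample_pairs = [
    {"user": "你今天干嘛啦?", "sophia": "在想你呀😚干嘛问我咩~"},
    {"user": "你吃饭了吗?", "sophia": "刚吃完,还差你一口哈哈哈🍚"},
]

with open("cleaned_dialog_pairs.json", "w", encoding="utf-8") as f:
    json.dump(sample_pairs, f, ensure_ascii=False, indent=2)
```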
requirements.txt CHANGED
```diff
@@ -1,13 +1,15 @@
 langchain-huggingface
-huggingface-hub
-transformers>=4.36.2
-sentence-transformers
-faiss-cpu
 gradio==4.15.0
-langchain>=0.1.0
-langchain-community
-torch
 accelerate
 einops
 tiktoken
 transformers_stream_generator
+gradio>=4.15.0
+transformers>=4.37.2
+sentence-transformers
+faiss-cpu
+langchain>=0.1.14
+langchain-community>=0.0.26
+huggingface-hub
+torch>=2.0
+
```
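The new file now pins gradio twice: `gradio==4.15.0` kept from the old file plus the added `gradio>=4.15.0`. Depending on the pip version, duplicate specifiers for one package are either rejected ("Double requirement given") or intersected to exactly 4.15.0, so one of the two lines should probably be dropped. A quick post-install sanity check:

```python
# Run after `pip install -r requirements.txt` to confirm the stack
# imports together and the gradio pin resolved as intended.
import faiss, gradio, langchain, sentence_transformers, torch, transformers

print(gradio.__version__)        # expect 4.15.0 (the == pin wins)
print(transformers.__version__)  # expect >= 4.37.2
```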