Ronaldo1111 committed
Commit 0cf9446 · verified · 1 Parent(s): 5d2e82f

Upload 3 files

Files changed (3):
  1. app.py +39 -34
  2. cleaned_dialog.json +0 -0
  3. requirements.txt +9 -7
app.py CHANGED
```diff
@@ -11,17 +11,18 @@ from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
 import gradio as gr
 
-# Step 1: Load the WeChat corpus
-file_path = "wxid_818dcjgh2rie12_0_7235.json"
+# ========= Step 1: Load the preprocessed dialog pairs =========
+file_path = "cleaned_dialog_pairs.json"  # 👈 the cleaned data file generated earlier
 with open(file_path, "r", encoding="utf-8") as f:
-    raw_data = json.load(f)
+    cleaned_pairs = json.load(f)
 
-chunks = [item["msg"] for item in raw_data if item.get("is_sender") == 0 and item.get("msg")]
-docs = [Document(page_content=chunk) for chunk in chunks]
+# Join each pair into one full exchange (used for vector retrieval)
+corpus = [f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in cleaned_pairs]
+docs = [Document(page_content=entry) for entry in corpus]
 
-# Step 2: Build the vector store
+# ========= Step 2: Build the vector store =========
 embedding_model = SentenceTransformer("BAAI/bge-base-zh")
-embeddings = embedding_model.encode([doc.page_content for doc in docs], show_progress_bar=True)
+embeddings = embedding_model.encode(corpus, show_progress_bar=True)
 
 dimension = embeddings.shape[1]
 index = faiss.IndexFlatL2(dimension)
@@ -36,9 +37,9 @@ vectorstore = FAISS(
     docstore=InMemoryDocstore(docstore),
     index_to_docstore_id=index_to_docstore_id
 )
-retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
 
-# Step 3: Load the language model
+# ========= Step 3: Load the language model =========
 model_name = "Qwen/Qwen1.5-1.8B-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).half().cuda().eval()
@@ -47,11 +48,11 @@ pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
-    max_new_tokens=64,  # keep generations short!
+    max_new_tokens=64,
     temperature=0.8,
     top_p=0.9,
     do_sample=True,
-    repetition_penalty=1.2,  # curb repetition and rambling
+    repetition_penalty=1.2,
     return_full_text=False,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.pad_token_id,
@@ -59,68 +60,72 @@ pipe = pipeline(
 
 llm = HuggingFacePipeline(pipeline=pipe)
 
-# Step 4: Prompt template + chat function
+# ========= Step 4: Prompt template =========
 system_prompt = (
     "你是一个可爱的微信好友,语气要俏皮、有点可爱、适度调侃,不要太正式。"
-    "请根据下面的对话上下文回答问题,切记不要重复指导语。"
+    "请模仿下面的风格回答用户的问题。"
 )
 
 prompt_template = PromptTemplate(
-    input_variables=["system", "context", "question"],
-    template="""
-{system}
+    input_variables=["system", "examples", "context", "question"],
+    template="""{system}
 
-模仿风格示例:
-用户:你今天干嘛啦?
-好友:在想你呀😚干嘛问我咩~
+风格参考对话:
+{examples}
 
-用户:你吃饭了吗?
-好友:刚吃完,还差你一口哈哈哈🍚
-
-以下是之前的微信聊天片段:
+相关聊天语料片段:
 {context}
 
 现在我说:
 {question}
 
-请用微信好友的风格简短回答,不超过两句话:
+你该怎么回复我?请用微信口语风格,最多两句话:
 """
 )
 
-
+# ========= Step 5: Chat function =========
 def chat(user_input, history):
    history = history or []
    context_text = "\n".join([
-        f"用户:{msg['content']}" if msg['role'] == "user" else f"好友:{msg['content']}"
+        f"用户:{msg['content']}" if msg["role"] == "user" else f"好友:{msg['content']}"
        for msg in history
    ])
 
-    relevant_docs = retriever.get_relevant_documents(user_input)
-    doc_snippets = "\n".join([doc.page_content for doc in relevant_docs])
+    # 🔍 1. Retrieve the corpus entries most relevant to the user's message
+    retrieved_docs = retriever.get_relevant_documents(user_input)
+    retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
+
+    # 📚 2. Take style examples from the head of the cleaned data (count is adjustable)
+    example_pairs = cleaned_pairs[:3]
+    example_text = "\n".join([f"用户:{pair['user']}\n好友:{pair['sophia']}" for pair in example_pairs])
 
-    final_prompt = prompt_template.format(
+    # 🧠 3. Assemble the final prompt
+    prompt = prompt_template.format(
        system=system_prompt,
-        context=(doc_snippets + "\n" + context_text),
+        examples=example_text,
+        context=retrieved_context + "\n" + context_text,
        question=user_input
    )
 
+    # 🤖 4. Generate the reply
    try:
-        reply = llm.invoke(final_prompt)
+        reply = llm.invoke(prompt)
    except Exception as e:
        reply = f"哎呀出错了:{str(e)}"
 
+    # ✍️ 5. Update the history (OpenAI-style message format)
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": reply})
 
    return history, history
 
-# Step 5: Gradio page
+# ========= Step 6: Gradio page =========
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎀 Sophia Chat Agent")
-    gr.Markdown("欢迎来到 **Sophia Jr**,一个能模仿微信俏皮风格的 AI 好友。快来跟她聊聊吧!💬")
+    gr.Markdown("欢迎来到 **Sophia Jr**,相信你也是马+7大家庭中的一员。快来和我聊聊吧!💬")
 
    chatbot = gr.Chatbot(label="Sophia", type="messages")
-    msg = gr.Textbox(label="你想说啥~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
+    msg = gr.Textbox(label="你想说啥子哦~", placeholder="快点跟 Sophia 开始聊天吧!", lines=2)
    state = gr.State([
        {"role": "assistant", "content": "你好,我是 Sophia~你想聊啥?"}
    ])
```
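The hunks above skip the middle of Step 2 (old lines ~28-35), where the embeddings are added to the FAISS index and the `docstore` / `index_to_docstore_id` mapping visible in the second hunk is built. A minimal sketch of what that elided block presumably looks like, reusing the names from the surrounding diff context (`index`, `embeddings`, `docs`, `embedding_model`); only the three keyword arguments shown in the hunk are confirmed, the rest is an assumption:

```python
import numpy as np
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Fill the L2 index with the sentence embeddings computed in Step 2.
index.add(np.asarray(embeddings, dtype="float32"))

# Map FAISS row positions to docstore ids, and ids to Documents.
docstore = {str(i): doc for i, doc in enumerate(docs)}
index_to_docstore_id = {i: str(i) for i in range(len(docs))}

vectorstore = FAISS(
    embedding_function=embedding_model.encode,  # assumed; not shown in the diff
    index=index,
    docstore=InMemoryDocstore(docstore),
    index_to_docstore_id=index_to_docstore_id,
)
```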
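The diff also cuts off before the Blocks event wiring and launch call. Assuming the standard Gradio pattern, and given that `chat` returns `(history, history)` which matches the `(chatbot, state)` outputs, the closing lines presumably resemble:

```python
    # Still inside `with gr.Blocks(...) as demo:`. This wiring is an
    # assumption; the actual closing lines are not shown in the diff.
    msg.submit(chat, inputs=[msg, state], outputs=[chatbot, state])

demo.launch()
```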
cleaned_dialog.json ADDED
The diff for this file is too large to render. See raw diff
 
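Note that the commit adds `cleaned_dialog.json` while `app.py` reads `cleaned_dialog_pairs.json`; assuming both follow the schema implied by the `pair['user']` / `pair['sophia']` accesses in `app.py`, the file is a JSON list of two-key objects. A hypothetical sample writer (the messages are the style examples removed from the old prompt, not real data from the commit):

```python
import json

# Hypothetical pairs matching the schema app.py expects.
sample_pairs = [
    {"user": "你今天干嘛啦?", "sophia": "在想你呀😚干嘛问我咩~"},
    {"user": "你吃饭了吗?", "sophia": "刚吃完,还差你一口哈哈哈🍚"},
]

with open("cleaned_dialog_pairs.json", "w", encoding="utf-8") as f:
    json.dump(sample_pairs, f, ensure_ascii=False, indent=2)
```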
requirements.txt CHANGED
```diff
@@ -1,13 +1,15 @@
 langchain-huggingface
-huggingface-hub
-transformers>=4.36.2
-sentence-transformers
-faiss-cpu
 gradio==4.15.0
-langchain>=0.1.0
-langchain-community
-torch
 accelerate
 einops
 tiktoken
 transformers_stream_generator
+gradio>=4.15.0
+transformers>=4.37.2
+sentence-transformers
+faiss-cpu
+langchain>=0.1.14
+langchain-community>=0.0.26
+huggingface-hub
+torch>=2.0
+
```
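The new file now pins gradio twice: `gradio==4.15.0` kept from the old file plus the added `gradio>=4.15.0`. Depending on the pip version, duplicate specifiers for one package are either rejected ("Double requirement given") or intersected to exactly 4.15.0, so one of the two lines should probably be dropped. A quick post-install sanity check:

```python
# Run after `pip install -r requirements.txt` to confirm the stack
# imports together and the gradio pin resolved as intended.
import faiss, gradio, langchain, sentence_transformers, torch, transformers

print(gradio.__version__)        # expect 4.15.0 (the == pin wins)
print(transformers.__version__)  # expect >= 4.37.2
```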