zhuhai111 committed
Commit 7cc8bc0 · verified · 1 Parent(s): 51879dd

Upload 43 files
Files changed (43)
  1. .gitattributes +1 -3
  2. .gradio/certificate.pem +31 -0
  3. .huggingface/YAML +9 -0
  4. README.md +40 -14
  5. app.py +799 -0
  6. config/config.yaml +72 -0
  7. requirements.txt +16 -0
  8. src/__init__.py +4 -0
  9. src/__pycache__/__init__.cpython-310.pyc +0 -0
  10. src/api/__init__.py +47 -0
  11. src/api/__pycache__/__init__.cpython-310.pyc +0 -0
  12. src/api/__pycache__/llm_api.cpython-310.pyc +0 -0
  13. src/api/__pycache__/ollama_api.cpython-310.pyc +0 -0
  14. src/api/__pycache__/routes.cpython-310.pyc +0 -0
  15. src/api/__pycache__/search_api.cpython-310.pyc +0 -0
  16. src/api/llm_api.py +319 -0
  17. src/api/ollama_api.py +101 -0
  18. src/api/routes.py +141 -0
  19. src/api/search_api.py +263 -0
  20. src/core/__pycache__/document_processor.cpython-310.pyc +0 -0
  21. src/core/__pycache__/embeddings.cpython-310.pyc +0 -0
  22. src/core/__pycache__/html_processor.cpython-310.pyc +0 -0
  23. src/core/__pycache__/plan_generator.cpython-310.pyc +0 -0
  24. src/core/__pycache__/ranking.cpython-310.pyc +0 -0
  25. src/core/__pycache__/reranker.cpython-310.pyc +0 -0
  26. src/core/_init_.py +14 -0
  27. src/core/document_processor.py +80 -0
  28. src/core/embeddings.py +41 -0
  29. src/core/html_processor.py +195 -0
  30. src/core/plan_generator.py +120 -0
  31. src/core/ranking.py +114 -0
  32. src/core/reranker.py +44 -0
  33. src/retrieval/__pycache__/base.cpython-310.pyc +0 -0
  34. src/retrieval/__pycache__/graph_rag.cpython-310.pyc +0 -0
  35. src/retrieval/__pycache__/memo_rag.cpython-310.pyc +0 -0
  36. src/retrieval/base.py +15 -0
  37. src/retrieval/graph_rag.py +56 -0
  38. src/retrieval/memo_rag.py +76 -0
  39. src/utils/__init__.py +6 -0
  40. src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  41. src/utils/__pycache__/helpers.cpython-310.pyc +0 -0
  42. src/utils/helpers.py +104 -0
  43. src/utils/neo4j_helper.py +82 -0
.gitattributes CHANGED
@@ -23,13 +23,11 @@
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
  *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.huggingface/YAML ADDED
@@ -0,0 +1,9 @@
1
+ title: Tourism Planning Assistant
2
+ emoji: 🌍
3
+ colorFrom: blue
4
+ colorTo: indigo
5
+ sdk: gradio
6
+ sdk_version: 4.8.0
7
+ app_file: app.py
8
+ pinned: true
9
+ license: mit
README.md CHANGED
@@ -1,14 +1,40 @@
1
- ---
2
- title: Toursim Test
3
- emoji: 📚
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.20.1
8
- app_file: app.py
9
- pinned: false
10
- license: lgpl-3.0
11
- short_description: Toursim-Test
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Tourism Planning Assistant
2
+
3
+ The Tourism Planning Assistant is an application built on RAG (Retrieval-Augmented Generation) that helps users plan travel itineraries.
4
+
5
+ ## Features
6
+
7
+ - Generates personalized travel plans from user requirements
8
+ - Uses a search engine to fetch up-to-date tourism information
9
+ - Supports multiple language models (OpenAI, Deepseek)
10
+ - Automatically retrieves and ranks relevant information
11
+ - Displays related reference images
12
+
13
+ ## Deploying to Hugging Face Spaces
14
+
15
+ The app is optimized for direct deployment to Hugging Face Spaces:
16
+
17
+ 1. Create a new Space on Hugging Face and choose Gradio as the SDK
18
+ 2. Add the following environment variables:
19
+ - `HF_BOCHA_API_KEY`: Bocha API key
20
+ - `HF_OPENAI_API_KEY`: OpenAI API key (optional)
21
+ - `HF_DEEPSEEK_API_KEY`: Deepseek API key (optional)
22
+
23
+ ## Usage
24
+
25
+ 1. Enter your travel requirements (e.g., "One-day trip to Hong Kong Disneyland")
26
+ 2. Select the number of travel days
27
+ 3. Click the "Generate Plan" button
28
+ 4. Receive a personalized travel plan with an itinerary and reference links
29
+
30
+ ## Tech Stack
31
+
32
+ - Gradio: user interface
33
+ - Sentence Transformers: text embeddings
34
+ - FlagEmbedding: text reranking
35
+ - Hugging Face models: BGE-M3 and BGE-Reranker-Large
36
+ - Bocha API: search-engine interface
37
+
38
+ ## License
39
+
40
+ MIT License
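
For local testing outside Spaces, a minimal launch sketch may help. It assumes the environment-variable names documented above; the key values are placeholders, and `create_interface` is the factory defined in app.py below.

```python
import os

# Placeholder keys -- substitute your own before running.
os.environ["HF_BOCHA_API_KEY"] = "<your-bocha-key>"
os.environ["HF_DEEPSEEK_API_KEY"] = "<your-deepseek-key>"  # optional
os.environ["HF_OPENAI_API_KEY"] = "<your-openai-key>"      # optional

# app.py launches Gradio on 0.0.0.0:7860 in its __main__ block;
# this is the equivalent of `python app.py`.
from app import create_interface

create_interface().launch(server_name="0.0.0.0", server_port=7860)
```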
app.py ADDED
@@ -0,0 +1,799 @@
1
+ import gradio as gr
2
+ import sys
3
+ import os
4
+ from pathlib import Path
5
+ from typing import List
6
+ # Add the project root to the Python path
7
+ sys.path.append(str(Path(__file__).parent))
8
+ from src.api.search_api import BochaSearch
9
+ from src.core.document_processor import DocumentProcessor
10
+ from src.core.ranking import RankingSystem
11
+ from src.core.plan_generator import PlanGenerator
12
+ from src.core.embeddings import EmbeddingModel
13
+ from src.core.reranker import Reranker
14
+ from src.api.llm_api import DeepseekInterface, LLMInterface, OpenAIInterface
15
+ from src.utils.helpers import load_config
16
+ import logging
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ class TravelRAGSystem:
23
+ def __init__(self):
24
+ self.config = load_config("config/config.yaml")
25
+ self.llm_instances = {}  # LLM instances keyed by provider
26
+
27
+ # Always use the "standard" retrieval method
28
+ self.retrieval_method = "standard"
29
+
30
+ self.init_llm_instances()
31
+ self.init_components()
32
+
33
+ def init_components(self):
34
+ # Get the default provider's configuration
35
+ default_provider = self.config['llm_settings']['default_provider']
36
+ provider_config = next(
37
+ (p for p in self.config['llm_settings']['providers']
38
+ if p['name'] == default_provider),
39
+ None
40
+ )
41
+
42
+ if not provider_config:
43
+ raise ValueError(f"No configuration found for default provider {default_provider}")
44
+
45
+ # Initialize the LLM instance
46
+ self.llm = self.init_llm(provider_config['name'], provider_config['model'])
47
+
48
+ self.search_engine = BochaSearch(
49
+ api_key=self.config['bocha_api_key'],
50
+ base_url=self.config['bocha_base_url']
51
+ )
52
+
53
+ self.doc_processor = DocumentProcessor(self.llm)
54
+
55
+ # Initialize the embedding model - using a Hugging Face model ID
56
+ try:
57
+ self.embedding_model = EmbeddingModel(
58
+ model_name="BAAI/bge-m3"
59
+ )
60
+ logger.info("Embedding model loaded successfully")
61
+ except Exception as e:
62
+ logger.error(f"Failed to load embedding model: {str(e)}")
63
+ raise
64
+
65
+ # Initialize the reranker - using a Hugging Face model ID
66
+ try:
67
+ self.reranker = Reranker(
68
+ model_path="BAAI/bge-reranker-large"
69
+ )
70
+ logger.info("Reranker model loaded successfully")
71
+ except Exception as e:
72
+ logger.error(f"Failed to load reranker model: {str(e)}")
73
+ raise
74
+
75
+ self.ranking_system = RankingSystem(self.embedding_model, self.reranker)
76
+ self.plan_generator = PlanGenerator(self.llm)
77
+
78
+ def init_llm(self, provider: str, model: str):
79
+ if provider == "openai":
80
+ return OpenAIInterface(
81
+ api_key=self.config['openai_api_key'],
82
+ model=model
83
+ )
84
+ elif provider == "deepseek":
85
+ return DeepseekInterface(
86
+ api_key=self.config['deepseek_api_key'],
87
+ base_url=next(
88
+ p['base_url'] for p in self.config['llm_settings']['providers']
89
+ if p['name'] == 'deepseek'
90
+ ),
91
+ model=model
92
+ )
93
+ else:
94
+ raise ValueError(f"Unsupported LLM provider: {provider}")
95
+
96
+ def init_llm_instances(self):
97
+ """Initialize all enabled LLM instances"""
98
+ for provider in self.config['llm_settings']['providers']:
99
+ if provider.get('enabled', False):
100
+ try:
101
+ if provider['name'] == "openai":
102
+ self.llm_instances['openai'] = OpenAIInterface(
103
+ api_key=self.config['openai_api_key'],
104
+ model=provider['model']
105
+ )
106
+ else:
107
+ self.llm_instances['deepseek'] = DeepseekInterface(
108
+ api_key=self.config['deepseek_api_key'],
109
+ base_url=provider['base_url'],
110
+ model=provider['model']
111
+ )
112
+ logging.info(f"Successfully initialized {provider['name']} LLM")
113
+ except Exception as e:
114
+ logging.error(f"Failed to initialize {provider['name']} LLM: {str(e)}")
115
+
116
+ def get_llm(self, provider_name: str = None) -> LLMInterface:
117
+ """Get the specified LLM instance"""
118
+ if not provider_name:
119
+ provider_name = self.config['llm_settings']['default_provider']
120
+
121
+ if provider_name not in self.llm_instances:
122
+ raise ValueError(f"LLM provider not found or not enabled: {provider_name}")
123
+
124
+ return self.llm_instances[provider_name]
125
+
126
+ def process_query(
127
+ self,
128
+ query: str,
129
+ days: int,
130
+ llm_provider: str,
131
+ llm_model: str,
132
+ enable_images: bool = True,
133
+ retrieval_method: str = None
134
+ ) -> tuple:
135
+ try:
136
+ # Switch retrieval method if a new one is specified
137
+ if retrieval_method and retrieval_method != self.retrieval_method:
138
+ self.set_retrieval_method(retrieval_method)
139
+
140
+ # Fall back to the default provider if the requested one is unavailable
141
+ if llm_provider not in self.llm_instances:
142
+ llm_provider = self.config['llm_settings']['default_provider']
143
+ current_llm = self.llm_instances[llm_provider]
144
+ self.doc_processor = DocumentProcessor(current_llm)
145
+ self.plan_generator = PlanGenerator(current_llm)
146
+
147
+ # Make sure the query includes the number of days
148
+ if days > 0:
149
+ query = f"{query} {days} days"
150
+
151
+ # Run the search
152
+ logger.info(f"Running search: {query}")
153
+ search_results = self.search_engine.search(query)
154
+ logger.info(f"Search results: {search_results}")
155
+
156
+ # Process the documents
157
+ passages = self.doc_processor.process_documents(search_results)
158
+ logger.info(f"Processed passages: {passages}")
159
+
160
+ # Retrieve and rank with the current retriever
161
+ if hasattr(self, 'retriever'):
162
+ final_ranked = self.retriever.retrieve(query, passages)
163
+ else:
164
+ # Fall back to the default ranking system
165
+ initial_ranked = self.ranking_system.initial_ranking(query, passages)
166
+ final_ranked = self.ranking_system.rerank(query, initial_ranked)
167
+
168
+ # Generate the plan
169
+ final_plan = self.plan_generator.generate_plan(query, final_ranked)
170
+ logger.info(f"Generated plan: {final_plan}")
171
+
172
+ # Prepare the reference-sources table
173
+ # Create the table header
174
+ table_header = "| Reference URL | Relevance Score | Retrieval Score | Rerank Score |\n| --- | --- | --- | --- |"
175
+
176
+ # Prepare the table rows
177
+ table_rows = []
178
+ for doc in final_ranked:
179
+ # If the title is empty, use the URL domain as the title
180
+ title = doc.get('title', '').strip()
181
+ if not title:
182
+ from urllib.parse import urlparse
183
+ domain = urlparse(doc['url']).netloc
184
+ title = domain
185
+
186
+ # Build the table row
187
+ row = (
188
+ f"| [{title}]({doc['url']}) | "
189
+ f"{doc.get('final_score', 0):.3f} | "
190
+ f"{doc.get('retrieval_score', 0):.3f} | "
191
+ f"{doc.get('rerank_score', 0):.3f} |"
192
+ )
193
+ table_rows.append(row)
194
+
195
+ # Assemble the table
196
+ sources = table_header + "\n" + "\n".join(table_rows)
197
+
198
+ logger.info(f"Reference sources: {sources}")
199
+
200
+ # Image display section
201
+ image_html = ""
202
+ if enable_images:
203
+ try:
204
+ # Fetch extra results, since some will be filtered out
205
+ images = self.search_engine.search_images(query, count=8)
206
+ valid_images = []
207
+
208
+ if images:
209
+ # Filter the images
210
+ for img in images:
211
+ img_url = img.get('url', '')
212
+ if img_url and self.verify_image_url(img_url):
213
+ valid_images.append(img_url)
214
+ if len(valid_images) >= 3:  # only 3 valid images are needed
215
+ break
216
+
217
+ if valid_images:  # if there are valid images
218
+ image_html = """
219
+ <div style="display: flex; flex-direction: column; gap: 15px;">
220
+ """
221
+ for img_url in valid_images:
222
+ image_html += f"""
223
+ <div style="border-radius: 8px; overflow: hidden;
224
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.3);">
225
+ <div style="position: relative; padding-top: 66.67%;">
226
+ <img src="{img_url}"
227
+ alt="Travel-related image"
228
+ style="position: absolute; top: 0; left: 0; width: 100%;
229
+ height: 100%; object-fit: cover; transition: transform 0.3s;"
230
+ onerror="this.style.display='none'">
231
+ </div>
232
+ </div>
233
+ """
234
+ image_html += "</div>"
235
+ except Exception as e:
236
+ logger.warning(f"Error fetching images: {str(e)}")
237
+
238
+ # Prettify the plan content for display
239
+ plan_content = final_plan['plan']
240
+
241
+ # Strip markdown markers
242
+ replacements = {
243
+ '###': '',  # remove triple #
244
+ '##': '',  # remove double #
245
+ '# ': '',  # remove single #
246
+ '**': '',  # remove all **
247
+ }
248
+
249
+ for old, new in replacements.items():
250
+ plan_content = plan_content.replace(old, new)
251
+
252
+ # Process headings and paragraphs
253
+ paragraphs = plan_content.split('\n')
254
+ formatted_paragraphs = []
255
+
256
+ for p in paragraphs:
257
+ p = p.strip()
258
+ if not p:
259
+ continue
260
+
261
+ if "Tour Overview" in p:
262
+ # Main heading style
263
+ formatted_paragraphs.append(
264
+ f'<h2 style="color: #f3f4f6; margin: 10px 0 12px 0; font-size: 1.15em; '
265
+ f'font-weight: 600; letter-spacing: 0.01em; line-height: 1.3; '
266
+ f'border-bottom: 1px solid rgba(99, 102, 241, 0.3); padding-bottom: 8px; '
267
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif;">'
268
+ f'📍 {p}</h2>'
269
+ )
270
+ elif ":" in p and any(x in p for x in ["Date", "Destination", "Key Attractions"]):
271
+ # Key-info style, with the value part in bold
272
+ key, value = p.split(":", 1)
273
+ formatted_paragraphs.append(
274
+ f'<div style="display: flex; align-items: start; margin: 4px 0; '
275
+ f'padding-left: 4px;">'
276
+ f'<span style="color: #818cf8; font-weight: 500; min-width: 70px; '
277
+ f'font-size: 0.92em;">{key}:</span>'
278
+ f'<span style="flex: 1; line-height: 1.4; color: #e2e8f0; margin-left: 8px; '
279
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif; '
280
+ f'font-size: 0.92em; font-weight: 600; letter-spacing: 0.005em;">{value.strip()}</span>'
281
+ f'</div>'
282
+ )
283
+ elif "Daily Itinerary" in p:
284
+ # Main heading style
285
+ formatted_paragraphs.append(
286
+ f'<h2 style="color: #f3f4f6; margin: 20px 0 12px 0; font-size: 1.15em; '
287
+ f'font-weight: 600; letter-spacing: 0.01em; line-height: 1.3; '
288
+ f'border-bottom: 1px solid rgba(99, 102, 241, 0.3); padding-bottom: 8px; '
289
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif;">'
290
+ f'🕒️ {p}</h2>'
291
+ )
292
+ elif " - " in p:  # time-slot heading
293
+ # Subheading style
294
+ formatted_paragraphs.append(
295
+ f'<h3 style="color: #e2e8f0; margin: 14px 0 6px 0; font-size: 1.05em; '
296
+ f'font-weight: 600; letter-spacing: 0.01em; line-height: 1.4; padding-bottom: 4px; '
297
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif;">'
298
+ f'🕒 {p}</h3>'
299
+ )
300
+ elif p.startswith("Location") or p.startswith("Activity") or p.startswith("Transportation") or p.startswith("Specific Guidance"):
301
+ # Info-line style
302
+ key, _, value = p.partition(":")  # tolerate lines without a colon
303
+ icon = {
304
+ "Location": "📍",
305
+ "Activity": "🎯",
306
+ "Transportation": "🚇",
307
+ "Specific Guidance": "🗺️"
308
+ }.get(key, "•")
309
+
310
+ formatted_paragraphs.append(
311
+ f'<div style="display: flex; align-items: start; margin: 4px 0; padding-left: 4px;">'
312
+ f'<span style="color: #818cf8; margin-right: 8px;">{icon}</span>'
313
+ f'<span style="flex: 1; line-height: 1.4; color: #e2e8f0; '
314
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif; '
315
+ f'font-size: 0.92em; font-weight: 400; letter-spacing: 0.005em;">{value.strip()}</span>'
316
+ f'</div>'
317
+ )
318
+ else:
319
+ # Regular paragraph style
320
+ formatted_paragraphs.append(
321
+ f'<p style="margin: 8px 0; line-height: 1.5; color: #e2e8f0; '
322
+ f'font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica Neue, sans-serif; '
323
+ f'font-size: 0.95em; font-weight: 400; letter-spacing: 0.005em;">{p}</p>'
324
+ )
325
+
326
+ plan_content = '\n'.join(formatted_paragraphs)
327
+
328
+ # Wrap everything in a dark-themed container
329
+ final_output = f"""
330
+ <div style="max-width: 100%; padding: 24px; background: rgba(17, 24, 39, 0.7);
331
+ border-radius: 16px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2);">
332
+ <div style="margin-top: 20px;">
333
+ {plan_content}
334
+ </div>
335
+ </div>
336
+ """
337
+
338
+ return final_output, sources, image_html
339
+
340
+ except Exception as e:
341
+ logger.error(f"Error processing query: {str(e)}")
342
+ return f"Sorry, an error occurred while processing your request: {str(e)}", "", ""
343
+
344
+ def verify_image_url(self, url: str) -> bool:
345
+ """Check that an image URL is reachable and meets the content requirements"""
346
+ try:
347
+ import requests
348
+ from PIL import Image
349
+ import io
350
+ import numpy as np
351
+ from PIL import ImageDraw, ImageFont
352
+
353
+ # Fetch the image
354
+ response = requests.get(url, timeout=3)
355
+ if response.status_code != 200:
356
+ return False
357
+
358
+ # Check the content type
359
+ content_type = response.headers.get('content-type', '')
360
+ if 'image' not in content_type.lower():
361
+ return False
362
+
363
+ # Read the image
364
+ img = Image.open(io.BytesIO(response.content))
365
+
366
+ # 1. Check image dimensions
367
+ width, height = img.size
368
+ if width < 300 or height < 300:  # filter out images that are too small
369
+ return False
370
+
371
+ # 2. Check the aspect ratio
372
+ aspect_ratio = width / height
373
+ if aspect_ratio < 0.5 or aspect_ratio > 2.0:  # filter out badly proportioned images
374
+ return False
375
+
376
+ # 3. Convert to a numpy array for analysis
377
+ img_array = np.array(img)
378
+
379
+ # 4. Check whether the image is too uniform (likely a text-only image)
380
+ if len(img_array.shape) == 3:  # make sure it is a color image
381
+ std = np.std(img_array)
382
+ if std < 30:  # a low standard deviation means the image is too flat
383
+ return False
384
+
385
+ # 5. Detect text regions (simple implementation)
386
+ # Convert to grayscale
387
+ if img.mode != 'L':
388
+ img_gray = img.convert('L')
389
+ else:
390
+ img_gray = img
391
+
392
+ # Compute the edge density
393
+ from PIL import ImageFilter
394
+ edges = img_gray.filter(ImageFilter.FIND_EDGES)
395
+ edge_density = np.mean(np.array(edges))
396
+
397
+ # A very high edge density suggests the image contains a lot of text
398
+ if edge_density > 30:
399
+ return False
400
+
401
+ # 6. Check whether the image is oversaturated (likely an ad)
402
+ if len(img_array.shape) == 3:
403
+ hsv = img.convert('HSV')
404
+ saturation = np.array(hsv)[:,:,1]
405
+ if np.mean(saturation) > 200:  # saturation too high
406
+ return False
407
+
408
+ return True
409
+
410
+ except Exception as e:
411
+ logger.warning(f"Image validation failed: {str(e)}")
412
+ return False
413
+
414
+ def _format_images_html(self, images: List[str]) -> str:
415
+ """Format images as HTML for display"""
416
+ if not images:
417
+ return ""
418
+
419
+ # Use a flex layout to display the images
420
+ html = """
421
+ <div style="display: flex; flex-wrap: wrap; gap: 10px; justify-content: center; margin-top: 20px;">
422
+ """
423
+
424
+ for img_url in images:
425
+ # Add an image container with a load-failure fallback
426
+ html += f"""
427
+ <div style="flex: 0 0 calc(50% - 10px); max-width: 300px; min-width: 200px;">
428
+ <img
429
+ src="{img_url}"
430
+ style="width: 100%; height: 200px; object-fit: cover; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);"
431
+ onerror="this.onerror=null; this.src='https://via.placeholder.com/300x200?text=Image+Not+Available';"
432
+ />
433
+ </div>
434
+ """
435
+
436
+ html += "</div>"
437
+
438
+ # Debug logging
439
+ logger.info(f"Generated image HTML: {html[:200]}...")  # only log the first 200 characters
440
+
441
+ return html
442
+
443
+ def set_retrieval_method(self, method: str):
444
+ """Switch the retrieval method"""
445
+ if method not in ["standard"]:
446
+ raise ValueError(f"Unsupported retrieval method: {method}")
447
+
448
+ self.retrieval_method = method
449
+
450
+ # Initialize the retriever for the chosen method
451
+ if method == "standard":
452
+ self.retriever = self.ranking_system
453
+
454
+ def create_interface():
455
+ system = TravelRAGSystem()
456
+
457
+ # Get the list of enabled providers
458
+ enabled_providers = [
459
+ provider['name']
460
+ for provider in system.config['llm_settings']['providers']
461
+ if provider['enabled']
462
+ ]
463
+
464
+ # Map providers to their models
465
+ provider_models = {
466
+ provider['name']: provider['models']
467
+ for provider in system.config['llm_settings']['providers']
468
+ if provider['enabled']
469
+ }
470
+
471
+ # Create the interface with custom CSS
472
+ css = """
473
+ .gradio-container {
474
+ font-family: "PingFang SC", "Microsoft YaHei", sans-serif;
475
+ }
476
+
477
+ /* Target all English text */
478
+ [class*="message-"] {
479
+ font-family: 'Times New Roman', serif !important;
480
+ }
481
+
482
+ /* Make sure English text and digits use Times New Roman */
483
+ .gradio-container *:not(:lang(zh)) {
484
+ font-family: 'Times New Roman', serif !important;
485
+ }
486
+
487
+ @keyframes spin {
488
+ 0% { transform: rotate(0deg); }
489
+ 100% { transform: rotate(360deg); }
490
+ }
491
+
492
+ /* Hide the number input's spin buttons */
493
+ input[type="number"]::-webkit-inner-spin-button,
494
+ input[type="number"]::-webkit-outer-spin-button {
495
+ -webkit-appearance: none;
496
+ margin: 0;
497
+ }
498
+
499
+ input[type="number"] {
500
+ -moz-appearance: textfield;
501
+ }
502
+
503
+ /* Hide the default processing message and arrows */
504
+ .progress-text, .meta-text-center, .progress-container {
505
+ display: none !important;
506
+ }
507
+
508
+ /* Loading animation styles */
509
+ .loading {
510
+ display: flex;
511
+ align-items: center;
512
+ justify-content: center;
513
+ gap: 8px;
514
+ font-size: 1.2em;
515
+ color: rgb(192, 192, 255);
516
+ }
517
+
518
+ .loading::before {
519
+ content: '🌍';
520
+ display: inline-block;
521
+ animation: spin 2s linear infinite;
522
+ filter: brightness(1.5); /* make the globe icon brighter */
523
+ }
524
+
525
+ /* Reposition Gradio's default loading animation */
526
+ .progress-text {
527
+ display: block !important;
528
+ order: 3;
529
+ margin-top: 8px;
530
+ opacity: 0.7;
531
+ }
532
+
533
+ .meta-text-center {
534
+ display: block !important;
535
+ }
536
+
537
+ /* Make sure the loading container uses a flex layout */
538
+ .loading-container {
539
+ display: flex;
540
+ flex-direction: column;
541
+ align-items: center;
542
+ }
543
+
544
+ /* Hide the slider's up/down arrows */
545
+ .num-input-plus, .num-input-minus {
546
+ display: none !important;
547
+ }
548
+
549
+ /* Hide all scroll arrows */
550
+ .scroll-hide,
551
+ .output-markdown,
552
+ .output-text,
553
+ .markdown-text,
554
+ .prose,
555
+ .gr-box,
556
+ .gr-panel {
557
+ -ms-overflow-style: none !important;
558
+ scrollbar-width: none !important;
559
+ overflow-y: hidden !important;
560
+ overflow: hidden !important;
561
+ }
562
+
563
+ .scroll-hide::-webkit-scrollbar,
564
+ .output-markdown::-webkit-scrollbar,
565
+ .output-text::-webkit-scrollbar,
566
+ .markdown-text::-webkit-scrollbar,
567
+ .prose::-webkit-scrollbar,
568
+ .gr-box::-webkit-scrollbar,
569
+ .gr-panel::-webkit-scrollbar {
570
+ display: none !important;
571
+ width: 0 !important;
572
+ height: 0 !important;
573
+ }
574
+
575
+ /* Loading animation container styles */
576
+ .loading-container {
577
+ overflow: hidden !important;
578
+ min-height: 60px;
579
+ }
580
+
581
+ /* Hide Gradio's default scroll controls */
582
+ .wrap.svelte-byatnx,
583
+ .contain.svelte-byatnx,
584
+ [class*='svelte'],
585
+ .gradio-container {
586
+ overflow: hidden !important;
587
+ overflow-y: hidden !important;
588
+ }
589
+
590
+ /* Disable all possible scrollbars */
591
+ ::-webkit-scrollbar {
592
+ display: none !important;
593
+ width: 0 !important;
594
+ height: 0 !important;
595
+ }
596
+
597
+ /* Remove the Group component's default background */
598
+ .custom-group {
599
+ border: none !important;
600
+ background: none !important;
601
+ box-shadow: none !important;
602
+ }
603
+
604
+ .custom-group > div {
605
+ border: none !important;
606
+ background: none !important;
607
+ box-shadow: none !important;
608
+ }
609
+
610
+ /* Image container styles */
611
+ .images-container {
612
+ margin-top: 20px;
613
+ padding: 10px;
614
+ background: #fff;
615
+ border-radius: 8px;
616
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
617
+ }
618
+
619
+ .images-container img {
620
+ transition: transform 0.3s ease;
621
+ }
622
+
623
+ .images-container img:hover {
624
+ transform: scale(1.05);
625
+ }
626
+
627
+ /* Make sure the image container is visible */
628
+ #component-13 {
629
+ min-height: 200px;
630
+ overflow: visible !important;
631
+ }
632
+ """
633
+
634
+ # JavaScript for the loading-status text
635
+ js = """
636
+ function showLoading() {
637
+ document.getElementById('loading_status').innerHTML = '<p class="loading">Generating your personalized travel plan...</p>';
638
+ return ['', ''];
639
+ }
640
+ """
641
+
642
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as interface:
643
+ gr.Markdown("""
644
+ # 🌟 Tourism Planning Assistant 🌟
645
+
646
+ Welcome to the Smart Travel Planning Assistant! Simply input your travel requirements, and we'll generate a personalized travel plan for you.
647
+
648
+ ### Instructions
649
+ 1. Describe your travel needs in the input box (e.g., 'One-day trip to Hong Kong Disneyland')
650
+ 2. Select the number of days for your plan
651
+ 3. Click the "Generate Plan" button
652
+ """)
653
+
654
+ with gr.Row():
655
+ with gr.Column(scale=4):
656
+ llm_provider = gr.Dropdown(
657
+ choices=enabled_providers,
658
+ value=system.config['llm_settings']['default_provider'],
659
+ label="Select LLM Provider"
660
+ )
661
+ llm_model = gr.Dropdown(
662
+ choices=provider_models[system.config['llm_settings']['default_provider']],
663
+ label="Select Model"
664
+ )
665
+
666
+ # Function to update the model choices
667
+ def update_model_choices(provider):
668
+ return gr.Dropdown(choices=provider_models[provider])
669
+
670
+ # Callback for when the provider changes
671
+ llm_provider.change(
672
+ fn=update_model_choices,
673
+ inputs=[llm_provider],
674
+ outputs=[llm_model]
675
+ )
676
+
677
+ query_input = gr.Textbox(
678
+ label="Travel Requirements",
679
+ placeholder="Please enter your travel requirements, e.g.: One-day trip to Hong Kong Disneyland",
680
+ lines=2
681
+ )
682
+ days_input = gr.Slider(
683
+ minimum=1,
684
+ maximum=7,
685
+ value=1,
686
+ step=1,
687
+ label="Number of Days"
688
+ )
689
+
690
+ # Checkbox for fetching images
691
+ show_images = gr.Checkbox(
692
+ label="Search Related Images",
693
+ value=True,
694
+ info="Whether to search and display related reference images"
695
+ )
696
+
697
+ # Only "standard" remains (memorag and graphrag options removed)
698
+ retrieval_method = gr.Radio(
699
+ choices=["standard"],
700
+ value="standard",
701
+ label="Retrieval Method",
702
+ info="Choose different retrieval strategies",
703
+ visible=False  # hidden, since there is only one option
704
+ )
705
+
706
+ submit_btn = gr.Button("Generate Plan", variant="primary")
707
+ loading_status = gr.Markdown("", elem_id="loading_status", show_label=False)
708
+
709
+ # Image display area in the left column
710
+ images_container = gr.HTML(
711
+ value="",  # make sure the initial value is an empty string
712
+ visible=True,
713
+ label="Related Images"
714
+ )
715
+
716
+ # Update the image area when the checkbox state changes
717
+ show_images.change(
718
+ fn=lambda x: "" if not x else "<div></div>",  # return an empty string when images are disabled
719
+ inputs=[show_images],
720
+ outputs=[images_container]
721
+ )
722
+
723
+ with gr.Column(scale=6):
724
+ with gr.Tabs():
725
+ with gr.TabItem("Travel Plan"):
726
+ plan_output = gr.HTML(label="Generated Travel Plan", show_label=False)
727
+ with gr.TabItem("References and Evaluation"):
728
+ sources_output = gr.Markdown(label="References and Evaluation", show_label=False)
729
+
730
+ # Example queries in English
731
+ gr.Examples(
732
+ examples=[
733
+ ["One-day trip to Hong Kong Disneyland", 1],
734
+ ["Family trip to Hong Kong Ocean Park", 1],
735
+ ["Hong Kong Shopping and Food Tour", 2],
736
+ ["Hong Kong Cultural Experience Tour", 3]
737
+ ],
738
+ inputs=[query_input, days_input],
739
+ label="Example Queries"
740
+ )
741
+
742
+ def show_loading():
743
+ loading_html = "<div class='loading-container'><p class='loading'>Generating your personalized travel plan...</p></div>"
744
+ return loading_html, loading_html, "", ""
745
+
746
+ def process_with_images(query, days, llm_provider, llm_model, enable_images, retrieval_method):
747
+ plan_html, sources_md, images_html = system.process_query(
748
+ query, days, llm_provider, llm_model,
749
+ enable_images, retrieval_method
750
+ )
751
+
752
+ # Debug logging
753
+ logger.info(f"Image HTML length: {len(images_html) if images_html else 0}")
754
+
755
+ return plan_html, sources_md, images_html
756
+
757
+ # Wire up the submit button
758
+ submit_btn.click(
759
+ fn=show_loading,
760
+ inputs=None,
761
+ outputs=[loading_status, plan_output, sources_output, images_container]
762
+ ).then(
763
+ fn=process_with_images,
764
+ inputs=[
765
+ query_input,
766
+ days_input,
767
+ llm_provider,
768
+ llm_model,
769
+ show_images,
770
+ retrieval_method
771
+ ],
772
+ outputs=[plan_output, sources_output, images_container]  # order must match
773
+ ).then(
774
+ fn=lambda: "",
775
+ inputs=None,
776
+ outputs=[loading_status]
777
+ )
778
+
779
+ # Footer in English
780
+ gr.Markdown("""
781
+ ### 📝 Notes
782
+ - Plan generation may take some time, please be patient
783
+ - Queries should include specific locations and activity preferences
784
+ - All plans are AI-generated, please adjust according to actual circumstances
785
+
786
+ Powered by RAG for Tourism system © 2024
787
+ """)
788
+
789
+ return interface
790
+
791
+ if __name__ == "__main__":
792
+ demo = create_interface()
793
+ # Settings for the Hugging Face Spaces environment
794
+ demo.launch(
795
+ server_name="0.0.0.0",
796
+ server_port=7860,
797
+ share=False,  # Hugging Face Spaces already provides public access
798
+ debug=False
799
+ )
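
Stripped of the HTML formatting, process_query above is a search-then-rank pipeline. A condensed sketch of the control flow (the method names are taken from the code above; the score-blending comment is an assumption based on the weights in config/config.yaml):

```python
def plan_trip(system, query: str, days: int) -> dict:
    """Condensed view of TravelRAGSystem.process_query (no HTML rendering)."""
    query = f"{query} {days} days"                    # bake the duration into the query
    results = system.search_engine.search(query)      # web search via the Bocha API
    passages = system.doc_processor.process_documents(results)

    # Two-stage ranking: fast embedding retrieval, then cross-encoder reranking.
    initial = system.ranking_system.initial_ranking(query, passages)
    final = system.ranking_system.rerank(query, initial)

    # Presumably, per config.yaml: final_score = 0.3 * retrieval_score + 0.7 * rerank_score
    return system.plan_generator.generate_plan(query, final)
```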
config/config.yaml ADDED
@@ -0,0 +1,72 @@
1
+ # API Keys - set via environment variables on Hugging Face Spaces
2
+ google_api_key: ${HF_GOOGLE_API_KEY}
3
+ google_cx: ${HF_GOOGLE_CX}
4
+ bing_api_key: ${HF_BING_API_KEY}
5
+ openai_api_key: ${HF_OPENAI_API_KEY}
6
+ deepseek_api_key: ${HF_DEEPSEEK_API_KEY}
7
+ bocha_api_key: ${HF_BOCHA_API_KEY}
8
+ bocha_base_url: "https://api.bochaai.com"
9
+
10
+ # LLM Settings
11
+ llm_settings:
12
+ providers:
13
+ - name: "deepseek"
14
+ enabled: true
15
+ model: "deepseek-chat"
16
+ base_url: "https://api.deepseek.com"
17
+ api_key: ${deepseek_api_key}
18
+ models: ["deepseek-chat"]
19
+ - name: "openai"
20
+ enabled: true
21
+ model: "gpt-4o"
22
+ base_url: "https://api.openai.com/v1" # standard OpenAI API URL
23
+ api_key: ${openai_api_key}
24
+ models: ["gpt-4o"]
25
+ default_provider: "deepseek" # provider used by default
26
+
27
+ # Retrieval settings
28
+ retrieval_settings:
29
+ default_method: "standard"
30
+ methods:
31
+ - name: "standard"
32
+ enabled: true
33
+ model_settings:
34
+ embedding_model: "BAAI/bge-m3" # Hugging Face model ID
35
+ reranker_model: "BAAI/bge-reranker-large" # Hugging Face model ID
36
+
37
+ # Search Settings
38
+ max_results: 20
39
+ language: "zh-CN"
40
+ search_provider: "bocha"
41
+
42
+ # Document Processing
43
+ max_passage_length: 500
44
+ min_passage_length: 100
45
+
46
+ # Vector Search Settings
47
+ embedding_model: "BAAI/bge-m3" # Hugging Face model ID
48
+ reranker_model: "BAAI/bge-reranker-large" # Hugging Face model ID
49
+ batch_size: 32
50
+ use_gpu: true
51
+
52
+ # Ranking Settings
53
+ initial_top_k: 100
54
+ final_top_k: 3
55
+ retrieval_weight: 0.3
56
+ rerank_weight: 0.7
57
+
58
+ # Search Settings
59
+ search_settings:
60
+ trusted_domains:
61
+ - 'discoverhongkong.com'
62
+ - 'tourism.gov.hk'
63
+ - 'hong-kong-travel.com'
64
+ - 'timeout.com.hk'
65
+ - 'openrice.com'
66
+ - 'lcsd.gov.hk'
67
+ - 'hkpl.gov.hk'
68
+
69
+ proxy:
70
+ enabled: false
71
+ host: "127.0.0.1"
72
+ port: 8880
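
PyYAML does not expand `${VAR}` placeholders by itself, so load_config (defined in src/utils/helpers.py, whose body is not shown in this diff) presumably substitutes environment variables before parsing. A minimal sketch under that assumption (the nested `${deepseek_api_key}`-style references to other config keys would need an extra resolution pass):

```python
import os
import re
import yaml

_VAR = re.compile(r"\$\{([^}]+)\}")

def load_config(path: str) -> dict:
    """Load a YAML file, expanding ${VAR} placeholders from the environment (sketch)."""
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    expanded = _VAR.sub(lambda m: os.environ.get(m.group(1), ""), raw)
    return yaml.safe_load(expanded)
```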
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ requests>=2.31.0
2
+ beautifulsoup4>=4.12.0
3
+ trafilatura>=1.6.1
4
+ torch>=2.0.0
5
+ transformers>=4.36.0
6
+ openai>=1.3.0
7
+ pyyaml>=6.0.1
8
+ faiss-cpu>=1.7.4
9
+ sentence-transformers>=2.2.0
10
+ gradio>=4.8.0
11
+ neo4j>=5.14.0
12
+ langchain>=0.0.350
13
+ matplotlib>=3.8.0
14
+ Pillow>=9.0.0
15
+ numpy>=1.22.0
16
+ FlagEmbedding>=1.1.5
src/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .core.html_processor import HTMLProcessor
2
+ from .core.document_processor import DocumentProcessor
3
+
4
+ __all__ = ['HTMLProcessor', 'DocumentProcessor']
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (291 Bytes).
 
src/api/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ from typing import Dict, Any
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from .llm_api import DeepseekInterface
5
+ from .search_api import GoogleSearch
6
+
7
+ def create_app(config: Dict[str, Any] = None) -> FastAPI:
8
+ """
9
+ Create and configure the FastAPI app
10
+ """
11
+ app = FastAPI(
12
+ title="Travel RAG API",
13
+ description="Travel recommendation system using RAG",
14
+ version="1.0.0"
15
+ )
16
+
17
+ # Configure CORS
18
+ app.add_middleware(
19
+ CORSMiddleware,
20
+ allow_origins=["*"],
21
+ allow_credentials=True,
22
+ allow_methods=["*"],
23
+ allow_headers=["*"],
24
+ )
25
+
26
+ # Store the configuration
27
+ if config:
28
+ app.state.config = config
29
+
30
+ # Initialize the LLM (providers are stored as a list in config.yaml)
31
+ app.state.llm = DeepseekInterface(
32
+ api_key=config['deepseek_api_key'],
33
+ base_url=next(p['base_url'] for p in config['llm_settings']['providers'] if p['name'] == 'deepseek'),
34
+ model=next(p['models'][0] for p in config['llm_settings']['providers'] if p['name'] == 'deepseek')
35
+ )
36
+
37
+ # Search-engine initialization happens in init_app
38
+ from .routes import init_app
39
+ init_app(app)
40
+
41
+ # Import and register the routes
42
+ from .routes import router
43
+ app.include_router(router)
44
+
45
+ return app
46
+
47
+ __all__ = ['create_app']
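
Since create_app returns a plain FastAPI instance, it can be served with any ASGI server. A usage sketch, assuming a config dict loaded via load_config:

```python
import uvicorn

from src.api import create_app
from src.utils.helpers import load_config

app = create_app(load_config("config/config.yaml"))

if __name__ == "__main__":
    # Serve the Travel RAG API locally; port 8000 is an arbitrary choice.
    uvicorn.run(app, host="0.0.0.0", port=8000)
```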
src/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.17 kB).
 
src/api/__pycache__/llm_api.cpython-310.pyc ADDED
Binary file (8.51 kB).
 
src/api/__pycache__/ollama_api.cpython-310.pyc ADDED
Binary file (3.59 kB).
 
src/api/__pycache__/routes.cpython-310.pyc ADDED
Binary file (2.91 kB).
 
src/api/__pycache__/search_api.cpython-310.pyc ADDED
Binary file (6.3 kB).
 
src/api/llm_api.py ADDED
@@ -0,0 +1,319 @@
1
+ from typing import List, Dict
2
+ from abc import ABC, abstractmethod
3
+ from openai import OpenAI
4
+ import logging
5
+ import httpx
6
+
7
+ class LLMInterface(ABC):
8
+ @abstractmethod
9
+ def generate(self, prompt: str) -> str:
10
+ pass
11
+
12
+ class DeepseekInterface(LLMInterface):
13
+ def __init__(self, api_key: str, base_url: str, model: str):
14
+ self.api_key = api_key
15
+ self.base_url = base_url
16
+ self.model = model
17
+ self.headers = {
18
+ "Authorization": f"Bearer {api_key}",
19
+ "Content-Type": "application/json"
20
+ }
21
+
22
+ def _build_system_prompt(self, role: str) -> str:
23
+ """Build the system prompt"""
24
+ roles = {
25
+ "summarizer": "You are a professional tourism content analyst, good at extracting and summarizing key tourism-related information. Please answer in English",
26
+ "planner": "You are a professional travel planner who is good at making detailed travel plans. Please answer in English"
27
+ }
28
+ return roles.get(role, "You are a professional AI assistant. Please answer in English")
29
+
30
+ def generate(self, prompt: str, role: str = "planner") -> str:
31
+ import requests
32
+ from requests.adapters import HTTPAdapter
33
+ from urllib3.util.retry import Retry
34
+
35
+ session = requests.Session()
36
+ retries = Retry(
37
+ total=3,
38
+ backoff_factor=1,
39
+ status_forcelist=[500, 502, 503, 504]
40
+ )
41
+ session.mount('https://', HTTPAdapter(max_retries=retries))
42
+
43
+ payload = {
44
+ "model": self.model,
45
+ "messages": [
46
+ {
47
+ "role": "system",
48
+ "content": self._build_system_prompt(role)
49
+ },
50
+ {
51
+ "role": "user",
52
+ "content": prompt
53
+ }
54
+ ],
55
+ "temperature": 0.7,
56
+ "max_tokens": 2000
57
+ }
58
+
59
+ try:
60
+ response = session.post(
61
+ f"{self.base_url}/v1/chat/completions",
62
+ headers=self.headers,
63
+ json=payload,
64
+ timeout=(10, 60)
65
+ )
66
+ response.raise_for_status()
67
+ return response.json()['choices'][0]['message']['content']
68
+ except requests.exceptions.Timeout:
69
+ print("Deepseek API request timeout, retrying...")
70
+ return "Sorry, due to network issues, content generation is temporarily unavailable. Please try again later."
71
+ except requests.exceptions.RequestException as e:
72
+ print(f"Error calling Deepseek API: {str(e)}")
73
+ return "Sorry, an error occurred while generating content. Please try again later."
74
+
75
+ def summarize_document(self, content: str, title: str, url: str) -> str:
76
+ """Summarize a document with Deepseek"""
77
+ prompt = f"""Please analyze the following tourism web content and generate a rich summary paragraph.
78
+
79
+ Web Title: {title}
80
+ Web Link: {url}
81
+
82
+ Web Content:
83
+ {content[:4000]}
84
+
85
+ Requirements:
86
+ 1. The summary should be between 300-500 words
87
+ 2. Keep the most important tourism information (attractions, suggestions, tips, etc.)
88
+ 3. Use an objective tone
89
+ 4. Information should be accurate and practical
90
+ 5. Remove marketing and advertising content
91
+ 6. Maintain logical coherence
92
+
93
+ Please return the summary content directly, without any other explanation."""
94
+
95
+ return self.generate(prompt, role="summarizer")
96
+
97
+ def generate_travel_plan(self, query: str, context: List[Dict]) -> str:
98
+ # Build a more structured context
99
+ context_text = "\n\n".join([
100
+ f"Source {i+1} ({doc.get('title', 'Unknown Title')}):\n{doc['passage']}"
101
+ for i, doc in enumerate(context)
102
+ ])
103
+
104
+ prompt = f"""As a professional travel planner, please create a detailed travel plan based on the user's needs and reference materials.
105
+
106
+ User Needs: {query}
107
+
108
+ Reference Materials:
109
+ {context_text}
110
+
111
+ Please provide the following content:
112
+ 1. Itinerary Overview (Overall arrangement and key attractions)
113
+ 2. Daily detailed itinerary (includes specific time, location, and transportation methods)
114
+ 3. Traffic suggestions (includes practical APP recommendations)
115
+ 4. Accommodation recommendations (includes specific areas and hotel suggestions)
116
+ 5. Food recommendations (includes specialty restaurants and snacks)
117
+ 6. Practical tips (weather, clothing, essential items, etc.)
118
+
119
+ Requirements:
120
+ 1. The itinerary should be reasonable, considering the distance between attractions
121
+ 2. Provide specific time points
122
+ 3. Include detailed traffic guidance
123
+ 4. Suggestions should be specific and practical
124
+ 5. Consider actual conditions (e.g., opening hours of attractions)
125
+
126
+ Please return the travel plan content directly, without any other explanation."""
127
+
128
+ return self.generate(prompt, role="planner")
129
+
130
+ class OllamaInterface(LLMInterface):
131
+ def __init__(self, base_url: str, model: str):
132
+ self.base_url = base_url.rstrip('/')
133
+ self.model = model
134
+ self.headers = {
135
+ "Content-Type": "application/json"
136
+ }
137
+
138
+ def _build_system_prompt(self, role: str) -> str:
139
+ """Build the system prompt"""
140
+ roles = {
141
+ "summarizer": "You are a professional tourism content analyst, good at extracting and summarizing key tourism-related information. Please answer in English",
142
+ "planner": "You are a professional travel planner who is good at making detailed travel plans. Please answer in English"
143
+ }
144
+ return roles.get(role, "You are a professional AI assistant. Please answer in English")
145
+
146
+ def generate(self, prompt: str, role: str = "planner") -> str:
147
+ import requests
148
+
149
+ payload = {
150
+ "model": self.model,
151
+ "messages": [
152
+ {
153
+ "role": "system",
154
+ "content": self._build_system_prompt(role)
155
+ },
156
+ {
157
+ "role": "user",
158
+ "content": prompt
159
+ }
160
+ ],
161
+ "stream": False
162
+ }
163
+
164
+ try:
165
+ response = requests.post(
166
+ f"{self.base_url}/api/chat",
167
+ headers=self.headers,
168
+ json=payload,
169
+ timeout=(10, 60)
170
+ )
171
+ response.raise_for_status()
172
+ return response.json()['message']['content']
173
+ except Exception as e:
174
+ print(f"Error calling Ollama API: {str(e)}")
175
+ return "Sorry, an error occurred while generating content. Please try again later."
176
+
177
+ def summarize_document(self, content: str, title: str, url: str) -> str:
178
+ """Summarize a document with Ollama"""
179
+ prompt = f"""Please analyze the following tourism web content and generate a rich summary paragraph.
180
+
181
+ Web Title: {title}
182
+ Web Link: {url}
183
+
184
+ Web Content:
185
+ {content[:4000]}
186
+
187
+ Requirements:
188
+ 1. The summary should be between 300-500 words
189
+ 2. Keep the most important tourism information (attractions, suggestions, tips, etc.)
190
+ 3. Use an objective tone
191
+ 4. Information should be accurate and practical
192
+ 5. Remove marketing and advertising content
193
+ 6. Maintain logical coherence
194
+
195
+ Please return the summary content directly, without any other explanation."""
196
+
197
+ return self.generate(prompt, role="summarizer")
198
+
199
+ def generate_travel_plan(self, query: str, context: List[Dict]) -> str:
200
+ # Build a more structured context
201
+ context_text = "\n\n".join([
202
+ f"Source {i+1} ({doc.get('title', 'Unknown Title')}):\n{doc['passage']}"
203
+ for i, doc in enumerate(context)
204
+ ])
205
+
206
+ prompt = f"""As a professional travel planner, please create a detailed travel plan based on the user's needs and reference materials.
207
+
208
+ User Needs: {query}
209
+
210
+ Reference Materials:
211
+ {context_text}
212
+
213
+ Please provide the following content:
214
+ 1. Itinerary Overview (Overall arrangement and key attractions)
215
+ 2. Daily detailed itinerary (includes specific time, location, and transportation methods)
216
+ 3. Traffic suggestions (includes practical APP recommendations)
217
+ 4. Accommodation recommendations (includes specific areas and hotel suggestions)
218
+ 5. Food recommendations (includes specialty restaurants and snacks)
219
+ 6. Practical tips (weather, clothing, essential items, etc.)
220
+
221
+ Requirements:
222
+ 1. The itinerary should be reasonable, considering the distance between attractions
223
+ 2. Provide specific time points
224
+ 3. Include detailed traffic guidance
225
+ 4. Suggestions should be specific and practical
226
+ 5. Consider actual conditions (e.g., opening hours of attractions)
227
+
228
+ Please return the travel plan content directly, without any other explanation."""
229
+
230
+ return self.generate(prompt, role="planner")
231
+
232
+ class OpenAIInterface(LLMInterface):
233
+ def __init__(self, api_key: str, model: str = "gpt-4o", base_url: str = "https://api.feidaapi.com/v1"):
234
+ self.api_key = api_key
235
+ self.model = model
236
+ self.client = OpenAI(api_key=api_key, base_url=base_url)
237
+
238
+ def _build_system_prompt(self, role: str) -> str:
239
+ """Build the system prompt"""
240
+ roles = {
241
+ "summarizer": "You are a professional tourism content analyst, good at extracting and summarizing key tourism-related information. Please answer in English",
242
+ "planner": "You are a professional travel planner who is good at making detailed travel plans. Please answer in English"
243
+ }
244
+ return roles.get(role, "You are a professional AI assistant. Please answer in English")
245
+
246
+ def generate(self, prompt: str, role: str = "planner") -> str:
247
+ try:
248
+ messages = [
249
+ {"role": "system", "content": self._build_system_prompt(role)},
250
+ {"role": "user", "content": prompt}
251
+ ]
252
+
253
+ response = self.client.chat.completions.create(
254
+ model=self.model,
255
+ messages=messages,
256
+ temperature=0.7,
257
+ max_tokens=2000
258
+ )
259
+
260
+ return response.choices[0].message.content
261
+
262
+ except Exception as e:
263
+ logging.error(f"Error calling OpenAI API: {str(e)}")
264
+ return "Sorry, an error occurred while generating content. Please try again later."
265
+
266
+ def summarize_document(self, content: str, title: str, url: str) -> str:
267
+ """Summarize a document with OpenAI"""
268
+ prompt = f"""Please analyze the following tourism web content and generate a rich summary paragraph.
269
+
270
+ Web Title: {title}
271
+ Web Link: {url}
272
+
273
+ Web Content:
274
+ {content[:4000]}
275
+
276
+ Requirements:
277
+ 1. The summary should be between 300-500 words
278
+ 2. Keep the most important tourism information (attractions, suggestions, tips, etc.)
279
+ 3. Use an objective tone
280
+ 4. Information should be accurate and practical
281
+ 5. Remove marketing and advertising content
282
+ 6. Maintain logical coherence
283
+
284
+ Please return the summary content directly, without any other explanation."""
285
+
286
+ return self.generate(prompt, role="summarizer")
287
+
288
+ def generate_travel_plan(self, query: str, context: List[Dict]) -> str:
289
+ # Build a more structured context
290
+ context_text = "\n\n".join([
291
+ f"Source {i+1} ({doc.get('title', 'Unknown Title')}):\n{doc['passage']}"
292
+ for i, doc in enumerate(context)
293
+ ])
294
+
295
+ prompt = f"""As a professional travel planner, please create a detailed travel plan based on the user's needs and reference materials.
296
+
297
+ User Needs: {query}
298
+
299
+ Reference Materials:
300
+ {context_text}
301
+
302
+ Please provide the following content:
303
+ 1. Itinerary Overview (Overall arrangement and key attractions)
304
+ 2. Daily detailed itinerary (includes specific time, location, and transportation methods)
305
+ 3. Traffic suggestions (includes practical APP recommendations)
306
+ 4. Accommodation recommendations (includes specific areas and hotel suggestions)
307
+ 5. Food recommendations (includes specialty restaurants and snacks)
308
+ 6. Practical tips (weather, clothing, essential items, etc.)
309
+
310
+ Requirements:
311
+ 1. The itinerary should be reasonable, considering the distance between attractions
312
+ 2. Provide specific time points
313
+ 3. Include detailed traffic guidance
314
+ 4. Suggestions should be specific and practical
315
+ 5. Consider actual conditions (e.g., opening hours of attractions)
316
+
317
+ Please return the travel plan content directly, without any other explanation."""
318
+
319
+ return self.generate(prompt, role="planner")
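
All three classes implement the same informal surface (generate, summarize_document, generate_travel_plan), so callers can stay provider-agnostic. A sketch, with placeholder keys:

```python
from src.api.llm_api import DeepseekInterface, OpenAIInterface

# Either instance can be dropped into DocumentProcessor or PlanGenerator.
llm = DeepseekInterface(
    api_key="<your-deepseek-key>",          # placeholder
    base_url="https://api.deepseek.com",
    model="deepseek-chat",
)
# llm = OpenAIInterface(api_key="<your-openai-key>", model="gpt-4o")

print(llm.generate("Suggest a half-day itinerary for Tsim Sha Tsui.", role="planner"))
```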
src/api/ollama_api.py ADDED
@@ -0,0 +1,101 @@
1
+ from typing import List, Dict
2
+ from .llm_api import LLMInterface
3
+ import requests
4
+ import json
5
+
6
+ class OllamaInterface(LLMInterface):
7
+ def __init__(self, model_name: str = "qwen2.5:32b_ctx32k", base_url: str = "http://localhost:11434"):
8
+ self.model_name = model_name
9
+ self.base_url = base_url
10
+
11
+ def generate(self, prompt: str) -> str:
12
+ """Generate a response with Ollama"""
13
+ try:
14
+ response = requests.post(
15
+ f"{self.base_url}/api/generate",
16
+ json={
17
+ "model": self.model_name,
18
+ "prompt": prompt,
19
+ "stream": False,
20
+ "options": {
21
+ "temperature": 0.7,
22
+ "top_p": 0.9,
23
+ "top_k": 40,
24
+ }
25
+ }
26
+ )
27
+ response.raise_for_status()
28
+ return response.json()['response']
29
+ except Exception as e:
30
+ print(f"Ollama generation error: {e}")
31
+ return ""
32
+
33
+ def _build_system_prompt(self, role: str) -> str:
34
+ """Build the system prompt"""
35
+ roles = {
36
+ "summarizer": "You are a professional tourism content analyst, good at extracting and summarizing key tourism-related information.",
37
+ "planner": "You are a professional travel planner who is good at making detailed travel plans."
38
+ }
39
+ return roles.get(role, "You are a professional AI assistant.")
40
+
41
+ def summarize_document(self, content: str, title: str, url: str) -> str:
42
+ """Summarize a document with Qwen 2.5"""
43
+ system_prompt = self._build_system_prompt("summarizer")
44
+ prompt = f"""{system_prompt}
45
+
46
+ Please analyze the following tourism web content and generate a rich summary paragraph.
47
+
48
+ Web Title: {title}
49
+ Web Link: {url}
50
+
51
+ Web Content:
52
+ {content[:4000]}
53
+
54
+ Requirements:
55
+ 1. The summary should be between 300-500 words
56
+ 2. Keep the most important tourism information (attractions, suggestions, tips, etc.)
57
+ 3. Use an objective tone
58
+ 4. Information should be accurate and practical
59
+ 5. Remove marketing and advertising content
60
+ 6. Maintain logical coherence
61
+
62
+ Please return the summary content directly, without any other explanation."""
63
+
64
+ return self.generate(prompt)
65
+
66
+ def generate_travel_plan(self, query: str, context: List[Dict]) -> str:
67
+ """Generate a travel plan with Qwen 2.5"""
68
+ system_prompt = self._build_system_prompt("planner")
69
+
70
+ context_text = "\n\n".join([
71
+ f"Source {i+1}:\n{doc['passage']}"
72
+ for i, doc in enumerate(context)
73
+ ])
74
+
75
+ prompt = f"""{system_prompt}
76
+
77
+ Based on the following information, please create a detailed travel plan for the user.
78
+
79
+ User Needs: {query}
80
+
81
+ Reference Information:
82
+ {context_text}
83
+
84
+ Please provide the following content:
85
+ 1. Itinerary overview
86
+ 2. Detailed daily itinerary
87
+ 3. Transportation suggestions
88
+ 4. Accommodation recommendations
89
+ 5. Food recommendations
90
+ 6. Notes and practical tips
91
+
92
+ Requirements:
93
+ 1. The plan should be detailed and practical
94
+ 2. The schedule should be reasonable
95
+ 3. Suggestions should be specific
96
+ 4. Consider actual conditions (e.g., travel time, opening hours of attractions)
97
+ 5. Reasonable details may be added based on the context
98
+
99
+ Please return the travel plan content directly, without any other explanation."""
100
+
101
+ return self.generate(prompt)
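
This module defines a second OllamaInterface alongside the one in llm_api.py; this variant targets a local Ollama server. A usage sketch, assuming Ollama is running on its default port with the named model already pulled:

```python
from src.api.ollama_api import OllamaInterface

llm = OllamaInterface(model_name="qwen2.5:32b_ctx32k",
                      base_url="http://localhost:11434")
print(llm.generate("List three must-see attractions in Hong Kong."))
```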
src/api/routes.py ADDED
@@ -0,0 +1,141 @@
1
+ from fastapi import APIRouter, HTTPException, Request
2
+ from typing import Dict, Any
3
+ from pydantic import BaseModel
4
+ from src.core.document_processor import DocumentProcessor
5
+ from src.core.ranking import RankingSystem
6
+ from src.core.plan_generator import PlanGenerator
7
+ from src.core.embeddings import EmbeddingModel
8
+ from src.core.reranker import Reranker
9
+ from src.api.search_api import GoogleSearch, BochaSearch
10
+ from src.utils.helpers import setup_proxy
11
+ from src.api.llm_api import DeepseekInterface
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Create the router
17
+ router = APIRouter(
18
+ tags=["travel"] # Swagger UI tag
19
+ )
20
+
21
+ # Root route
22
+ @router.get("/")
23
+ async def root():
24
+ return {
25
+ "message": "Welcome to the Travel Recommendation System API",
26
+ "status": "running",
27
+ "version": "1.0.0",
28
+ "endpoints": {
29
+ "health_check": "/health",
30
+ "recommendations": "/api/v1/recommend",
31
+ "docs": "/docs"
32
+ }
33
+ }
34
+
35
+ # Request model
36
+ class TravelQuery(BaseModel):
37
+ query: str
38
+ location: str = None
39
+ max_results: int = 10
40
+
41
+ # Response model
42
+ class TravelResponse(BaseModel):
43
+ recommendations: list
44
+ query: str
45
+ metadata: Dict[str, Any]
46
+
47
+ def init_app(app):
48
+ """Initialize the app"""
49
+ # Set up the proxy and get its availability
50
+ proxies, proxy_available = setup_proxy(app.state.config)
51
+ app.state.proxies = proxies
52
+
53
+ # Choose the search engine based on proxy availability
54
+ if proxy_available:
55
+ app.state.search = GoogleSearch(
56
+ api_key=app.state.config['google_api_key'],
57
+ cx=app.state.config['google_cx'],
58
+ proxies=proxies
59
+ )
60
+ logging.info("Using the Google search engine")
61
+ else:
62
+ app.state.search = BochaSearch(
63
+ api_key=app.state.config['bocha_api_key'],
64
+ base_url=app.state.config['bocha_base_url']
65
+ )
66
+ logging.info("Using the Bocha search engine")
67
+
68
+ # Initialize the Deepseek LLM (look it up in the providers list from config.yaml)
69
+ app.state.llm = DeepseekInterface(
70
+ api_key=app.state.config['deepseek_api_key'],
71
+ base_url=next(p['base_url'] for p in app.state.config['llm_settings']['providers'] if p['name'] == 'deepseek'),
72
+ model=next(p['models'][0] for p in app.state.config['llm_settings']['providers'] if p['name'] == 'deepseek')
73
+ )
74
+
75
+ # ... other initialization code ...
76
+
77
+ @router.post("/api/v1/recommend", response_model=TravelResponse)
78
+ async def get_travel_recommendations(query: TravelQuery, request: Request):
79
+ """
80
+ Get travel recommendations
81
+ """
82
+ logger.info(f"Received query request: {query.dict()}")
83
+ try:
84
+ # Use the proxy-configured search instance
85
+ search = request.app.state.search
86
+ llm = request.app.state.llm
87
+
88
+ # Run the search
89
+ logger.info("Starting search...")
90
+ search_results = search.search(query.query)
91
+ logger.info(f"Search finished with {len(search_results)} results")
92
+
93
+ # Process the documents
94
+ doc_processor = DocumentProcessor(llm)
95
+ passages = doc_processor.process_documents(search_results)
96
+ passages = [{'passage': p} for p in passages]
97
+ logging.info(f"Passages structure: {passages[:1]}")  # log the structure of the first element
98
+
99
+ # Initialize the ranking system
100
+ embedding_model = EmbeddingModel("BAAI/bge-m3")
101
+ reranker = Reranker("BAAI/bge-reranker-large")
102
+ ranking_system = RankingSystem(embedding_model, reranker)
103
+
104
+ # Two-stage ranking
105
+ initial_ranked = ranking_system.initial_ranking(
106
+ query.query,
107
+ passages,
108
+ 10 # initial_top_k
109
+ )
110
+
111
+ final_ranked = ranking_system.rerank(
112
+ query.query,
113
+ initial_ranked,
114
+ 3 # final_top_k
115
+ )
116
+
117
+ # Generate the plan
118
+ plan_generator = PlanGenerator(llm)
119
+ final_plan = plan_generator.generate_plan(query.query, final_ranked)
120
+
121
+ return TravelResponse(
122
+ recommendations=[final_plan['plan']],
123
+ query=query.query,
124
+ metadata={
125
+ "location": query.location,
126
+ "max_results": query.max_results,
127
+ "sources": final_plan['sources']
128
+ }
129
+ )
130
+
131
+ except Exception as e:
132
+ logger.error(f"Error while handling the request: {str(e)}", exc_info=True)
133
+ raise HTTPException(status_code=500, detail=str(e))
134
+
135
+ # Health-check endpoint
136
+ @router.get("/health")
137
+ async def health_check():
138
+ """
139
+ API health-check endpoint
140
+ """
141
+ return {"status": "healthy"}
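
The recommend endpoint takes a TravelQuery body and returns a TravelResponse. A client-side sketch, assuming the API is served locally on port 8000 as above:

```python
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/recommend",
    json={"query": "Two-day Hong Kong food tour",
          "location": "Hong Kong",
          "max_results": 10},
    timeout=300,  # search + two-stage ranking + plan generation can be slow
)
resp.raise_for_status()
body = resp.json()
print(body["recommendations"][0])   # the generated plan text
print(body["metadata"]["sources"])  # reference sources from the plan generator
```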
src/api/search_api.py ADDED
@@ -0,0 +1,263 @@
1
+ from typing import List, Dict
2
+ import requests
3
+ from abc import ABC, abstractmethod
4
+ import logging
5
+ import json
6
+
7
+ logging.basicConfig(level=logging.DEBUG)
8
+
9
+ class SearchEngine(ABC):
10
+ def __init__(self):
11
+ # 广告域名黑名单
12
+ self.ad_domains = {
13
+ 'ads.google.com',
14
+ 'doubleclick.net',
15
+ 'affiliate.',
16
+ '.ads.',
17
+ 'promotion.',
18
+ 'sponsored.',
19
+ 'partner.',
20
+ 'tracking.',
21
+ '.shop.',
22
+ 'taobao.com',
23
+ 'tmall.com',
24
+ 'jd.com',
25
+ 'mafengwo.cn', # 蚂蜂窝
26
+ 'ctrip.com', # 携程
27
+ 'tour.aoyou.com', # 同程
28
+ 'wannar.com' # 玩哪儿
29
+ }
30
+
31
+ def is_ad_url(self, url: str) -> bool:
32
+ """检查URL是否为广告链接"""
33
+ url_lower = url.lower()
34
+ return any(ad_domain in url_lower for ad_domain in self.ad_domains)
35
+
36
+ def enhance_query(self, query: str) -> str:
37
+ """增强查询词,添加香港旅游关键词"""
38
+ if "Hong Kong" not in query:
39
+ query = f"Hong Kong Tourism{query}"
40
+ return query
41
+
42
+ @abstractmethod
43
+ def search(self, query: str) -> List[Dict]:
44
+ pass
45
+
46
+ class GoogleSearch(SearchEngine):
47
+ def __init__(self, api_key: str, cx: str, proxies: Dict[str, str] = None):
48
+ super().__init__()
49
+ self.api_key = api_key
50
+ self.cx = cx
51
+ self.base_url = "https://www.googleapis.com/customsearch/v1"
52
+ self.proxies = proxies or {}
53
+
54
+ def filter_results(self, results: List[Dict]) -> List[Dict]:
55
+ """过滤搜索结果"""
56
+ filtered = []
57
+ for result in results:
58
+ url = result['url'].lower()
59
+ # 只过滤广告域名
60
+ if not self.is_ad_url(url):
61
+ filtered.append(result)
62
+ return filtered
63
+
64
+ def search(self, query: str) -> List[Dict]:
65
+ # 增强查询词
66
+ enhanced_query = self.enhance_query(query)
67
+
68
+ params = {
69
+ 'key': self.api_key,
70
+ 'cx': self.cx,
71
+ 'q': enhanced_query
72
+ }
73
+ response = requests.get(self.base_url, params=params)
74
+ if response.status_code == 200:
75
+ results = response.json()
76
+ return [{
77
+ 'title': item['title'],
78
+ 'snippet': item['snippet'],
79
+ 'url': item['link']
80
+ } for item in results.get('items', [])]
81
+ return []
82
+
83
+ class BochaSearch(SearchEngine):
84
+ def __init__(self, api_key: str, base_url: str, proxies: Dict[str, str] = None):
85
+ super().__init__()
86
+ self.api_key = api_key
87
+ self.base_url = base_url.rstrip('/') # 移除末尾可能的斜杠
88
+ self.proxies = proxies or {}
89
+
90
+ def search(self, query: str) -> List[Dict]:
91
+ try:
92
+ # 增强查询词
93
+ enhanced_query = self.enhance_query(query)
94
+
95
+ headers = {
96
+ 'Authorization': f'Bearer {self.api_key}',
97
+ 'Content-Type': 'application/json',
98
+ 'Connection': 'keep-alive',
99
+ 'Accept': '*/*'
100
+ }
101
+
102
+ payload = {
103
+ 'query': enhanced_query,
104
+ 'stream': False # 使用非流式返回
105
+ }
106
+
107
+ # 使用正确的端点
108
+ endpoint = f"{self.base_url}/v1/ai-search"
109
+
110
+ logging.info(f"正在请求博查API...")
111
+ logging.info(f"增强后的查询词: {enhanced_query}")
112
+
113
+ response = requests.post(
114
+ endpoint,
115
+ headers=headers,
116
+ json=payload,
117
+ proxies=None
118
+ )
119
+
120
+ # 详细打印响应信息
121
+ logging.info(f"API响应状态码: {response.status_code}")
122
+ logging.info(f"API响应内容: {response.text[:500]}...") # 只打印前500个字符
123
+
124
+ if response.status_code != 200:
125
+ logging.error(f"API请求失败,状态码: {response.status_code}")
126
+ logging.error(f"错误响应: {response.text}")
127
+ return []
128
+
129
+ response_json = response.json()
130
+ if response_json.get('code') == 200 and 'messages' in response_json:
131
+ messages = response_json['messages']
132
+ if messages and isinstance(messages, list):
133
+ for msg in messages:
134
+ if msg.get('type') == 'source' and msg.get('content_type') == 'webpage':
135
+ try:
136
+ content = json.loads(msg['content'])
137
+ if 'value' in content:
138
+ return content['value']
139
+ except json.JSONDecodeError:
140
+ logging.error(f"无法解析消息内容: {msg['content']}")
141
+ continue
142
+
143
+ logging.error(f"API返回数据结构异常: {response_json}")
144
+ return []
145
+ except Exception as e:
146
+ logging.error(f"处理API响应时出错: {str(e)}")
147
+ return []
148
+
149
+ def search_images(self, query: str, count: int = 3) -> List[Dict]:
150
+ """搜索相关图片"""
151
+ try:
152
+ headers = {
153
+ 'Authorization': f'Bearer {self.api_key}',
154
+ 'Content-Type': 'application/json'
155
+ }
156
+
157
+ # 增强查询词
158
+ enhanced_query = self.enhance_query(query)
159
+ logging.info(f"增强后的图片搜索查询: {enhanced_query}")
160
+
161
+ payload = {
162
+ 'query': enhanced_query,
163
+ 'freshness': 'oneYear',
164
+ 'count': 10, # 搜索更多图片以确保有足够的有效结果
165
+ 'filter': 'images'
166
+ }
167
+
168
+ endpoint = f"{self.base_url}/v1/web-search"
169
+
170
+ response = requests.post(
171
+ endpoint,
172
+ headers=headers,
173
+ json=payload,
174
+ timeout=10
175
+ )
176
+
177
+ if response.status_code == 200:
178
+ try:
179
+ data = response.json()
180
+ logging.info(f"API返回数据结构: {data.keys()}")
181
+
182
+ if data.get('code') == 200 and 'data' in data:
183
+ data_content = data['data']
184
+ logging.info(f"data字段内容: {data_content.keys()}")
185
+
186
+ images = []
187
+ if 'images' in data_content:
188
+ image_items = data_content['images'].get('value', [])
189
+ logging.info(f"找到 {len(image_items)} 张图片")
190
+
191
+ for item in image_items:
192
+ # 简化过滤条件,只检查基本必要条件
193
+ if (item.get('contentUrl') and
194
+ item.get('width', 0) >= 300 and
195
+ item.get('height', 0) >= 300):
196
+
197
+ image_info = {
198
+ 'url': item['contentUrl'],
199
+ 'width': item['width'],
200
+ 'height': item['height']
201
+ }
202
+ images.append(image_info)
203
+ if len(images) >= count:
204
+ break
205
+
206
+ logging.info(f"最终返回 {len(images)} 张图片")
207
+ return images[:count]
208
+
209
+ except json.JSONDecodeError as e:
210
+ logging.error(f"JSON解析错误: {str(e)}")
211
+ return []
212
+ except Exception as e:
213
+ logging.error(f"处理图片数据时出错: {str(e)}")
214
+ return []
215
+
216
+ logging.error(f"API请求失败,状态码: {response.status_code}")
217
+ return []
218
+
219
+ except Exception as e:
220
+ logging.error(f"图片搜索出错: {str(e)}")
221
+ return []
222
+
223
+ """
224
+ class BingSearch(SearchEngine):
225
+ def __init__(self, api_key: str):
226
+ super().__init__()
227
+ self.api_key = api_key
228
+ self.base_url = "https://api.bing.microsoft.com/v7.0/search"
229
+
230
+ def search(self, query: str) -> List[Dict]:
231
+ # 只添加香港旅游关键词
232
+ enhanced_query = f"香港旅游 {query}"
233
+
234
+ headers = {'Ocp-Apim-Subscription-Key': self.api_key}
235
+ params = {
236
+ 'q': enhanced_query
237
+ }
238
+
239
+ response = requests.get(
240
+ self.base_url,
241
+ headers=headers,
242
+ params=params
243
+ )
244
+ results = response.json()
245
+
246
+ filtered_results = []
247
+ for item in results.get('webPages', {}).get('value', []):
248
+ if not self.is_ad_url(item['url']):
249
+ filtered_results.append({
250
+ 'title': item['name'],
251
+ 'snippet': item['snippet'],
252
+ 'url': item['url']
253
+ })
254
+
255
+ return filtered_results
256
+
257
+ def is_trusted_domain(self, url: str) -> bool:
258
+ ""检查是否为可信域名""
259
+ return any(
260
+ trusted_domain in url.lower()
261
+ for trusted_domain in self.config['search_settings']['trusted_domains']
262
+ )
263
+ """
src/core/__pycache__/document_processor.cpython-310.pyc ADDED
Binary file (2.4 kB).

src/core/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (1.77 kB).

src/core/__pycache__/html_processor.cpython-310.pyc ADDED
Binary file (5.69 kB).

src/core/__pycache__/plan_generator.cpython-310.pyc ADDED
Binary file (3.5 kB).

src/core/__pycache__/ranking.cpython-310.pyc ADDED
Binary file (3.69 kB).

src/core/__pycache__/reranker.cpython-310.pyc ADDED
Binary file (1.87 kB).
src/core/_init_.py ADDED
@@ -0,0 +1,14 @@
+ """
+ Core package initialization.
+ This module contains the core functionality for the travel RAG system.
+ """
+
+ from .document_processor import DocumentProcessor
+ from .ranking import RankingSystem
+ from .plan_generator import PlanGenerator
+
+ __all__ = [
+     'DocumentProcessor',
+     'RankingSystem',
+     'PlanGenerator'
+ ]
src/core/document_processor.py ADDED
@@ -0,0 +1,80 @@
+ from typing import List, Dict
+ from src.core.html_processor import HTMLProcessor
+ from src.api.llm_api import LLMInterface
+ import logging
+ import re
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ class DocumentProcessor:
+     def __init__(self, llm: LLMInterface):
+         self.html_processor = HTMLProcessor()
+         self.llm = llm
+         # Cache of processed documents, keyed by URL
+         self.cache = {}
+
+     def _clean_text(self, text: str) -> str:
+         """Clean text content."""
+         # Collapse extra whitespace
+         text = re.sub(r'\s+', ' ', text)
+         # Remove special characters (keep word chars, whitespace, CJK and common Chinese punctuation)
+         text = re.sub(r'[^\w\s\u4e00-\u9fff。,!?、]', '', text)
+         return text.strip()
+
+     def process_documents(self, search_results: List[Dict]) -> List[Dict]:
+         processed_docs = []
+
+         # Process documents in parallel
+         with ThreadPoolExecutor(max_workers=5) as executor:
+             futures = []
+             for result in search_results:
+                 if result['url'] in self.cache:
+                     processed_docs.append(self.cache[result['url']])
+                     continue
+
+                 futures.append(
+                     executor.submit(self._process_single_doc, result)
+                 )
+
+             for future in as_completed(futures):
+                 try:
+                     doc = future.result()
+                     if doc:
+                         self.cache[doc['url']] = doc
+                         processed_docs.append(doc)
+                 except Exception as e:
+                     logging.error(f"Failed to process document: {str(e)}")
+
+         return processed_docs[:5]  # Limit the number of returned documents
+
+     def _process_single_doc(self, result: Dict) -> Dict:
+         try:
+             # Fetch the HTML content
+             html = self.html_processor.fetch_html(result['url'])
+             if not html:
+                 return None
+
+             # Extract the main content
+             content = self.html_processor.extract_main_content(html)
+             content = self._clean_text(content)
+
+             if len(content) < 100:  # Content too short
+                 return None
+
+             # Generate a more targeted summary
+             summary = self.llm.summarize_document(
+                 content=content,
+                 title=result.get('title', ''),
+                 url=result['url']
+             )
+
+             if summary:
+                 return {
+                     'passage': summary,
+                     'title': result.get('title', ''),
+                     'url': result['url']
+                 }
+
+         except Exception as e:
+             logging.error(f"Failed to process document ({result['url']}): {str(e)}")
+         return None
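
A minimal sketch of the processing flow, assuming an `llm` object that implements `summarize_document()` as used above (the URL is illustrative):

    from src.core.document_processor import DocumentProcessor

    processor = DocumentProcessor(llm)  # llm: any LLMInterface implementation
    docs = processor.process_documents([
        {"url": "https://example.com/hk-guide", "title": "HK Guide"},
    ])
    # Each entry: {'passage': <LLM summary>, 'title': ..., 'url': ...};
    # repeated URLs are served from self.cache on later calls
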
src/core/embeddings.py ADDED
@@ -0,0 +1,41 @@
+ from sentence_transformers import SentenceTransformer
+ import torch
+ import logging
+
+ class EmbeddingModel:
+     def __init__(self, model_name="BAAI/bge-m3"):
+         try:
+             # Load by Hugging Face model ID
+             self.model = SentenceTransformer(model_name)
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+             self.model.to(self.device)
+             logging.info(f"Loaded embedding model {model_name} on device {self.device}")
+         except Exception as e:
+             logging.error(f"Failed to load model: {str(e)}")
+             raise
+
+     def encode(self, texts, batch_size=32):
+         """
+         Encode texts into vector representations.
+         """
+         embeddings = self.model.encode(
+             texts,
+             batch_size=batch_size,
+             show_progress_bar=True,
+             normalize_embeddings=True
+         )
+         return embeddings
+
+     def encode_queries(self, queries):
+         """
+         Add the special query prefix and encode.
+         BGE models recommend prefixing queries with
+         "Represent this sentence for searching relevant passages: ".
+         """
+         prefix = "Represent this sentence for searching relevant passages: "
+         if isinstance(queries, str):
+             queries = [queries]
+
+         prefixed_queries = [prefix + query for query in queries]
+         return self.encode(prefixed_queries)
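
Because `normalize_embeddings=True` L2-normalizes the vectors, a plain dot product equals cosine similarity. A small sketch of the asymmetric query/passage encoding (texts are illustrative):

    import numpy as np
    from src.core.embeddings import EmbeddingModel

    model = EmbeddingModel("BAAI/bge-m3")
    q = model.encode_queries("best dim sum in Hong Kong")        # prefixed, shape (1, dim)
    p = model.encode(["Tim Ho Wan serves classic dim sum ..."])  # shape (1, dim)
    score = float(np.dot(q[0], p[0]))  # cosine similarity in [-1, 1]
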
src/core/html_processor.py ADDED
@@ -0,0 +1,195 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import List, Dict
+ import logging
+
+ class HTMLProcessor:
+     def __init__(self, timeout: int = 5):
+         self.session = requests.Session()
+         self.timeout = timeout
+
+     def fetch_html(self, url: str) -> str:
+         """Fetch the HTML content of a single URL."""
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+             'Accept-Encoding': 'gzip, deflate, br',
+             'Connection': 'keep-alive'
+         }
+
+         try:
+             logging.info(f"Fetching URL: {url}")
+             response = self.session.get(
+                 url,
+                 timeout=self.timeout,
+                 headers=headers,
+                 verify=False
+             )
+             response.raise_for_status()
+
+             # Check the response content type
+             content_type = response.headers.get('content-type', '')
+             if 'text/html' not in content_type.lower():
+                 logging.warning(f"Non-HTML response: {content_type}")
+
+             # Set the correct encoding
+             response.encoding = response.apparent_encoding
+
+             html = response.text
+             logging.info(f"Fetched HTML successfully, length: {len(html)}")
+
+             return html
+
+         except requests.Timeout:
+             logging.error(f"Timed out fetching URL: {url}")
+         except requests.RequestException as e:
+             logging.error(f"Failed to fetch URL {url}: {str(e)}")
+         except Exception as e:
+             logging.error(f"Unexpected error for {url}: {str(e)}")
+
+         return ""
+
+     def fetch_multiple_html(self, urls: List[str], max_urls: int = 10) -> List[Dict]:
+         """
+         Fetch the HTML content of multiple URLs in parallel.
+
+         Args:
+             urls: List of URLs
+             max_urls: Maximum number of URLs to fetch
+
+         Returns:
+             List[Dict]: List of successfully fetched HTML contents
+         """
+         results = []
+         urls = urls[:max_urls]  # Only process the first max_urls URLs
+
+         with ThreadPoolExecutor(max_workers=max_urls) as executor:
+             # Submit all tasks
+             future_to_url = {
+                 executor.submit(self.fetch_html, url): url
+                 for url in urls
+             }
+
+             # Handle completed tasks
+             for future in as_completed(future_to_url):
+                 url = future_to_url[future]
+                 try:
+                     html = future.result()
+                     if html:  # Only keep successful fetches
+                         results.append({
+                             'url': url,
+                             'html': html,
+                             'metadata': self.extract_metadata(html)
+                         })
+                 except Exception as e:
+                     logging.error(f"Failed to process URL {url}: {e}")
+
+         return results
+
+     def extract_main_content(self, html: str) -> str:
+         """Extract the main content from HTML."""
+         if not html:
+             logging.warning("Input HTML is empty")
+             return ""
+
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+
+             # Remove script, style and boilerplate elements
+             for script in soup(["script", "style", "iframe", "nav", "footer", "header"]):
+                 script.decompose()
+
+             # Record the original length
+             original_length = len(html)
+
+             # Try to find the main content container
+             main_content = None
+             possible_content_ids = ['content', 'main', 'article', 'post']
+             possible_content_classes = ['content', 'article', 'post', 'main-content']
+
+             # Look up by ID
+             for content_id in possible_content_ids:
+                 main_content = soup.find(id=content_id)
+                 if main_content:
+                     break
+
+             # Look up by class
+             if not main_content:
+                 for content_class in possible_content_classes:
+                     main_content = soup.find(class_=content_class)
+                     if main_content:
+                         break
+
+             # Fall back to the full page if no specific container is found
+             text = main_content.get_text() if main_content else soup.get_text()
+
+             # Clean the text (split on double spaces, the usual BeautifulSoup recipe)
+             lines = (line.strip() for line in text.splitlines())
+             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+             text = '\n'.join(chunk for chunk in chunks if chunk)
+
+             # Record the post-processing length
+             processed_length = len(text)
+
+             # Log both lengths
+             logging.info(f"HTML length before processing: {original_length}, after: {processed_length}")
+
+             # If the processed text is too short, extraction likely failed
+             if processed_length < 100 and original_length > 1000:
+                 logging.warning(f"Extracted content is abnormally short: {processed_length} characters")
+                 return ""
+
+             return text
+
+         except Exception as e:
+             logging.error(f"Error extracting main content: {str(e)}")
+             return ""
+
+     def extract_metadata(self, html: str) -> dict:
+         """Extract metadata from HTML."""
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+             metadata = {
+                 'title': '',
+                 'description': '',
+                 'keywords': ''
+             }
+
+             # Safer title extraction
+             title = ''
+             if soup.title and soup.title.string:
+                 title = soup.title.string.strip()
+             else:
+                 # Try to extract the title from an h1 tag
+                 h1 = soup.find('h1')
+                 if h1:
+                     title = h1.get_text().strip()
+
+             # Fall back to a default if there is still no title
+             metadata['title'] = title if title else "Unknown title"
+
+             # Extract the meta description
+             meta_desc = soup.find('meta', attrs={'name': ['description', 'Description']})
+             if meta_desc:
+                 metadata['description'] = meta_desc.get('content', '').strip()
+
+             # Extract the meta keywords
+             meta_keywords = soup.find('meta', attrs={'name': ['keywords', 'Keywords']})
+             if meta_keywords:
+                 metadata['keywords'] = meta_keywords.get('content', '').strip()
+
+             # Make sure every field has a value
+             metadata = {k: v if v else 'unknown' for k, v in metadata.items()}
+
+             return metadata
+
+         except Exception as e:
+             logging.error(f"Error extracting metadata: {str(e)}")
+             return {
+                 'title': 'Unknown title',
+                 'description': 'Unknown description',
+                 'keywords': 'Unknown keywords'
+             }
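
Typical standalone usage of the processor (example.com stands in for a real page):

    from src.core.html_processor import HTMLProcessor

    proc = HTMLProcessor(timeout=5)
    html = proc.fetch_html("https://example.com")
    if html:
        text = proc.extract_main_content(html)  # "" when extraction looks broken
        meta = proc.extract_metadata(html)      # title / description / keywords
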
src/core/plan_generator.py ADDED
@@ -0,0 +1,120 @@
+ from typing import List, Dict
+ from src.api.llm_api import LLMInterface
+ import logging
+ import re
+
+ class PlanGenerator:
+     def __init__(self, llm: LLMInterface):
+         self.llm = llm
+
+     def generate_plan(self, query: str, context: List[Dict]) -> Dict:
+         """Generate a travel plan."""
+         # Make sure the query mentions Hong Kong
+         if "Hong Kong" not in query:
+             query = f"Hong Kong Tourism {query}"
+
+         logging.info(f"Generating plan for query: {query}")
+         logging.info(f"Number of reference documents: {len(context)}")
+
+         # Build the prompt
+         # NOTE: currently unused; generate_travel_plan() receives the raw
+         # query and context instead of this prompt
+         prompt = self._build_prompt(query, context)
+
+         # Generate the plan
+         plan = self.llm.generate_travel_plan(query, context)
+
+         # Record the sources
+         sources = []
+         for doc in context:
+             if doc.get('url'):
+                 sources.append({
+                     'url': doc['url'],
+                     'title': doc.get('title', 'Unknown Title'),
+                     'relevance_score': doc.get('relevance_score', 0)
+                 })
+
+         return {
+             'query': query,
+             'plan': plan,
+             'sources': sources
+         }
+
+     def _build_prompt(self, query: str, context: List[Dict]) -> str:
+         """Build the prompt."""
+         # Extract key information from the query
+         days = self._extract_days(query)
+
+         prompt = f"""Please create a detailed Hong Kong travel plan based on the following information.
+
+ User Needs: {query}
+
+ Reference Materials:
+ """
+         # Append the context
+         for i, doc in enumerate(context, 1):
+             prompt += f"\nSource {i}:\n{doc['passage']}\n"
+
+         prompt += f"""
+ Please provide a detailed itinerary for a {days}-day trip to Hong Kong, including the following content:
+
+ 1. Itinerary Overview:
+    - Overall itinerary arrangement
+    - Introduction to key attractions
+    - Suggested time allocation
+
+ 2. Daily Detailed Itinerary:
+    - Morning activities and attractions
+    - Afternoon activities and attractions
+    - Evening activities and attractions
+    - Specific time allocation
+    - Transportation suggestions
+
+ 3. Transportation Suggestions:
+    - Round-trip transportation plan
+    - In-city transportation suggestions
+    - Transit card purchase suggestions
+    - Useful transport app recommendations
+
+ 4. Accommodation Recommendations:
+    - Recommended areas
+    - Specific hotel suggestions
+    - Booking considerations
+
+ 5. Food Recommendations:
+    - Specialty restaurants
+    - Recommended restaurants
+    - Snack street recommendations
+
+ 6. Practical Tips:
+    - Weather advice
+    - Clothing advice
+    - Essential items
+    - Things to watch out for
+    - Spending budget
+
+ Please ensure:
+ 1. The itinerary is reasonable, accounting for distances between attractions and visiting time
+ 2. Specific time allocations are provided
+ 3. Practical local tips are included
+ 4. Real-world constraints are considered (e.g., travel time, attraction opening hours)
+ 5. Detailed transportation guidance is provided
+
+ Please return the travel plan content directly, without any other explanation."""
+
+         return prompt
+
+     def _extract_days(self, query: str) -> int:
+         """Extract the number of days from the query."""
+         # Match common ways of expressing a day count
+         patterns = [
+             r'(\d+)\s*[天日]',
+             r'(\d+)\s*-*\s*days?',
+             r'(\d+)\s*-*\s*d'
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, query.lower())
+             if match:
+                 return int(match.group(1))
+
+         # Default to 3 days
+         return 3
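
The day-count extraction accepts both Chinese and English phrasing and falls back to 3. A few illustrative calls (an `llm` implementing `generate_travel_plan()` is assumed for the constructor):

    from src.core.plan_generator import PlanGenerator

    pg = PlanGenerator(llm)
    pg._extract_days("香港3天游")       # -> 3, matches "3天"
    pg._extract_days("a 5-day trip")    # -> 5, matches "5-day"
    pg._extract_days("weekend trip")    # -> 3, the default
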
src/core/ranking.py ADDED
@@ -0,0 +1,114 @@
+ import faiss
+ import numpy as np
+ from typing import List, Dict
+ from .embeddings import EmbeddingModel
+ from .reranker import Reranker
+ import logging
+
+ class RankingSystem:
+     def __init__(self,
+                  embedding_model: EmbeddingModel = None,
+                  reranker: Reranker = None):
+         self.embedding_model = embedding_model or EmbeddingModel()
+         self.reranker = reranker or Reranker()
+         self.index = None
+         self.passages = None
+         self.embedding_cache = {}
+
+     def build_index(self, passages: List[Dict]):
+         """Build the FAISS index."""
+         self.passages = passages
+         texts = [p['passage'] for p in passages]
+
+         if not texts:
+             logging.warning("No texts to encode")
+             return
+
+         embeddings = self.embedding_model.encode(texts)
+
+         if embeddings is None or not hasattr(embeddings, 'shape'):
+             logging.error("Encoding result is empty or malformed")
+             return
+
+         dimension = embeddings.shape[1]
+         self.index = faiss.IndexFlatIP(dimension)
+         self.index.add(embeddings.astype('float32'))
+
+     def initial_ranking(self, query: str, passages: List[Dict], initial_top_k: int = 10) -> List[Dict]:
+         """Rank documents by embedding similarity and return the top K."""
+         if not passages:
+             return []
+
+         # Make sure the input has the expected shape
+         if not isinstance(passages[0], dict):
+             passages = [{'passage': p} for p in passages]
+
+         # Reuse cached embeddings
+         texts = [p['passage'] for p in passages]
+         embeddings = []
+
+         for text in texts:
+             if text in self.embedding_cache:
+                 embeddings.append(self.embedding_cache[text])
+             else:
+                 embedding = self.embedding_model.encode([text])[0]
+                 self.embedding_cache[text] = embedding
+                 embeddings.append(embedding)
+
+         embeddings = np.array(embeddings)
+
+         # Compute similarities in one batch; the embeddings are normalized,
+         # so the dot product is cosine similarity
+         query_embedding = self.embedding_model.encode([query])[0]
+         similarities = np.dot(embeddings, query_embedding)
+
+         # Sort and take the top K
+         indices = np.argsort(similarities)[::-1][:initial_top_k]
+
+         ranked_passages = []
+         for idx in indices:
+             passage = passages[idx].copy()
+             passage['retrieval_score'] = float(similarities[idx])
+             ranked_passages.append(passage)
+
+         return ranked_passages
+
+     def rerank(self, query: str, initial_ranked: List[Dict], final_top_k: int = 3) -> List[Dict]:
+         """Rerank with the cross-encoder reranker."""
+         reranked = self.reranker.rerank(query, initial_ranked)
+
+         # Compute the final score (weights favour the reranker); defaults
+         # guard against the reranker's fallback path, which returns
+         # passages without a rerank_score
+         for passage in reranked:
+             passage['final_score'] = (
+                 0.3 * passage.get('retrieval_score', 0.0) +
+                 0.7 * passage.get('rerank_score', 0.0)
+             )
+
+         # Sort by final score
+         final_ranked = sorted(
+             reranked,
+             key=lambda x: x['final_score'],
+             reverse=True
+         )
+
+         return final_ranked[:final_top_k]
+
+     def retrieve(self, query: str, passages: List[Dict]) -> List[Dict]:
+         """
+         Retrieve and rank documents.
+
+         Args:
+             query: Query string
+             passages: Candidate documents
+
+         Returns:
+             List[Dict]: Ranked documents
+         """
+         # 1. Initial ranking
+         initial_results = self.initial_ranking(query, passages, initial_top_k=10)
+
+         # 2. Reranking
+         final_results = self.rerank(query, initial_results, final_top_k=3)
+
+         return final_results
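
End-to-end, `retrieve()` chains the two stages. A minimal sketch (the passages are illustrative):

    from src.core.embeddings import EmbeddingModel
    from src.core.reranker import Reranker
    from src.core.ranking import RankingSystem

    ranking = RankingSystem(EmbeddingModel(), Reranker())
    passages = [{"passage": "Ocean Park has a kids' zone ..."},
                {"passage": "The Peak Tram runs until late evening ..."}]
    top = ranking.retrieve("things to do with kids", passages)
    # each result carries retrieval_score, rerank_score and
    # final_score = 0.3 * retrieval_score + 0.7 * rerank_score
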
src/core/reranker.py ADDED
@@ -0,0 +1,44 @@
+ from typing import List, Dict
+ from FlagEmbedding import FlagReranker
+ import logging
+ import torch
+
+ class Reranker:
+     def __init__(self, model_path="BAAI/bge-reranker-large"):
+         try:
+             self.model = FlagReranker(
+                 model_path,
+                 use_fp16=True,
+                 device="cuda" if torch.cuda.is_available() else "cpu"
+             )
+             logging.info(f"Loaded reranker model {model_path} on {'cuda' if torch.cuda.is_available() else 'cpu'}")
+         except Exception as e:
+             logging.error(f"Failed to load the reranker model: {str(e)}")
+             raise
+
+     def rerank(self, query: str, passages: List[Dict]) -> List[Dict]:
+         """
+         Rerank the documents.
+         """
+         try:
+             # Collect the texts
+             texts = [p['passage'] for p in passages]
+
+             # Score (query, text) pairs with the cross-encoder
+             scores = self.model.compute_score([[query, text] for text in texts])
+
+             # Attach the scores to the original dicts
+             for passage, score in zip(passages, scores):
+                 passage['rerank_score'] = float(score)
+
+             # Sort by rerank score
+             reranked = sorted(passages, key=lambda x: x['rerank_score'], reverse=True)
+
+             return reranked
+
+         except Exception as e:
+             logging.error(f"Error during reranking: {str(e)}")
+             # Fall back to the original order if reranking fails
+             return passages
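
A small standalone call, showing the score field the reranker adds (passages are illustrative):

    from src.core.reranker import Reranker

    rr = Reranker("BAAI/bge-reranker-large")
    ranked = rr.rerank("cheap eats in Mong Kok", [
        {"passage": "Street food stalls line the Mong Kok markets ..."},
        {"passage": "Luxury hotels cluster on Hong Kong Island ..."},
    ])
    print(ranked[0]["rerank_score"])  # highest score first
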
src/retrieval/__pycache__/base.cpython-310.pyc ADDED
Binary file (842 Bytes).

src/retrieval/__pycache__/graph_rag.cpython-310.pyc ADDED
Binary file (2.48 kB).

src/retrieval/__pycache__/memo_rag.cpython-310.pyc ADDED
Binary file (2.62 kB).
src/retrieval/base.py ADDED
@@ -0,0 +1,15 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Dict
+
+ class BaseRetriever(ABC):
+     """Base class for retrieval strategies."""
+
+     @abstractmethod
+     def retrieve(self, query: str, context: List[Dict]) -> List[Dict]:
+         """Run retrieval."""
+         pass
+
+     @abstractmethod
+     def init_retriever(self, config: Dict):
+         """Initialize the retriever."""
+         pass
src/retrieval/graph_rag.py ADDED
@@ -0,0 +1,56 @@
+ from .base import BaseRetriever
+ from typing import Dict, List
+ import networkx as nx
+ # import spacy  # temporarily disabled
+
+ class GraphRAG(BaseRetriever):
+     def __init__(self, config: Dict):
+         self.config = config
+         self.graph = nx.Graph()
+         # self.nlp = spacy.load("zh_core_web_sm")  # temporarily disabled
+         self.init_retriever(config)
+
+     def init_retriever(self, config: Dict):
+         self.working_dir = config['retrieval_settings']['methods'][2]['model_settings']['working_dir']
+         self.graph_file = f"{self.working_dir}/graph.graphml"
+
+     def retrieve(self, query: str, context: List[Dict]) -> List[Dict]:
+         # Simple implementation: keyword-match based retrieval
+         scored_docs = []
+         for doc in context:
+             # Score = number of query tokens that appear in the document
+             score = sum(1 for word in query.split() if word in doc['passage'])
+             doc_copy = doc.copy()
+             doc_copy['graph_score'] = float(score)
+             scored_docs.append(doc_copy)
+
+         return sorted(scored_docs, key=lambda x: x['graph_score'], reverse=True)
+
+     def _build_graph(self, context: List[Dict]):
+         """Simplified graph construction."""
+         # Uses only simple co-occurrence counts
+         for doc in context:
+             text = doc['passage']
+             words = text.split()
+             # Add edges between adjacent words
+             for i in range(len(words) - 1):
+                 w1, w2 = words[i], words[i + 1]
+                 if not self.graph.has_edge(w1, w2):
+                     self.graph.add_edge(w1, w2, weight=1)
+                 else:
+                     self.graph[w1][w2]['weight'] += 1
+
+     def _calculate_graph_score(self, query_words: List[str], doc: Dict) -> float:
+         """Simplified graph score computation."""
+         score = 0.0
+         doc_words = doc['passage'].split()
+
+         for q_word in query_words:
+             for d_word in doc_words:
+                 if self.graph.has_edge(q_word, d_word):
+                     score += self.graph[q_word][d_word]['weight']
+
+         return score if score > 0 else 0.0
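
Since this simplified retriever only needs `working_dir` from the config, a minimal stub is enough to try it; the stub below mirrors the nesting `init_retriever` expects:

    from src.retrieval.graph_rag import GraphRAG

    config = {"retrieval_settings": {"methods": [
        None, None, {"model_settings": {"working_dir": "/tmp/graph_rag"}}]}}
    rag = GraphRAG(config)
    docs = rag.retrieve("Victoria Peak tram",
                        [{"passage": "The Peak Tram climbs to Victoria Peak"}])
    # graph_score simply counts query tokens found in each passage
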
src/retrieval/memo_rag.py ADDED
@@ -0,0 +1,76 @@
+ from .base import BaseRetriever
+ from typing import Dict, List
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from sentence_transformers import SentenceTransformer
+ import logging
+ import os
+
+ class MemoRAG(BaseRetriever):
+     def __init__(self, config: Dict):
+         self.config = config
+         self.init_retriever(config)
+
+     def init_retriever(self, config: Dict):
+         memo_config = config['retrieval_settings']['methods'][1]['model_settings']
+
+         try:
+             # Use the locally quantized model
+             local_model_path = "/root/.cache/modelscope/hub/MaxLeton13/chatglm3-6B-32k-int4"
+
+             logging.info(f"Loading local model: {local_model_path}")
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 local_model_path,
+                 device_map="auto",
+                 trust_remote_code=True
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 local_model_path,
+                 trust_remote_code=True
+             )
+
+             # Initialize the embedding retrieval model
+             logging.info(f"Loading embedding retrieval model: {memo_config['ret_model']}")
+             self.embedding_model = SentenceTransformer(
+                 memo_config['ret_model'],
+                 device="cuda" if torch.cuda.is_available() else "cpu"
+             )
+
+             # Set up the cache directory
+             self.cache_dir = memo_config['cache_dir']
+             os.makedirs(self.cache_dir, exist_ok=True)
+
+         except Exception as e:
+             logging.error(f"Failed to initialize MemoRAG: {str(e)}")
+             raise
+
+     def retrieve(self, query: str, context: List[Dict]) -> List[Dict]:
+         try:
+             # Coarse filtering with vector retrieval
+             query_embedding = self.embedding_model.encode(query)
+
+             # Compute document embeddings
+             docs_text = [doc['passage'] for doc in context]
+             docs_embeddings = self.embedding_model.encode(docs_text)
+
+             # Compute similarities
+             similarities = torch.nn.functional.cosine_similarity(
+                 torch.tensor(query_embedding).unsqueeze(0),
+                 torch.tensor(docs_embeddings),
+                 dim=1
+             )
+
+             # Attach a score to each document
+             scored_docs = []
+             for doc, score in zip(context, similarities):
+                 doc_copy = doc.copy()
+                 doc_copy['memory_score'] = float(score)
+                 scored_docs.append(doc_copy)
+
+             # Sort by score
+             return sorted(scored_docs, key=lambda x: x['memory_score'], reverse=True)
+
+         except Exception as e:
+             logging.error(f"MemoRAG retrieval failed: {str(e)}")
+             # Fall back to the original document list on failure
+             return context
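
Note that initialization depends on a hard-coded local ChatGLM3 checkpoint and on `config['retrieval_settings']['methods'][1]['model_settings']` supplying `ret_model` and `cache_dir`. Given those, retrieval itself is plain cosine scoring:

    memo = MemoRAG(config)  # raises if the local model path is missing
    ranked = memo.retrieve("night markets", passages)
    # each doc gains a 'memory_score'; on failure the original list is returned
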
src/utils/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # setup_proxy is exported as well, since src/api/routes.py uses it
+ from .helpers import load_config, setup_logging, setup_proxy
+
+ __all__ = [
+     'load_config',
+     'setup_logging',
+     'setup_proxy',
+ ]
src/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (246 Bytes).

src/utils/__pycache__/helpers.cpython-310.pyc ADDED
Binary file (3 kB).
src/utils/helpers.py ADDED
@@ -0,0 +1,104 @@
+ import logging
+ import yaml
+ from pathlib import Path
+ from typing import Dict, Any, Tuple
+
+ def load_config(config_path: str = "/root/travel_rag/config/config.yaml") -> Dict[str, Any]:
+     """
+     Load configuration from a config file.
+
+     Args:
+         config_path: Config file path, defaults to "/root/travel_rag/config/config.yaml"
+
+     Returns:
+         Configuration dictionary
+     """
+     config_path = Path(config_path)
+     if not config_path.exists():
+         raise FileNotFoundError(f"Config file not found: {config_path}")
+
+     with open(config_path, 'r', encoding='utf-8') as f:
+         return yaml.safe_load(f)
+
+ def setup_logging(
+     log_level: str = "INFO",
+     log_file: str = None
+ ) -> None:
+     """
+     Configure logging.
+
+     Args:
+         log_level: Log level, defaults to "INFO"
+         log_file: Log file path, defaults to None (console output only)
+     """
+     # Log format
+     log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+     # Configure the root logger
+     logging.basicConfig(
+         level=getattr(logging, log_level.upper()),
+         format=log_format
+     )
+
+     # If a log file is given, add a file handler
+     if log_file:
+         file_handler = logging.FileHandler(log_file)
+         file_handler.setFormatter(logging.Formatter(log_format))
+         logging.getLogger().addHandler(file_handler)
+
+ def setup_proxy(proxy_config_path: str = "/root/clash/config.yaml") -> Tuple[Dict[str, str], bool]:
+     """
+     Configure the system proxy and return the proxy settings and availability.
+
+     Args:
+         proxy_config_path: Proxy config file path (must be a string)
+
+     Returns:
+         Tuple[Dict[str, str], bool]: (proxy settings dict, whether the proxy is usable)
+     """
+     import os
+     import requests
+     from requests.exceptions import RequestException
+
+     logger = logging.getLogger(__name__)
+
+     # Default proxy address
+     proxy_url = 'http://127.0.0.1:8880'
+
+     # If a config file exists, read the proxy address from it
+     if os.path.exists(proxy_config_path):
+         try:
+             config = load_config(proxy_config_path)
+             # Adjust to the actual config file structure
+             proxy_url = config.get('proxy_url', proxy_url)
+             logger.info(f"Loaded proxy settings from config file: {proxy_url}")
+         except Exception as e:
+             logger.warning(f"Failed to load proxy config: {e}, using defaults")
+
+     # Set environment variables
+     os.environ['HTTP_PROXY'] = proxy_url
+     os.environ['HTTPS_PROXY'] = proxy_url
+
+     proxies = {
+         'http': proxy_url,
+         'https': proxy_url
+     }
+
+     # Test whether the proxy works
+     try:
+         response = requests.get('https://www.google.com',
+                                 proxies=proxies,
+                                 timeout=5,
+                                 verify=False)  # verify=False avoids certificate issues
+         proxy_available = response.status_code == 200
+         if proxy_available:
+             logger.info("Proxy server is available")
+         else:
+             logger.warning(f"Proxy server returned an unexpected status code: {response.status_code}")
+     except RequestException as e:
+         logger.warning(f"Failed to connect to the proxy server: {e}")
+         proxy_available = False
+
+     logger.info(f"Proxy setup complete: {proxies}, available: {proxy_available}")
+     return proxies, proxy_available
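
How the helpers fit together at startup (paths shown are the module defaults; the log file name is an assumption):

    from src.utils.helpers import load_config, setup_logging, setup_proxy

    setup_logging(log_level="INFO", log_file="travel_rag.log")
    config = load_config()       # /root/travel_rag/config/config.yaml
    proxies, ok = setup_proxy()  # probes https://www.google.com via the proxy
    engine = "google" if ok else "bocha"  # mirrors the choice made in routes.init_app
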
src/utils/neo4j_helper.py ADDED
@@ -0,0 +1,82 @@
+ from neo4j import GraphDatabase
+ from typing import List, Dict, Any
+ import logging
+
+ class Neo4jConnection:
+     def __init__(self, uri: str = "bolt://localhost:7687",
+                  user: str = "neo4j",
+                  password: str = "your_password"):
+         """Initialize the Neo4j connection."""
+         try:
+             self.driver = GraphDatabase.driver(uri, auth=(user, password))
+             logging.info("Neo4j connection established")
+         except Exception as e:
+             logging.error(f"Neo4j connection failed: {str(e)}")
+             raise
+
+     def close(self):
+         """Close the connection."""
+         if self.driver:
+             self.driver.close()
+
+     def run_query(self, query: str, parameters: Dict[str, Any] = None) -> List[Dict]:
+         """Run a Cypher query."""
+         try:
+             with self.driver.session() as session:
+                 result = session.run(query, parameters or {})
+                 return [record.data() for record in result]
+         except Exception as e:
+             logging.error(f"Query execution failed: {str(e)}")
+             raise
+
+     def create_node(self, label: str, properties: Dict[str, Any]) -> Dict:
+         """Create a node."""
+         query = f"""
+         CREATE (n:{label} $properties)
+         RETURN n
+         """
+         return self.run_query(query, {"properties": properties})
+
+     def create_relationship(self, start_node_label: str, start_node_props: Dict,
+                             end_node_label: str, end_node_props: Dict,
+                             relationship_type: str, relationship_props: Dict = None) -> Dict:
+         """Create a relationship."""
+         query = f"""
+         MATCH (a:{start_node_label}), (b:{end_node_label})
+         WHERE a.id = $start_props.id AND b.id = $end_props.id
+         CREATE (a)-[r:{relationship_type} $rel_props]->(b)
+         RETURN a, r, b
+         """
+         params = {
+             "start_props": start_node_props,
+             "end_props": end_node_props,
+             "rel_props": relationship_props or {}
+         }
+         return self.run_query(query, params)
+
+     def get_node(self, label: str, properties: Dict[str, Any]) -> Dict:
+         """Fetch a node."""
+         query = f"""
+         MATCH (n:{label})
+         WHERE n.id = $properties.id
+         RETURN n
+         """
+         return self.run_query(query, {"properties": properties})
+
+ # Usage example:
+ if __name__ == "__main__":
+     # Create the connection
+     neo4j = Neo4jConnection()
+
+     try:
+         # Create a sample node
+         node = neo4j.create_node("Place", {
+             "id": "1",
+             "name": "Hong Kong Disneyland",
+             "type": "attraction"
+         })
+         print("Node created:", node)
+
+     finally:
+         # Close the connection
+         neo4j.close()
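
Beyond the helpers above, arbitrary Cypher can go through `run_query`. A hedged sketch against a local instance (credentials are placeholders):

    conn = Neo4jConnection(uri="bolt://localhost:7687",
                           user="neo4j", password="your_password")
    try:
        rows = conn.run_query(
            "MATCH (p:Place) WHERE p.type = $type RETURN p.name AS name",
            {"type": "attraction"},
        )
    finally:
        conn.close()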