Spaces:

peterpanbk95
/

viet-speech-app

Running

App Files Files Community

peterpanbk95 committed on Apr 13

Commit

122fe8e

verified ·

1 Parent(s): e88cbc5

Update app.py

Browse files

Files changed (1) hide show

app.py +1234 -240

app.py CHANGED Viewed

@@ -6,25 +6,27 @@ import threading
 import json
 import base64
 import io
 import random
 import logging
 from queue import Queue
 from threading import Thread
 import gradio as gr
 import torch
-import librosa
 import soundfile as sf
 import requests
-import numpy as np
-from scipy import signal
 from transformers import pipeline, AutoTokenizer, AutoModel
-# Thiết lập logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Tạo các thư mục cần thiết
 os.makedirs("data", exist_ok=True)
 os.makedirs("data/audio", exist_ok=True)
 os.makedirs("data/reports", exist_ok=True)
@@ -32,7 +34,8 @@ os.makedirs("data/models", exist_ok=True)
 class AsyncProcessor:
-    """Xử lý các tác vụ nặng trong thread riêng để không làm 'đơ' giao diện."""
     def __init__(self):
         self.task_queue = Queue()
         self.result_queue = Queue()
@@ -70,314 +73,1305 @@ class AsyncProcessor:
 class VietSpeechTrainer:
     def __init__(self):
-        # Đọc cấu hình từ file config.json và từ biến môi trường
         self.config = self._load_config()
         # Khởi tạo bộ xử lý bất đồng bộ
         self.async_processor = AsyncProcessor()
-        # Lưu trữ lịch sử phiên làm việc
         self.session_history = []
         self.current_session_id = int(time.time())
-        # Các biến trạng thái hội thoại
         self.current_scenario = None
         self.current_prompt_index = 0
-        # Khởi tạo các mô hình (STT, TTS và phân tích LLM)
         logger.info("Đang tải các mô hình...")
         self._initialize_models()
     def _load_config(self):
-        """Đọc file config.json và cập nhật từ biến môi trường (Secrets khi deploy)"""
         config = {
-            "stt_model": "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
-            "use_phowhisper": False,
-            "use_phobert": False,
-            "use_vncorenlp": False,
-            "llm_provider": "none",  # openai, gemini, local hoặc none
-            "openai_api_key": "",
-            "gemini_api_key": "",
-            "local_llm_endpoint": "",
-            "use_viettts": False,
-            "default_dialect": "Bắc",
-            "enable_pronunciation_eval": False,
-            "preprocess_audio": True,
-            "save_history": True,
-            "enable_english_tts": False
         }
         if os.path.exists("config.json"):
             try:
                 with open("config.json", "r", encoding="utf-8") as f:
                     file_config = json.load(f)
                     config.update(file_config)
             except Exception as e:
-                logger.error(f"Lỗi đọc config.json: {e}")
-        # Cập nhật từ biến môi trường
-        if os.environ.get("LLM_PROVIDER"):
-            config["llm_provider"] = os.environ.get("LLM_PROVIDER").lower()
-        if os.environ.get("OPENAI_API_KEY"):
-            config["openai_api_key"] = os.environ.get("OPENAI_API_KEY")
-        if os.environ.get("GEMINI_API_KEY"):
-            config["gemini_api_key"] = os.environ.get("GEMINI_API_KEY")
-        if os.environ.get("LOCAL_LLM_ENDPOINT"):
-            config["local_llm_endpoint"] = os.environ.get("LOCAL_LLM_ENDPOINT")
-        if os.environ.get("ENABLE_ENGLISH_TTS") and os.environ.get("ENABLE_ENGLISH_TTS").lower() == "true":
-            config["enable_english_tts"] = True
         return config
     def _initialize_models(self):
-        """Khởi tạo mô hình STT và thiết lập CSM cho TTS tiếng Anh nếu được bật."""
         try:
-            # Khởi tạo STT
             if self.config["use_phowhisper"]:
-                logger.info("Loading PhoWhisper...")
-                self.stt_model = pipeline("automatic-speech-recognition",
-                                          model="vinai/PhoWhisper-small",
-                                          device=0 if torch.cuda.is_available() else -1)
             else:
-                logger.info(f"Loading STT model: {self.config['stt_model']}")
-                self.stt_model = pipeline("automatic-speech-recognition",
-                                          model=self.config["stt_model"],
-                                          device=0 if torch.cuda.is_available() else -1)
         except Exception as e:
-            logger.error(f"Lỗi khởi tạo STT: {e}")
-            self.stt_model = None
-        # Các mô hình NLP (PhoBERT, VnCoreNLP) nếu cần.
-        # ...
-        # Nếu bật TTS tiếng Anh thì thiết lập CSM
-        if self.config.get("enable_english_tts", False):
-            self._setup_csm()
-        else:
-            self.csm_ready = False
-    def _setup_csm(self):
-        """Cài đặt mô hình CSM (Conversational Speech Generation Model) cho TTS tiếng Anh."""
         try:
-            csm_dir = os.path.join(os.getcwd(), "csm")
-            if not os.path.exists(csm_dir):
-                logger.info("Cloning CSM repo...")
-                subprocess.run(["git", "clone", "https://github.com/SesameAILabs/csm", csm_dir], check=True)
-            logger.info("Installing CSM requirements...")
-            subprocess.run(["pip", "install", "-r", os.path.join(csm_dir, "requirements.txt")], check=True)
-            self.csm_ready = True
-            logger.info("CSM đã được thiết lập thành công!")
         except Exception as e:
-            logger.error(f"Failed to set up CSM: {e}")
-            self.csm_ready = False
-    def text_to_speech(self, text, language="vi", dialect="Bắc"):
-        """
-        Chuyển văn bản thành giọng nói:
-         - Nếu language == "en": sử dụng CSM để tạo TTS tiếng Anh.
-         - Nếu language == "vi": sử dụng API hoặc logic TTS tiếng Việt.
-        """
-        if language == "en":
-            if not self.csm_ready:
-                logger.error("CSM chưa được thiết lập hoặc không được bật.")
-                return None
-            output_file = f"data/audio/csm_{int(time.time())}.wav"
-            csm_script_path = os.path.join(os.getcwd(), "csm", "run_csm.py")
-            cmd = [
-                "python",
-                csm_script_path,
-                "--text", text,
-                "--speaker_id", "0",  # Mặc định, có thể cho phép người dùng chọn
-                "--output", output_file
-            ]
-            try:
-                subprocess.run(cmd, check=True)
-                return output_file
-            except subprocess.CalledProcessError as e:
-                logger.error(f"CSM generation failed: {e}")
-                return None
-        else:
-            # Ví dụ: Nếu có API TTS tiếng Việt, gọi API đó.
-            tts_api_url = self.config.get("tts_api_url", "")
-            if tts_api_url:
-                try:
-                    resp = requests.post(tts_api_url, json={"text": text, "dialect": dialect.lower()})
-                    if resp.status_code == 200:
-                        output_file = f"data/audio/tts_{int(time.time())}.wav"
-                        with open(output_file, "wb") as f:
-                            f.write(resp.content)
-                        return output_file
-                    else:
-                        logger.error(f"Error calling TTS API: {resp.text}")
-                        return None
-                except Exception as e:
-                    logger.error(f"Lỗi gọi TTS API: {e}")
-                    return None
-            else:
-                # Nếu không có API TTS, bạn có thể tích hợp VietTTS hoặc khác.
-                return None
     def transcribe_audio(self, audio_path):
-        """Chuyển đổi giọng nói thành văn bản (STT)."""
-        if not self.stt_model:
-            return "STT model not available."
         try:
             result = self.stt_model(audio_path)
             if isinstance(result, dict) and "text" in result:
-                return result["text"]
             elif isinstance(result, list):
-                return " ".join([chunk.get("text", "") for chunk in result])
             else:
-                return str(result)
         except Exception as e:
-            logger.error(f"Lỗi chuyển giọng nói: {e}")
             return f"Lỗi: {str(e)}"
     def analyze_text(self, transcript, dialect="Bắc"):
-        """
-        Phân tích văn bản sử dụng LLM:
-         - Nếu LLM_PROVIDER là "openai", "gemini" hay "local" thì gọi API tương ứng.
-         - Nếu LLM_PROVIDER là "none", sử dụng phân tích rule-based.
-        """
         llm_provider = self.config["llm_provider"]
         if llm_provider == "openai" and self.config["openai_api_key"]:
-            return self._analyze_with_openai(transcript)
         elif llm_provider == "gemini" and self.config["gemini_api_key"]:
-            return self._analyze_with_gemini(transcript)
         elif llm_provider == "local" and self.config["local_llm_endpoint"]:
-            return self._analyze_with_local_llm(transcript)
         else:
-            return self._rule_based_analysis(transcript, dialect)
-    def _analyze_with_openai(self, transcript):
-        headers = {
-            "Authorization": f"Bearer {self.config['openai_api_key']}",
-            "Content-Type": "application/json"
-        }
-        data = {
-            "model": "gpt-3.5-turbo",
-            "messages": [
-                {"role": "system", "content": "Bạn là trợ lý dạy tiếng Việt."},
-                {"role": "user", "content": transcript}
-            ],
-            "temperature": 0.5,
-            "max_tokens": 150
-        }
         try:
-            response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=data)
             if response.status_code == 200:
                 result = response.json()
-                return result["choices"][0]["message"]["content"]
             else:
-                return "Lỗi khi gọi OpenAI API."
         except Exception as e:
-            logger.error(f"Lỗi OpenAI: {e}")
-            return "Lỗi phân tích với OpenAI."
-    def _analyze_with_gemini(self, transcript):
-        # Ví dụ minh họa: Gọi Gemini API (chi tiết phụ thuộc vào tài liệu của Gemini)
-        return "Gemini analysis..."
-    def _analyze_with_local_llm(self, transcript):
-        # Giả sử gọi một endpoint local (nếu có) cho LLM cục bộ.
-        headers = {"Content-Type": "application/json"}
-        data = {
-            "model": "local-model",
-            "messages": [
-                {"role": "system", "content": "Bạn là trợ lý dạy tiếng Việt."},
-                {"role": "user", "content": transcript}
-            ],
-            "temperature": 0.5,
-            "max_tokens": 150
-        }
         try:
-            response = requests.post(self.config["local_llm_endpoint"] + "/chat/completions", headers=headers, json=data)
             if response.status_code == 200:
                 result = response.json()
-                return result["choices"][0]["message"]["content"]
             else:
-                return "Lỗi khi gọi Local LLM."
         except Exception as e:
-            logger.error(f"Lỗi local LLM: {e}")
-            return "Lỗi phân tích với LLM local."
-    def _rule_based_analysis(self, transcript, dialect):
-        # Phân tích đơn giản không dùng LLM
-        return "Phân tích rule-based: " + transcript
     def clean_up(self):
-        self.async_processor.stop()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        logger.info("Clean up done.")
 def create_demo():
-    trainer = VietSpeechTrainer()
-    with gr.Blocks(title="Ứng dụng Luyện Nói & TTS", theme=gr.themes.Soft(primary_hue="blue")) as demo:
-        gr.Markdown("## Ứng dụng Luyện Nói & TTS (Tiếng Việt & Tiếng Anh)")
-        with gr.Tabs():
-            # Tab 1: TTS Tiếng Việt
-            with gr.Tab("TTS Tiếng Việt"):
-                vi_text_input = gr.Textbox(label="Nhập văn bản tiếng Việt")
-                vi_audio_output = gr.Audio(label="Kết quả âm thanh")
-                gen_vi_btn = gr.Button("Chuyển thành giọng nói")
-                def gen_vi_tts(txt):
-                    return trainer.text_to_speech(txt, language="vi", dialect=trainer.config["default_dialect"])
-                gen_vi_btn.click(fn=gen_vi_tts, inputs=vi_text_input, outputs=vi_audio_output)
-            # Tab 2: TTS Tiếng Anh (sử dụng CSM)
-            with gr.Tab("TTS Tiếng Anh"):
-                en_text_input = gr.Textbox(label="Enter English text")
-                en_audio_output = gr.Audio(label="Generated English Audio (CSM)")
-                gen_en_btn = gr.Button("Generate English Speech")
-                def gen_en_tts(txt):
-                    return trainer.text_to_speech(txt, language="en")
-                gen_en_btn.click(fn=gen_en_tts, inputs=en_text_input, outputs=en_audio_output)
-            # Tab 3: Luyện phát âm (Tiếng Việt)
-            with gr.Tab("Luyện phát âm"):
-                audio_input = gr.Audio("microphone", type="filepath", label="Giọng nói của bạn")
-                transcript_output = gr.Textbox(label="Transcript")
-                analysis_output = gr.Markdown(label="Phân tích")
-                analyze_btn = gr.Button("Phân tích")
-                def process_audio(audio_path):
-                    transcript = trainer.transcribe_audio(audio_path)
-                    analysis = trainer.analyze_text(transcript, dialect=trainer.config["default_dialect"])
-                    return transcript, analysis
-                analyze_btn.click(fn=process_audio, inputs=audio_input, outputs=[transcript_output, analysis_output])
-            # Tab 4: Thông tin & Hướng dẫn
-            with gr.Tab("Thông tin"):
-                gr.Markdown("""
-                ### Hướng dẫn sử dụng:
-                - **TTS Tiếng Việt:** Nhập văn bản tiếng Việt và nhấn "Chuyển thành giọng nói".
-                - **TTS Tiếng Anh (CSM):** Nhập English text và nhấn "Generate English Speech".
-                - **Luyện phát âm:** Thu âm giọng nói, sau đó nhấn "Phân tích" để xem transcript và phân tích.
-                ### Cấu hình LLM:
-                - **OpenAI:** Đặt biến môi trường `LLM_PROVIDER=openai` và `OPENAI_API_KEY` với key của bạn.
-                - **Gemini:** Đặt `LLM_PROVIDER=gemini` và `GEMINI_API_KEY`.
-                - **Local LLM:** Đặt `LLM_PROVIDER=local` và `LOCAL_LLM_ENDPOINT` với URL của server LLM nếu bạn có.
-                - **None:** Đặt `LLM_PROVIDER=none` để sử dụng phân tích rule-based.
-                ### Lưu ý:
-                - Để sử dụng TTS tiếng Anh (CSM), hãy bật biến `ENABLE_ENGLISH_TTS` (hoặc đặt `"enable_english_tts": true` trong config.json).
-                """)
-    return demo
 def main():
-    demo = create_demo()
-    # Sử dụng hàng đợi Gradio để xử lý tác vụ dài (ví dụ TTS CSM)
-    demo.queue()
-    demo.launch(server_name="0.0.0.0", server_port=7860)
 if __name__ == "__main__":
     main()

 import json
 import base64
 import io
+import shutil
 import random
 import logging
 from queue import Queue
 from threading import Thread
+import numpy as np
+import matplotlib.pyplot as plt
 import gradio as gr
 import torch
 import soundfile as sf
+import librosa
 import requests
 from transformers import pipeline, AutoTokenizer, AutoModel
+from scipy import signal
+# Cấu hình logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Kiểm tra và tạo thư mục cho dữ liệu
 os.makedirs("data", exist_ok=True)
 os.makedirs("data/audio", exist_ok=True)
 os.makedirs("data/reports", exist_ok=True)
 class AsyncProcessor:
+    """Xử lý các tác vụ nặng trong thread riêng để không làm đơ giao diện"""
     def __init__(self):
         self.task_queue = Queue()
         self.result_queue = Queue()
 class VietSpeechTrainer:
     def __init__(self):
         """Create the trainer: load config, start the worker, reset state, load models.

         The steps run in this order: configuration, asynchronous processor,
         session bookkeeping, conversation state, then model initialisation.
         """
         # Configuration comes first; model initialisation below reads it.
         self.config = self._load_config()

         # Asynchronous processor used for heavy tasks.
         self.async_processor = AsyncProcessor()

         # Session history; the session id is the creation time in whole seconds.
         self.session_history = []
         self.current_session_id = int(time.time())

         # Conversation state: no scenario selected yet, first prompt.
         self.current_scenario, self.current_prompt_index = None, 0

         # Load the models.
         logger.info("Đang tải các mô hình...")
         self._initialize_models()
     def _load_config(self):
+        """Tải cấu hình từ file hoặc biến môi trường"""
         config = {
+            # STT config
+            "stt_model": os.environ.get("STT_MODEL", "nguyenvulebinh/wav2vec2-base-vietnamese-250h"),
+            "use_phowhisper": os.environ.get("USE_PHOWHISPER", "false").lower() == "true",
+            # NLP config
+            "use_phobert": os.environ.get("USE_PHOBERT", "false").lower() == "true",
+            "use_vncorenlp": os.environ.get("USE_VNCORENLP", "false").lower() == "true",
+            # LLM config
+            "llm_provider": os.environ.get("LLM_PROVIDER", "none"),  # "openai", "gemini", "local", "none"
+            "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
+            "gemini_api_key": os.environ.get("GEMINI_API_KEY", ""),
+            "local_llm_endpoint": os.environ.get("LOCAL_LLM_ENDPOINT", "http://localhost:8080/v1"),
+            # TTS config
+            "use_viettts": os.environ.get("USE_VIETTTS", "false").lower() == "true",
+            "tts_api_url": os.environ.get("TTS_API_URL", ""),
+            # Application settings
+            "default_dialect": os.environ.get("DEFAULT_DIALECT", "Bắc"),
+            "enable_pronunciation_eval": os.environ.get("ENABLE_PRONUNCIATION_EVAL", "false").lower() == "true",
+            # Advanced settings
+            "preprocess_audio": os.environ.get("PREPROCESS_AUDIO", "true").lower() == "true",
+            "save_history": os.environ.get("SAVE_HISTORY", "true").lower() == "true",
         }
+        # Nếu tồn tại file cấu hình, đọc thêm từ đó
         if os.path.exists("config.json"):
             try:
                 with open("config.json", "r", encoding="utf-8") as f:
                     file_config = json.load(f)
                     config.update(file_config)
             except Exception as e:
+                logger.error(f"Lỗi khi đọc file cấu hình: {e}")
         return config
     def _initialize_models(self):
         """Load the speech-to-text pipeline and the optional NLP/TTS components.

         The STT pipeline is always loaded. PhoBERT, VnCoreNLP and VietTTS are
         loaded only when their config flag is set; a failure in one of those
         is logged and its flag is switched off. Any exception that is not
         handled by those inner handlers is logged and re-raised.
         """
         try:
             # 1. Speech-to-text: PhoWhisper when requested, otherwise the configured model.
             if self.config["use_phowhisper"]:
                 stt_name = "vinai/PhoWhisper-small"
                 logger.info("Đang tải PhoWhisper...")
             else:
                 stt_name = self.config["stt_model"]
                 logger.info(f"Đang tải mô hình STT: {stt_name}")
             self.stt_model = pipeline(
                 "automatic-speech-recognition",
                 model=stt_name,
                 device=0 if torch.cuda.is_available() else -1,
             )

             # 2. Optional NLP helpers start out unset.
             self.phobert_model = self.phobert_tokenizer = self.rdrsegmenter = None

             if self.config["use_phobert"]:
                 logger.info("Đang tải PhoBERT...")
                 phobert_name = "vinai/phobert-base"
                 try:
                     self.phobert_tokenizer = AutoTokenizer.from_pretrained(phobert_name)
                     self.phobert_model = AutoModel.from_pretrained(phobert_name)
                 except Exception as e:
                     logger.error(f"Lỗi khi tải PhoBERT: {e}")
                     self.config["use_phobert"] = False

             if self.config["use_vncorenlp"]:
                 logger.info("Đang chuẩn bị VnCoreNLP...")
                 try:
                     jar_path = self._setup_vncorenlp()
                     from py_vncorenlp import VnCoreNLP

                     # NOTE(review): a jar path is passed positionally and the
                     # annotators as a plain string - confirm this matches the
                     # constructor of the installed `py_vncorenlp` package.
                     self.rdrsegmenter = VnCoreNLP(
                         jar_path,
                         annotators="wseg",
                         max_heap_size="-Xmx500m",
                     )
                 except Exception as e:
                     logger.error(f"Lỗi khi chuẩn bị VnCoreNLP: {e}")
                     self.config["use_vncorenlp"] = False

             # 3. Optional VietTTS; readiness is whatever the setup step reports.
             self.viettts_ready = False
             if self.config["use_viettts"]:
                 logger.info("Đang chuẩn bị VietTTS...")
                 try:
                     self.viettts_ready = self._setup_viettts()
                 except Exception as e:
                     logger.error(f"Lỗi khi chuẩn bị VietTTS: {e}")
                     self.config["use_viettts"] = False

             logger.info("Khởi tạo mô hình hoàn tất")
         except Exception as e:
             logger.error(f"Lỗi khi khởi tạo mô hình: {e}")
             raise
+    def _setup_vncorenlp(self):
+        """Tải và cài đặt VnCoreNLP"""
+        vncorenlp_dir = "data/models/vncorenlp"
+        vncorenlp_jar = f"{vncorenlp_dir}/VnCoreNLP-1.1.1.jar"
+        os.makedirs(vncorenlp_dir, exist_ok=True)
+        if not os.path.exists(vncorenlp_jar):
+            logger.info("Đang tải VnCoreNLP...")
+            # Tải jar file
+            url = "https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar"
+            response = requests.get(url)
+            with open(vncorenlp_jar, "wb") as f:
+                f.write(response.content)
+            # Tạo thư mục models
+            os.makedirs(f"{vncorenlp_dir}/models/wordsegmenter", exist_ok=True)
+            # Tải models
+            for model_file in ["vi-vocab", "wordsegmenter.rdr"]:
+                url = f"https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/{model_file}"
+                response = requests.get(url)
+                with open(f"{vncorenlp_dir}/models/wordsegmenter/{model_file}", "wb") as f:
+                    f.write(response.content)
+        return vncorenlp_jar
+    def _setup_viettts(self):
+        """Cài đặt và chuẩn bị VietTTS"""
+        viettts_dir = "data/models/viettts"
+        # Nếu đã tải VietTTS rồi
+        if os.path.exists(f"{viettts_dir}/pretrained"):
+            return True
+        # Clone repo nếu chưa có
+        os.makedirs(viettts_dir, exist_ok=True)
+        if not os.path.exists(f"{viettts_dir}/.git"):
+            logger.info("Đang clone VietTTS repository...")
+            result = subprocess.run(
+                ["git", "clone", "https://github.com/NTT123/vietTTS.git", viettts_dir],
+                capture_output=True,
+                text=True,
+            )
+            if result.returncode != 0:
+                logger.error(f"Lỗi khi clone VietTTS: {result.stderr}")
+                return False
+        # Cài đặt VietTTS
+        logger.info("Đang cài đặt VietTTS...")
+        os.chdir(viettts_dir)
+        result = subprocess.run(["pip", "install", "-e", "."], capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error(f"Lỗi khi cài đặt VietTTS: {result.stderr}")
+            os.chdir("..")
+            return False
+        # Tải mô hình pretrained
+        if not os.path.exists("pretrained"):
+            logger.info("Đang tải mô hình pretrained...")
+            result = subprocess.run(["bash", "scripts/quick_start.sh"], capture_output=True, text=True)
+            if result.returncode != 0:
+                logger.error(f"Lỗi khi tải mô hình pretrained: {result.stderr}")
+                os.chdir("..")
+                return False
+        os.chdir("..")
+        return True
+    def preprocess_audio(self, audio_path):
+        """Tiền xử lý âm thanh để cải thiện chất lượng"""
+        if not self.config["preprocess_audio"]:
+            return audio_path
         try:
+            # Đọc âm thanh
+            y, sr = librosa.load(audio_path, sr=16000)
+            # Chuẩn hóa âm lượng
+            y_normalized = librosa.util.normalize(y)
+            # Xử lý nhiễu (đơn giản)
+            y_filtered = self._simple_noise_reduction(y_normalized)
+            # Lưu file mới
+            processed_path = audio_path.replace(".wav", "_processed.wav")
+            sf.write(processed_path, y_filtered, sr)
+            return processed_path
         except Exception as e:
+            logger.error(f"Lỗi khi tiền xử lý âm thanh: {e}")
+            return audio_path
+    def _simple_noise_reduction(self, y):
+        """Áp dụng lọc nhiễu đơn giản"""
+        # Áp dụng high-pass filter để giảm nhiễu tần số thấp
+        b, a = signal.butter(5, 80 / (16000 / 2), "highpass")
+        y_filtered = signal.filtfilt(b, a, y)
+        return y_filtered
     def transcribe_audio(self, audio_path):
         """Transcribe an audio file to text with the STT model.

         Args:
             audio_path: path of the recording; may be empty or None when
                 nothing was recorded.

         Returns:
             str: the recognised text, or a message starting with "Lỗi:" when
             no audio was supplied or recognition failed.
         """
         # Nothing recorded: report it instead of handing None to the model.
         if not audio_path:
             return "Lỗi: Không có tệp âm thanh để nhận dạng."
         try:
             # Preprocess the audio when enabled.
             if self.config["preprocess_audio"]:
                 audio_path = self.preprocess_audio(audio_path)
             # Run speech recognition.
             result = self.stt_model(audio_path)
             # The result structure differs between models.
             if isinstance(result, dict) and "text" in result:
                 return result["text"]
             if isinstance(result, list):
                 return " ".join(chunk.get("text", "") for chunk in result)
             return str(result)
         except Exception as e:
             logger.error(f"Lỗi khi chuyển đổi âm thanh: {e}")
             return f"Lỗi: {str(e)}"
+    def segment_text(self, text):
+        """Tách từ văn bản tiếng Việt"""
+        if not text or not text.strip():
+            return text
+        # Nếu có VnCoreNLP, sử dụng RDRSegmenter
+        if self.config["use_vncorenlp"] and self.rdrsegmenter:
+            try:
+                sentences = self.rdrsegmenter.tokenize(text)
+                segmented_text = " ".join([" ".join(sentence) for sentence in sentences])
+                return segmented_text
+            except Exception as e:
+                logger.error(f"Lỗi khi tách từ với VnCoreNLP: {e}")
+        # Nếu không có VnCoreNLP hoặc lỗi, trả về nguyên bản
+        return text
     def analyze_text(self, transcript, dialect="Bắc"):
         """Analyse a transcript and return improvement suggestions.

         The transcript is word-segmented, then handed to the analyser of the
         configured LLM provider when that provider's key/endpoint is set;
         otherwise the rule-based analysis is used. An empty or
         whitespace-only transcript yields a fixed message.
         """
         if not (transcript and transcript.strip()):
             return "Không nhận được văn bản để phân tích."

         # Word segmentation.
         segmented_text = self.segment_text(transcript)

         # (provider name, config entry that must be set, analyser method name)
         routes = (
             ("openai", "openai_api_key", "_analyze_with_openai"),
             ("gemini", "gemini_api_key", "_analyze_with_gemini"),
             ("local", "local_llm_endpoint", "_analyze_with_local_llm"),
         )
         llm_provider = self.config["llm_provider"]
         # Rule-based analysis unless a configured provider matches.
         handler_name = "_rule_based_analysis"
         for provider, required_key, method_name in routes:
             if llm_provider == provider and self.config[required_key]:
                 handler_name = method_name
                 break
         return getattr(self, handler_name)(transcript, segmented_text, dialect)
+    def _analyze_with_openai(self, transcript, segmented_text, dialect):
+        """Phân tích văn bản sử dụng OpenAI API"""
         try:
+            headers = {
+                "Authorization": f"Bearer {self.config['openai_api_key']}",
+                "Content-Type": "application/json",
+            }
+            # Tạo prompt
+            prompt = self._create_analysis_prompt(transcript, segmented_text, dialect)
+            # Gọi API
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json={
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Bạn là trợ lý dạy tiếng Việt, chuyên phân tích và đưa ra gợi ý cải thiện kỹ năng nói.",
+                        },
+                        {"role": "user", "content": prompt},
+                    ],
+                    "temperature": 0.5,
+                    "max_tokens": 800,
+                },
+            )
             if response.status_code == 200:
                 result = response.json()
+                analysis = result["choices"][0]["message"]["content"]
+                return analysis
             else:
+                logger.error(f"Lỗi khi gọi OpenAI API: {response.text}")
+                return self._rule_based_analysis(transcript, segmented_text, dialect)
         except Exception as e:
+            logger.error(f"Lỗi khi phân tích với OpenAI: {e}")
+            return self._rule_based_analysis(transcript, segmented_text, dialect)
+    def _analyze_with_gemini(self, transcript, segmented_text, dialect):
+        """Phân tích văn bản sử dụng Gemini API"""
         try:
+            headers = {
+                "Content-Type": "application/json",
+            }
+            # Tạo prompt
+            prompt = self._create_analysis_prompt(transcript, segmented_text, dialect)
+            # Endpoint Gemini
+            url = (
+                f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro:generateContent?key={self.config['gemini_api_key']}"
+            )
+            # Gọi API
+            response = requests.post(
+                url,
+                headers=headers,
+                json={
+                    "contents": [
+                        {
+                            "role": "user",
+                            "parts": [{"text": prompt}],
+                        }
+                    ],
+                    "generationConfig": {
+                        "temperature": 0.4,
+                        "maxOutputTokens": 800,
+                    },
+                },
+            )
             if response.status_code == 200:
                 result = response.json()
+                if "candidates" in result and len(result["candidates"]) > 0:
+                    analysis = result["candidates"][0]["content"]["parts"][0]["text"]
+                    return analysis
+                else:
+                    logger.error(f"Định dạng phản hồi Gemini không như mong đợi: {result}")
+                    return self._rule_based_analysis(transcript, segmented_text, dialect)
+            else:
+                logger.error(f"Lỗi khi gọi Gemini API: {response.text}")
+                return self._rule_based_analysis(transcript, segmented_text, dialect)
+        except Exception as e:
+            logger.error(f"Lỗi khi phân tích với Gemini: {e}")
+            return self._rule_based_analysis(transcript, segmented_text, dialect)
+    def _analyze_with_local_llm(self, transcript, segmented_text, dialect):
+        """Phân tích văn bản sử dụng LLM mã nguồn mở local"""
+        try:
+            headers = {
+                "Content-Type": "application/json",
+            }
+            # Tạo prompt
+            prompt = self._create_analysis_prompt(transcript, segmented_text, dialect)
+            # Endpoint local LLM
+            url = f"{self.config['local_llm_endpoint']}/chat/completions"
+            # Gọi API
+            response = requests.post(
+                url,
+                headers=headers,
+                json={
+                    "model": "local-model",
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Bạn là trợ lý dạy tiếng Việt, chuyên phân tích và đưa ra gợi ý cải thiện kỹ năng nói.",
+                        },
+                        {"role": "user", "content": prompt},
+                    ],
+                    "temperature": 0.5,
+                    "max_tokens": 800,
+                },
+            )
+            if response.status_code == 200:
+                result = response.json()
+                analysis = result["choices"][0]["message"]["content"]
+                return analysis
+            else:
+                logger.error(f"Lỗi khi gọi Local LLM API: {response.text}")
+                return self._rule_based_analysis(transcript, segmented_text, dialect)
+        except Exception as e:
+            logger.error(f"Lỗi khi phân tích với Local LLM: {e}")
+            return self._rule_based_analysis(transcript, segmented_text, dialect)
+    def _create_analysis_prompt(self, transcript, segmented_text, dialect):
+        """Tạo prompt cho việc phân tích văn bản"""
+        return f"""Bạn là trợ lý dạy tiếng Việt. Hãy phân tích câu nói sau và đưa ra gợi ý cải thiện:
+Câu nói: "{transcript}"
+Câu đã tách từ: "{segmented_text}"
+Phương ngữ: {dialect}
+Hãy phân tích theo các khía cạnh sau:
+1. Ngữ pháp: Cấu trúc câu, thì, cách sử dụng từ nối
+2. Từ vựng: Từ không phù hợp, từ dùng không đúng ngữ cảnh, từ viết tắt
+3. Phong cách: Mức độ trang trọng, thân mật, văn phong
+4. Tính mạch lạc: Tính rõ ràng, dễ hiểu của câu
+Đưa ra gợi ý cụ thể để cải thiện cách diễn đạt.
+Viết câu mẫu cải thiện.
+Định dạng phản hồi:
+- Sử dụng Markdown
+- Đặt các vấn đề vào danh sách có đánh dấu
+- Đưa ra câu mẫu cải thiện ở cuối"""
+    def _rule_based_analysis(self, transcript, segmented_text, dialect):
+        """Phân tích dựa trên quy tắc đơn giản"""
+        # Phân tích cơ bản khi không có LLM
+        words = transcript.split()
+        analysis = []
+        # 1. Phân tích độ dài câu
+        if len(words) < 3:
+            analysis.append("⚠️ **Câu quá ngắn**: Thử mở rộng ý với các chi tiết hơn.")
+        elif len(words) > 20:
+            analysis.append("⚠️ **Câu dài**: Cân nhắc chia thành các câu ngắn hơn.")
+        else:
+            analysis.append("✅ **Độ dài câu**: Phù hợp.")
+        # 2. Kiểm tra từ ngữ phổ biến
+        common_errors = {
+            "ko": "không",
+            "k": "không",
+            "bik": "biết",
+            "j": "gì",
+            "z": "vậy",
+            "ntn": "như thế nào",
+            "dc": "được",
+            "vs": "với",
+            "nc": "nước",
+            "ng": "người",
+            "trc": "trước",
+            "sao": "sao",
+        }
+        errors_found = []
+        for word in words:
+            word_lower = word.lower()
+            if word_lower in common_errors:
+                errors_found.append(f"'{word}' → '{common_errors[word_lower]}'")
+        if errors_found:
+            analysis.append(f"⚠️ **Từ viết tắt**: Nên dùng từ đầy đủ thay vì: {', '.join(errors_found)}")
+        else:
+            analysis.append("✅ **Sử dụng từ**: Không phát hiện từ viết tắt phổ biến.")
+        # 3. Tính trùng lặp
+        word_counts = {}
+        for word in words:
+            word_lower = word.lower()
+            if len(word_lower) > 1:  # Bỏ qua các từ ngắn
+                word_counts[word_lower] = word_counts.get(word_lower, 0) + 1
+        duplicates = [w for w, c in word_counts.items() if c > 2]
+        if duplicates:
+            analysis.append(
+                f"⚠️ **Trùng lặp từ**: Từ '{', '.join(duplicates)}' lặp lại nhiều lần. Hãy thử dùng từ đồng nghĩa."
+            )
+        # 4. Gợi ý cải thiện phụ thuộc phương ngữ
+        if dialect == "Bắc":
+            suggestions = [
+                "Phát âm rõ ràng phụ âm cuối, tránh nuốt âm",
+                "Chú ý tới thanh điệu, đặc biệt là thanh hỏi và thanh ngã",
+                "Phát âm 'r' và 'gi' phân biệt theo phong cách Bắc Bộ",
+            ]
+        elif dialect == "Trung":
+            suggestions = [
+                "Chú ý đến nhịp điệu đặc trưng của giọng Trung",
+                "Phát âm rõ phụ âm đầu, đặc biệt là 'tr' và 'ch'",
+                "Kéo dài nguyên âm một cách tự nhiên",
+            ]
+        else:  # Nam
+            suggestions = [
+                "Giữ nguyên âm ổn định, tránh biến đổi nguyên âm",
+                "Phân biệt rõ 'v' và 'gi' theo phong cách Nam Bộ",
+                "Tránh nhấn quá mạnh vào các phụ âm cuối",
+            ]
+        # 5. Câu mẫu cải thiện
+        improved = transcript
+        for word, replacement in common_errors.items():
+            improved = improved.replace(f" {word} ", f" {replacement} ")
+        # Ghép tất cả phân tích lại
+        full_analysis = "### Phân tích\n\n" + "\n\n".join(analysis)
+        full_analysis += "\n\n### Gợi ý cải thiện\n\n" + "\n".join([f"- {s}" for s in suggestions])
+        full_analysis += f"\n\n### Câu gợi ý\n\n{improved}"
+        return full_analysis
+    def text_to_speech(self, text, dialect="Bắc"):
+        """Chuyển văn bản thành giọng nói"""
+        # Nếu có API TTS
+        if self.config["tts_api_url"]:
+            try:
+                # Gọi API TTS
+                response = requests.post(
+                    self.config["tts_api_url"], json={"text": text, "dialect": dialect.lower()}
+                )
+                if response.status_code == 200:
+                    # Lưu audio vào file tạm
+                    output_file = f"data/audio/tts_{int(time.time())}.wav"
+                    with open(output_file, "wb") as f:
+                        f.write(response.content)
+                    return output_file
+                else:
+                    logger.error(f"Lỗi khi gọi API TTS: {response.text}")
+                    return None
+            except Exception as e:
+                logger.error(f"Lỗi khi gọi API TTS: {e}")
+                return None
+        # Nếu có VietTTS
+        elif self.config["use_viettts"] and self.viettts_ready:
+            try:
+                # Chuẩn bị VietTTS
+                viettts_dir = "data/models/viettts"
+                # Tạo file tạm thời để lưu văn bản
+                with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt", encoding="utf-8") as f:
+                    f.write(text)
+                    text_file = f.name
+                # Tạo tên file output
+                output_file = f"data/audio/tts_{int(time.time())}.wav"
+                # Lưu thư mục hiện tại
+                current_dir = os.getcwd()
+                try:
+                    # Đổi thư mục làm việc sang viettts_dir
+                    os.chdir(viettts_dir)
+                    # Gọi VietTTS để tạo giọng nói
+                    cmd = [
+                        "python",
+                        "-m",
+                        "vietTTS.synthesizer",
+                        "--lexicon-file=./train_data/lexicon.txt",
+                        f"--text-file={text_file}",
+                        f"--output={os.path.join(current_dir, output_file)}",
+                    ]
+                    result = subprocess.run(cmd, capture_output=True, text=True)
+                    # Quay lại thư mục ban đầu
+                    os.chdir(current_dir)
+                    if result.returncode != 0:
+                        logger.error(f"Lỗi khi chạy VietTTS: {result.stderr}")
+                        return None
+                    # Xóa file tạm
+                    os.unlink(text_file)
+                    return output_file
+                except Exception as e:
+                    # Đảm bảo quay lại thư mục ban đầu
+                    os.chdir(current_dir)
+                    logger.error(f"Lỗi khi sử dụng VietTTS: {e}")
+                    os.unlink(text_file)
+                    return None
+            except Exception as e:
+                logger.error(f"Lỗi khi tạo file tạm: {e}")
+                return None
+        return None
+    def process_recording(self, audio_path, dialect="Bắc"):
+        """Xử lý bản ghi âm: chuyển sang văn bản và phân tích"""
+        if audio_path is None:
+            return "Không có âm thanh được ghi.", "", None
+        # 1. Chuyển đổi âm thanh thành văn bản
+        transcript = self.transcribe_audio(audio_path)
+        # 2. Phân tích văn bản
+        analysis = self.analyze_text(transcript, dialect)
+        # 3. Tạo mẫu phát âm (nếu có)
+        sample_audio = self.text_to_speech(transcript, dialect)
+        # 4. Lưu vào lịch sử phiên
+        entry = {
+            "id": len(self.session_history) + 1,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "transcript": transcript,
+            "analysis": analysis,
+            "audio_path": audio_path,
+            "sample_audio": sample_audio,
+            "dialect": dialect,
+        }
+        self.session_history.append(entry)
+        # 5. Lưu lịch sử nếu được cấu hình
+        if self.config["save_history"]:
+            self._save_session_history()
+        return transcript, analysis, sample_audio
+    def evaluate_pronunciation(self, original_audio, text, dialect="Bắc"):
+        """Đánh giá chất lượng phát âm bằng cách so sánh với mẫu chuẩn"""
+        if not self.config["enable_pronunciation_eval"]:
+            return {"score": 0, "feedback": "Tính năng đánh giá phát âm không được bật"}
+        try:
+            # 1. Tạo phát âm mẫu từ text
+            sample_audio = self.text_to_speech(text, dialect)
+            if not sample_audio:
+                return {"score": 0, "feedback": "Không thể tạo mẫu phát âm chuẩn"}
+            # 2. Trích xuất đặc trưng từ cả hai file âm thanh
+            # Trích xuất MFCCs (Mel-frequency cepstral coefficients)
+            def extract_mfcc(audio_file):
+                y, sr = librosa.load(audio_file, sr=16000)
+                mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+                return mfccs
+            original_mfccs = extract_mfcc(original_audio)
+            sample_mfccs = extract_mfcc(sample_audio)
+            # 3. So sánh bằng DTW (Dynamic Time Warping)
+            # Đơn giản hóa: tính khoảng cách Euclidean giữa hai vector MFCC
+            # Trong thực tế, nên dùng DTW hoặc thuật toán phức tạp hơn
+            def dtw_distance(mfcc1, mfcc2):
+                # Chỉ lấy một phần của các frames để so sánh
+                min_len = min(mfcc1.shape[1], mfcc2.shape[1])
+                dist = np.linalg.norm(mfcc1[:, :min_len] - mfcc2[:, :min_len])
+                return dist
+            distance = dtw_distance(original_mfccs, sample_mfccs)
+            # 4. Tính điểm dựa trên khoảng cách
+            max_distance = 100  # Giá trị tối đa để chuẩn hóa
+            normalized_distance = min(distance, max_distance) / max_distance
+            pronunciation_score = 100 * (1 - normalized_distance)
+            # 5. Phản hồi
+            feedback = self._get_pronunciation_feedback(pronunciation_score, dialect)
+            evaluation = {
+                "score": round(pronunciation_score, 2),
+                "sample_audio": sample_audio,
+                "feedback": feedback,
+            }
+            return evaluation
+        except Exception as e:
+            logger.error(f"Lỗi khi đánh giá phát âm: {e}")
+            return {"score": 0, "feedback": f"Lỗi khi đánh giá: {str(e)}"}
+    def _get_pronunciation_feedback(self, score, dialect):
+        """Đưa ra phản hồi dựa trên điểm phát âm"""
+        prefix = f"**Phương ngữ {dialect}**: "
+        if score >= 90:
+            return prefix + "Phát âm rất tốt! Gần như giống với mẫu chuẩn."
+        elif score >= 80:
+            return prefix + "Phát âm tốt. Có một vài điểm nhỏ cần cải thiện."
+        elif score >= 70:
+            return prefix + "Phát âm khá tốt. Hãy chú ý đến ngữ điệu và các phụ âm cuối."
+        elif score >= 60:
+            return prefix + "Phát âm trung bình. Cần luyện tập thêm về nhịp điệu và độ rõ ràng."
+        else:
+            return prefix + "Cần luyện tập nhiều hơn. Hãy tập trung vào từng âm tiết và chú ý các dấu."
+    def _save_session_history(self):
+        """Lưu lịch sử phiên hiện tại vào file"""
+        try:
+            history_file = f"data/reports/session_{self.current_session_id}.json"
+            # Chuyển đổi thành JSON serializable
+            serializable_history = []
+            for entry in self.session_history:
+                # Tạo bản sao để không thay đổi bản gốc
+                entry_copy = entry.copy()
+                # Chỉ lưu đường dẫn, không lưu nội dung file
+                if "audio_path" in entry_copy and entry_copy["audio_path"]:
+                    entry_copy["audio_path"] = os.path.basename(entry_copy["audio_path"])
+                if "sample_audio" in entry_copy and entry_copy["sample_audio"]:
+                    entry_copy["sample_audio"] = os.path.basename(entry_copy["sample_audio"])
+                serializable_history.append(entry_copy)
+            with open(history_file, "w", encoding="utf-8") as f:
+                json.dump(
+                    {
+                        "session_id": self.current_session_id,
+                        "start_time": time.strftime(
+                            "%Y-%m-%d %H:%M:%S", time.localtime(self.current_session_id)
+                        ),
+                        "entries": serializable_history,
+                    },
+                    f,
+                    ensure_ascii=False,
+                    indent=2,
+                )
+        except Exception as e:
+            logger.error(f"Lỗi khi lưu lịch sử phiên: {e}")
+    def export_session(self, format="markdown"):
+        """Xuất báo cáo buổi luyện tập"""
+        if not self.session_history:
+            return None
+        try:
+            if format == "markdown":
+                return self._export_markdown()
+            elif format == "html":
+                return self._export_html()
             else:
+                return self._export_markdown()  # Mặc định là markdown
         except Exception as e:
+            logger.error(f"Lỗi khi xuất báo cáo: {e}")
+            return None
+    def _export_markdown(self):
+        """Xuất báo cáo dạng Markdown"""
+        # Tạo nội dung báo cáo
+        content = "# BÁO CÁO LUYỆN NÓI TIẾNG VIỆT\n\n"
+        content += f"Ngày: {time.strftime('%Y-%m-%d')}\n"
+        content += f"Tổng số câu: {len(self.session_history)}\n\n"
+        for entry in self.session_history:
+            content += f"## Câu {entry['id']} ({entry['time']})\n\n"
+            content += f"**Phương ngữ:** {entry['dialect']}\n\n"
+            content += f"**Bạn nói:** {entry['transcript']}\n\n"
+            content += f"**Phân tích:**\n{entry['analysis']}\n\n"
+            content += "---\n\n"
+        # Thêm thống kê tổng quát
+        content += "## Thống kê tổng quát\n\n"
+        # Tính số từ trung bình mỗi câu
+        avg_words = sum(len(entry["transcript"].split()) for entry in self.session_history) / len(
+            self.session_history
+        )
+        content += f"- Số từ trung bình mỗi câu: {avg_words:.2f}\n"
+        # Lưu báo cáo
+        filename = f"data/reports/bao_cao_{time.strftime('%Y%m%d_%H%M%S')}.md"
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(content)
+        return filename
+    def _export_html(self):
+        """Xuất báo cáo dạng HTML"""
+        # Tạo nội dung HTML
+        html = """<!DOCTYPE html>
+<html lang="vi">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Báo cáo luyện nói tiếng Việt</title>
+    <style>
+        body { font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }
+        h1, h2 { color: #2c3e50; }
+        .entry { margin-bottom: 30px; border-bottom: 1px solid #eee; padding-bottom: 20px; }
+        .transcript { background-color: #f8f9fa; padding: 10px; border-left: 4px solid #4CAF50; }
+        .analysis { margin-top: 10px; }
+        .meta { color: #7f8c8d; font-size: 0.9em; }
+        .dialect { display: inline-block; background-color: #e74c3c; color: white; padding: 2px 6px; border-radius: 3px; font-size: 0.8em; }
+    </style>
+</head>
+<body>
+    <h1>Báo cáo luyện nói tiếng Việt</h1>
+    <p>Ngày: %s</p>
+    <p>Tổng số câu: %d</p>
+    <div class="entries">
+""" % (
+            time.strftime("%Y-%m-%d"),
+            len(self.session_history),
+        )
+        for entry in self.session_history:
+            html += f"""
+        <div class="entry">
+            <h2>Câu {entry['id']}</h2>
+            <div class="meta">Thời gian: {entry['time']} | <span class="dialect">{entry['dialect']}</span></div>
+            <div class="transcript">{entry['transcript']}</div>
+            <div class="analysis">{entry['analysis']}</div>
+        </div>
+"""
+        # Thêm thống kê
+        avg_words = sum(len(entry["transcript"].split()) for entry in self.session_history) / len(
+            self.session_history
+        )
+        html += f"""
+    </div>
+    <h2>Thống kê tổng quát</h2>
+    <ul>
+        <li>Số từ trung bình mỗi câu: {avg_words:.2f}</li>
+    </ul>
+</body>
+</html>
+"""
+        # Lưu báo cáo
+        filename = f"data/reports/bao_cao_{time.strftime('%Y%m%d_%H%M%S')}.html"
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(html)
+        return filename
+    def create_conversation_scenario(self):
+        """Tạo một tình huống hội thoại thực tế cho người dùng luyện tập"""
+        # Danh sách các tình huống
+        scenarios = [
+            {
+                "title": "Chào hỏi và giới thiệu bản thân",
+                "description": "Bạn gặp một người mới tại một sự kiện networking.",
+                "prompts": [
+                    "Chào bạn, mình là người tổ chức sự kiện. Bạn tên gì và đang làm việc ở đâu?",
+                    "Bạn có thể chia sẻ một chút về công việc của mình được không?",
+                    "Bạn quan tâm đến lĩnh vực nào trong sự kiện này?",
+                ],
+            },
+            {
+                "title": "Đặt món tại nhà hàng",
+                "description": "Bạn đang ở một nhà hàng và muốn gọi món.",
+                "prompts": [
+                    "Xin chào, tôi có thể giúp gì cho bạn?",
+                    "Bạn muốn đặt món gì? Hôm nay chúng tôi có món đặc biệt là cá hồi nướng.",
+                    "Bạn muốn uống thêm gì không? Chúng tôi có nhiều loại nước và rượu vang.",
+                ],
+            },
+            {
+                "title": "Phỏng vấn công việc",
+                "description": "Bạn đang trong một cuộc phỏng vấn xin việc.",
+                "prompts": [
+                    "Chào bạn, bạn có thể giới thiệu ngắn gọn về bản thân được không?",
+                    "Tại sao bạn muốn làm việc tại công ty chúng tôi?",
+                    "Bạn có kinh nghiệm gì liên quan đến vị trí này không?",
+                ],
+            },
+            {
+                "title": "Thuyết trình ý tưởng",
+                "description": "Bạn đang thuyết trình một ý tưởng mới cho đồng nghiệp.",
+                "prompts": [
+                    "Hãy giới thiệu về ý tưởng của bạn một cách ngắn gọn.",
+                    "Ý tưởng này giải quyết vấn đề gì và đối tượng hướng đến là ai?",
+                    "Bạn cần những nguồn lực gì để thực hiện ý tưởng này?",
+                ],
+            },
+            {
+                "title": "Hỏi đường",
+                "description": "Bạn đang du lịch và cần hỏi đường đến một địa điểm.",
+                "prompts": [
+                    "Xin chào, tôi có thể giúp gì cho bạn?",
+                    "Bạn đang tìm đường đến đâu?",
+                    "Bạn muốn đi bằng phương tiện gì? Đi bộ, xe buýt hay taxi?",
+                ],
+            },
+        ]
+        # Chọn ngẫu nhiên một tình huống
+        scenario = random.choice(scenarios)
+        return scenario
+    def track_progress(self):
+        """Theo dõi tiến độ của người dùng qua thời gian"""
+        if not self.session_history:
+            return {
+                "message": "Chưa có dữ liệu để theo dõi tiến độ",
+                "statistics": {},
+                "charts": {},
+            }
+        # Tính toán các chỉ số tiến triển
+        total_entries = len(self.session_history)
+        # Phân tích độ dài câu qua thời gian
+        sentence_lengths = [len(entry["transcript"].split()) for entry in self.session_history]
+        avg_length = sum(sentence_lengths) / total_entries
+        # Tính số từ độc đáo sử dụng
+        all_words = []
+        for entry in self.session_history:
+            all_words.extend(entry["transcript"].lower().split())
+        unique_words = set(all_words)
+        vocabulary_size = len(unique_words)
+        # Tạo báo cáo tiến độ
+        progress_report = {
+            "message": "Dữ liệu theo dõi tiến độ",
+            "statistics": {
+                "total_entries": total_entries,
+                "avg_sentence_length": round(avg_length, 2),
+                "vocabulary_size": vocabulary_size,
+                "improvement_score": min(100, int(total_entries * 5 + vocabulary_size / 10)),
+            },
+            "charts": self._generate_progress_charts(),
+        }
+        return progress_report
+    def _generate_progress_charts(self):
+        """Tạo biểu đồ trực quan hóa tiến độ"""
+        # Dữ liệu cho biểu đồ
+        sentence_ids = [entry["id"] for entry in self.session_history]
+        sentence_lengths = [len(entry["transcript"].split()) for entry in self.session_history]
+        # Tạo biểu đồ độ dài câu
+        plt.figure(figsize=(10, 5))
+        plt.plot(sentence_ids, sentence_lengths, marker="o", linestyle="-")
+        plt.title("Độ dài câu qua thời gian")
+        plt.xlabel("Số thứ tự câu")
+        plt.ylabel("Số từ trong câu")
+        plt.grid(True, linestyle="--", alpha=0.7)
+        # Lưu biểu đồ vào buffer
+        length_chart_buf = io.BytesIO()
+        plt.savefig(length_chart_buf, format="png", dpi=100)
+        length_chart_buf.seek(0)
+        length_chart_b64 = base64.b64encode(length_chart_buf.read()).decode("utf-8")
+        plt.close()
+        # Biểu đồ phân bố độ dài câu
+        plt.figure(figsize=(8, 4))
+        plt.hist(sentence_lengths, bins=10, alpha=0.7)
+        plt.title("Phân bố độ dài câu")
+        plt.xlabel("Số từ trong câu")
+        plt.ylabel("Tần suất")
+        plt.grid(True, linestyle="--", alpha=0.7)
+        dist_chart_buf = io.BytesIO()
+        plt.savefig(dist_chart_buf, format="png", dpi=100)
+        dist_chart_buf.seek(0)
+        dist_chart_b64 = base64.b64encode(dist_chart_buf.read()).decode("utf-8")
+        plt.close()
+        return {
+            "length_chart": f"data:image/png;base64,{length_chart_b64}",
+            "distribution_chart": f"data:image/png;base64,{dist_chart_b64}",
+        }
     def clean_up(self):
+        """Dọn dẹp tài nguyên trước khi thoát"""
+        # Lưu lịch sử phiên cuối cùng
+        if self.config["save_history"] and self.session_history:
+            self._save_session_history()
+        # Dừng bộ xử lý bất đồng bộ
+        if hasattr(self, "async_processor"):
+            self.async_processor.stop()
+        # Giải phóng bộ nhớ GPU nếu cần
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        logger.info("Đã dọn dẹp tài nguyên")
+# Tạo giao diện Gradio
 def create_demo():
+    try:
+        trainer = VietSpeechTrainer()
+        with gr.Blocks(title="Công cụ Luyện Nói Tiếng Việt", theme=gr.themes.Soft(primary_hue="blue")) as demo:
+            # Header
+            with gr.Row(variant="panel"):
+                with gr.Column(scale=6):
+                    gr.Markdown(
+                        """
+                    # 🎤 Công cụ Luyện Nói Tiếng Việt AI
+                    ### Nâng cao kỹ năng giao tiếp tiếng Việt với trợ lý AI thông minh
+                    """
+                    )
+                with gr.Column(scale=1):
+                    dialect_selector = gr.Radio(["Bắc", "Trung", "Nam"], label="Phương ngữ tiếng Việt", value="Bắc")
+            # Tabs for different functions
+            with gr.Tabs() as tabs:
+                # Tab 1: Luyện phát âm
+                with gr.TabItem("Luyện phát âm", id=0):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            # Khu vực đầu vào
+                            audio_input = gr.Audio(
+                                label="📝 Giọng nói của bạn",
+                                type="filepath",
+                                source="microphone",
+                                format="wav",
+                            )
+                            with gr.Row():
+                                submit_btn = gr.Button("🔍 Phân tích", variant="primary")
+                                clear_btn = gr.Button("🗑️ Xóa")
+                            gr.Markdown(
+                                """
+                            ### Chủ đề gợi ý:
+                            - 🎯 Giới thiệu bản thân
+                            - 🎯 Kể về một trải nghiệm thú vị
+                            - 🎯 Mô tả một địa điểm yêu thích
+                            - 🎯 Trình bày quan điểm về một vấn đề
+                            """
+                            )
+                        with gr.Column(scale=3):
+                            # Khu vực kết quả
+                            transcript_output = gr.Textbox(
+                                label="Nội dung bạn vừa nói",
+                                placeholder="Nội dung sẽ hiển thị ở đây...",
+                                lines=3,
+                            )
+                            analysis_output = gr.Markdown(label="Phân tích và gợi ý cải thiện")
+                            with gr.Row():
+                                with gr.Column(scale=1):
+                                    gr.Markdown("#### Phát âm của bạn:")
+                                    playback_audio = gr.Audio(label="", type="filepath")
+                                with gr.Column(scale=1):
+                                    gr.Markdown("#### Phát âm mẫu:")
+                                    sample_audio = gr.Audio(label="", type="filepath")
+                    # Lịch sử phiên
+                    with gr.Accordion("Lịch sử phiên luyện tập", open=False):
+                        history_md = gr.Markdown("*Chưa có lịch sử luyện tập*")
+                # Tab 2: Hội thoại
+                with gr.TabItem("Hội thoại", id=1):
+                    scenario_title = gr.Markdown("## Tình huống hội thoại")
+                    scenario_desc = gr.Markdown("*Nhấn Tạo tình huống để bắt đầu*")
+                    prompt_text = gr.Markdown("*Câu hỏi/lời thoại sẽ hiển thị ở đây*")
+                    conversation_audio = gr.Audio(label="Trả lời của bạn", source="microphone", type="filepath")
+                    conversation_transcript = gr.Textbox(label="Văn bản của bạn", lines=2)
+                    conversation_feedback = gr.Markdown(label="Phản hồi")
+                    with gr.Row():
+                        new_scenario_btn = gr.Button("🔄 Tạo tình huống mới")
+                        next_prompt_btn = gr.Button("➡️ Câu tiếp theo")
+                        analyze_response_btn = gr.Button("🔍 Phân tích câu trả lời")
+                # Tab 3: Tiến độ
+                with gr.TabItem("Tiến độ", id=2):
+                    refresh_stats_btn = gr.Button("🔄 Cập nhật thống kê")
+                    with gr.Row():
+                        with gr.Column():
+                            stats_output = gr.JSON(label="Thống kê", value={"message": "Nhấn Cập nhật thống kê để xem"})
+                    with gr.Row():
+                        with gr.Column():
+                            length_chart = gr.Image(label="Độ dài câu qua thời gian", show_download_button=False)
+                        with gr.Column():
+                            dist_chart = gr.Image(label="Phân bố độ dài câu", show_download_button=False)
+                # Tab 4: Xuất báo cáo
+                with gr.TabItem("Xuất báo cáo", id=3):
+                    with gr.Row():
+                        export_md_btn = gr.Button("📝 Xuất báo cáo Markdown")
+                        export_html_btn = gr.Button("🌐 Xuất báo cáo HTML")
+                    export_output = gr.File(label="Tải báo cáo")
+                # Tab 5: Thông tin
+                with gr.TabItem("Thông tin", id=4):
+                    gr.Markdown(
+                        """
+                    ## Về công cụ luyện nói tiếng Việt
+                    Công cụ này sử dụng các mô hình trí tuệ nhân tạo tiên tiến để giúp người dùng cải thiện kỹ năng nói tiếng Việt.
+                    ### Công nghệ sử dụng
+                    - **Speech-to-Text**: Chuyển đổi giọng nói thành văn bản với độ chính xác cao
+                       - PhoWhisper hoặc wav2vec2-Vietnamese
+                    - **Phân tích ngôn ngữ**: Phân tích cấu trúc câu, phát hiện lỗi
+                       - PhoBERT kết hợp với LLM (Gemini/OpenAI/Local)
+                    - **Text-to-Speech**: Tạo mẫu phát âm chuẩn
+                       - VietTTS hoặc API TTS
+                    ### Tính năng chính
+                    - Nhận dạng và phân tích giọng nói tiếng Việt
+                    - Phát hiện lỗi ngữ pháp, từ vựng và cách diễn đạt
+                    - Phát âm mẫu chuẩn với VietTTS
+                    - Lưu trữ và theo dõi tiến độ
+                    - Gợi ý cải thiện cá nhân hóa
+                    - Hỗ trợ nhiều phương ngữ (Bắc, Trung, Nam)
+                    - Luyện tập hội thoại với tình huống thực tế
+                    ### Mô hình AI sử dụng
+                    - **PhoWhisper**: Mô hình nhận dạng giọng nói tiếng Việt tiên tiến nhất (2024), được phát triển bởi VinAI Research.
+                    - **PhoBERT**: Mô hình hiểu ngôn ngữ tự nhiên tiếng Việt SOTA, cũng được phát triển bởi VinAI Research.
+                    - **VietTTS**: Mô hình chuyển văn bản tiếng Việt thành giọng nói.
+                    ### Hướng dẫn sử dụng
+                    1. Chọn tab "Luyện phát âm" hoặc "Hội thoại"
+                    2. Thu âm giọng nói của bạn
+                    3. Nhận phản hồi và gợi ý cải thiện từ AI
+                    4. Theo dõi tiến độ trong tab "Tiến độ"
+                    5. Xuất báo cáo để lưu lại kết quả học tập
+                    """
+                    )
+            # Xử lý sự kiện
+            # 1. Tab Luyện phát âm
+            def process_and_display(audio, dialect):
+                if audio is None:
+                    return "Vui lòng thu âm trước khi phân tích.", "", None, None, None
+                # Xử lý bản ghi âm
+                transcript, analysis, sample_audio_path = trainer.process_recording(audio, dialect)
+                # Cập nhật lịch sử
+                history_html = update_history()
+                return transcript, analysis, audio, sample_audio_path, history_html
+            def update_history():
+                if not trainer.session_history:
+                    return "*Chưa có lịch sử luyện tập*"
+                history = "### Lịch sử phiên\n\n"
+                for entry in trainer.session_history[-10:]:  # Chỉ hiển thị 10 mục gần nhất
+                    short_t = entry["transcript"][:50]
+                    suffix = "..." if len(entry["transcript"]) > 50 else ""
+                    history += f"{entry['id']}. **{entry['time']}**: {short_t}{suffix}\n"
+                return history
+            def clear_inputs():
+                return None, "", "", None, None
+            submit_btn.click(
+                fn=process_and_display,
+                inputs=[audio_input, dialect_selector],
+                outputs=[transcript_output, analysis_output, playback_audio, sample_audio, history_md],
+            )
+            clear_btn.click(fn=clear_inputs, inputs=[], outputs=[audio_input, transcript_output, analysis_output, playback_audio, sample_audio])
+            # 2. Tab Hội thoại
+            # State holders for the conversation tab: the active scenario (initially None)
+            # and the index of the prompt currently shown (initially 0). Both are written
+            # by load_new_scenario; the index is also advanced by next_prompt.
+            current_scenario = gr.State(None)
+            current_prompt_index = gr.State(0)
+            def load_new_scenario():
+                scenario = trainer.create_conversation_scenario()
+                return (
+                    f"## {scenario['title']}",
+                    f"*{scenario['description']}*",
+                    f"**Bot**: {scenario['prompts'][0]}",
+                    scenario,
+                    0,
+                )
+            def next_prompt(scenario, prompt_index):
+                if scenario is None:
+                    return "Vui lòng tạo tình huống trước", prompt_index
+                next_index = prompt_index + 1
+                if next_index >= len(scenario["prompts"]):
+                    return "Đã hết các câu hỏi trong tình huống này. Hãy tạo tình huống mới!", prompt_index
+                return f"**Bot**: {scenario['prompts'][next_index]}", next_index
+            def analyze_conversation_response(audio, scenario, prompt_index, dialect):
+                if audio is None:
+                    return "Vui lòng ghi âm câu trả lời trước", ""
+                if scenario is None or prompt_index >= len(scenario["prompts"]):
+                    return "Không có tình huống hoặc câu hỏi hợp lệ", ""
+                # Xử lý âm thanh -> văn bản
+                transcript = trainer.transcribe_audio(audio)
+                # Phân tích câu trả lời trong ngữ cảnh
+                context = scenario["prompts"][prompt_index]
+                prompt = f"""Phân tích câu trả lời trong cuộc hội thoại:
+Ngữ cảnh: {context}
+Câu trả lời: {transcript}
+Phương ngữ: {dialect}
+Hãy đánh giá tính phù hợp của câu trả lời với ngữ cảnh, cách diễn đạt, và đưa ra gợi ý cải thiện.
+"""
+                # Sử dụng hàm phân tích với LLM (nếu có)
+                if trainer.config["llm_provider"] != "none":
+                    if trainer.config["llm_provider"] == "openai":
+                        analysis = trainer._analyze_with_openai(transcript, "", dialect)
+                    elif trainer.config["llm_provider"] == "gemini":
+                        analysis = trainer._analyze_with_gemini(transcript, "", dialect)
+                    elif trainer.config["llm_provider"] == "local":
+                        analysis = trainer._analyze_with_local_llm(transcript, "", dialect)
+                else:
+                    analysis = trainer._rule_based_analysis(transcript, "", dialect)
+                return transcript, analysis
+            # New-scenario button: load_new_scenario returns five values that fill the
+            # title, description and prompt displays and reset both state holders.
+            new_scenario_btn.click(
+                fn=load_new_scenario,
+                inputs=[],
+                outputs=[scenario_title, scenario_desc, prompt_text, current_scenario, current_prompt_index],
+            )
+            # Next-prompt button: reads both state holders and writes back the prompt
+            # text together with the (possibly unchanged) prompt index.
+            next_prompt_btn.click(fn=next_prompt, inputs=[current_scenario, current_prompt_index], outputs=[prompt_text, current_prompt_index])
+            # Analyze button: the argument order here must match the parameter order of
+            # analyze_conversation_response (audio, scenario, prompt_index, dialect).
+            analyze_response_btn.click(
+                fn=analyze_conversation_response,
+                inputs=[conversation_audio, current_scenario, current_prompt_index, dialect_selector],
+                outputs=[conversation_transcript, conversation_feedback],
+            )
+            # 3. Tab Tiến độ
+            def update_statistics():
+                progress_data = trainer.track_progress()
+                stats = progress_data["statistics"]
+                charts = progress_data["charts"]
+                return stats, charts.get("length_chart", ""), charts.get("distribution_chart", "")
+            refresh_stats_btn.click(fn=update_statistics, inputs=[], outputs=[stats_output, length_chart, dist_chart])
+            # 4. Tab Xuất báo cáo
+            def export_markdown():
+                return trainer.export_session(format="markdown")
+            def export_html():
+                return trainer.export_session(format="html")
+            export_md_btn.click(fn=export_markdown, inputs=[], outputs=[export_output])
+            export_html_btn.click(fn=export_html, inputs=[], outputs=[export_output])
+            # Register a no-op callback on the page-load event (demo.load fires when the app loads, not when it closes)
+            demo.load(lambda: None, inputs=None, outputs=None)
+        return demo
+    except Exception as e:
+        logger.error(f"Lỗi khi tạo giao diện: {e}")
+        raise
 def main():
+    try:
+        # Kiểm tra và tạo thư mục dữ liệu
+        os.makedirs("data", exist_ok=True)
+        os.makedirs("data/audio", exist_ok=True)
+        os.makedirs("data/reports", exist_ok=True)
+        os.makedirs("data/models", exist_ok=True)
+        # Tạo file cấu hình mẫu nếu chưa có
+        if not os.path.exists("config.json"):
+            sample_config = {
+                "stt_model": "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
+                "use_phowhisper": False,
+                "use_phobert": False,
+                "use_vncorenlp": False,
+                "llm_provider": "none",
+                "use_viettts": False,
+                "default_dialect": "Bắc",
+                "preprocess_audio": True,
+                "save_history": True,
+            }
+            with open("config.json", "w", encoding="utf-8") as f:
+                json.dump(sample_config, f, ensure_ascii=False, indent=2)
+        # Tạo và khởi chạy ứng dụng
+        demo = create_demo()
+        demo.queue()
+        demo.launch(share=True)
+    except Exception as e:
+        logger.error(f"Lỗi khi khởi chạy ứng dụng: {e}")
+        print(f"Lỗi: {e}")
 # Start the app only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
     main()
+# Cải tiến:
+# - Đánh giá ngữ điệu: Phân tích cao độ, nhịp điệu và cảm xúc trong giọng nói
+# - Tùy chỉnh giọng TTS: Cho phép người dùng chọn giọng đọc mẫu
+# - Tạo bài tập cá nhân hóa: Dựa trên lỗi thường gặp của người dùng