from difflib import SequenceMatcher
import difflib

from src.application.highlight_text import generate_color
from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification:
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []

        self.image_prediction_label: list[str] = []
        self.image_prediction_score: list[float] = []
        self.image_referent_url: list[str] = []

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url: list[str] = []
        self.aligned_sentences: list[dict] = []
        self.is_paraphrased: list[bool] = []
        self.analyzed_table: list[list] = []

    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Determines the origin of the loaded news text based on paraphrasing
        detection and human-authorship analysis.

        For each sentence, appends one entry to the per-sentence result lists:
            - "HUMAN": the sentence is likely written by a human.
            - "MACHINE": the sentence is likely generated by a machine.
            - "UNKNOWN": no decision could be made.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")

        # Classify by search engine first.
        input_sentences = split_into_sentences(self.news_text)
        for sentence in input_sentences:
            paraphrase, text_url, aligned_sentence, img_urls = (
                detect_text_by_relative_search(sentence)
            )
            text_prediction_label = "UNKNOWN"

            if paraphrase is False:
                # No paraphrase source found: fall back to the AI model.
                print("\tFrom AI model:")
                text_prediction_label, text_prediction_score = (
                    detect_text_by_ai_model(sentence)
                )
                if aligned_sentence == []:
                    aligned_sentence = {
                        "input_sentence": sentence,
                        "matched_sentence": "",
                        "similarity": text_prediction_score,
                        "is_paraphrase_sentence": False,
                        "url": "",
                    }
            else:
                self.found_img_url.extend(img_urls)
                text_prediction_score = aligned_sentence["similarity"]
                if check_human(aligned_sentence):
                    text_prediction_label = "HUMAN"
                else:
                    text_prediction_label = "MACHINE"

            print(f"\ttext_prediction_label: {text_prediction_label}\n")
            self.text_prediction_label.append(text_prediction_label)
            self.aligned_sentences.append(aligned_sentence)
            self.is_paraphrased.append(paraphrase)
            self.text_referent_url.append(text_url)
            self.text_prediction_score.append(text_prediction_score)

            paraphrase = False
            text_url = ""
            aligned_sentence = {}
            img_urls = []
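    # Sketch of the per-sentence bookkeeping above (values are invented for
    # illustration; `verifier` stands for a NewsVerification instance): each
    # sentence of the combined "title\n\ncontent" text adds one entry to every
    # result list, so a two-sentence article might end up with
    #
    #   verifier.text_prediction_label -> ["HUMAN", "MACHINE"]
    #   verifier.text_referent_url     -> ["https://example.com/source", ""]
    #   verifier.is_paraphrased        -> [True, False]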
    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        print(f"\t: Img path: {self.news_image}")

        # 1. Try to match the news image against images found during the text search.
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 2. Fall back to a reverse image search.
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 3. Finally, classify the image with the AI model.
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score

        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score

        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        self.determine_text_origin()
        self.detect_image_origin()

    def analyze_details(self):
        self.analyzed_table = []

        # IMAGES:

        # TEXT
        for pair in self.aligned_sentences:
            print(f"pair: {pair}")
            if "input_sentence" not in pair:
                continue
            input_words, source_words, input_indexes, source_indexes = (
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                )
                # self.compare_sentences(
                #     pair["input_sentence"],
                #     pair["matched_sentence"],
                # )
            )
            self.analyzed_table.append(
                (input_words, source_words, input_indexes, source_indexes),
            )

        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_table

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """
        Returns:
            - the list of words in text1
            - the list of words in text2
            - the indexes of the overlapping (highlighted) words in text1
            - the indexes of the overlapping (highlighted) words in text2
        """
        # Split the strings into words on whitespace.
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find the matching runs between the two word lists.
        matcher = SequenceMatcher(None, words1, words2)

        highlighted_text1 = []
        highlighted_text2 = []

        # Track the current position in words1 and words2.
        current_pos1 = 0
        current_pos2 = 0

        # Iterate over the matching blocks.
        for match in matcher.get_matching_blocks():
            start1, start2, length = match
            print(start1, start2, length)

            # Keep the non-matching words unchanged.
            highlighted_text1.extend(words1[current_pos1:start1])
            highlighted_text2.extend(words2[current_pos2:start2])

            if length > 0:
                for i in range(start1, start1 + length):
                    index1.append(i)
                for i in range(start2, start2 + length):
                    index2.append(i)

            # Advance the current positions.
            current_pos1 = start1 + length
            current_pos2 = start2 + length

        return words1, words2, index1, index2
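    # Worked example for the word-overlap helper above (an illustrative sketch;
    # the two sentences are invented):
    #
    #   words1, words2, idx1, idx2 = NewsVerification().highlight_overlap_by_word_to_list(
    #       "the cat sat on the mat",
    #       "a cat sat on a mat",
    #   )
    #   # words1 -> ["the", "cat", "sat", "on", "the", "mat"]
    #   # words2 -> ["a", "cat", "sat", "on", "a", "mat"]
    #   # idx1 == idx2 == [1, 2, 3, 5]   ("cat sat on" and "mat" overlap)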
    def get_text_urls(self):
        return set(self.text_referent_url)

    def generate_colors_list(self, set_urls):
        # Assign a distinct color to each source URL index.
        color_dict = {}
        num_urls = len(set_urls)
        for i in range(num_urls):
            color_dict[i] = generate_color(i, num_urls)
        return color_dict

    def analyze_details_2(self):
        html_text = ""
        self.analyzed_table = []

        # TEXT
        # Assign unique colors to each index.
        set_urls = self.get_text_urls()
        color_dict = self.generate_colors_list(set_urls)

        # Position of the color in the input contents.
        position = 0
        for pair in self.aligned_sentences:
            if "input_sentence" not in pair:
                continue
            common_phrases, position = self.compare_sentences(
                pair["input_sentence"],
                pair["matched_sentence"],
                position,
                color_dict[0],  # TODO: set color
            )

        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_text, html_table

    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """
        Compares two sentences and identifies common phrases, outputting their
        start and end positions.

        Args:
            sentence_1: The first sentence (string).
            sentence_2: The second sentence (string).
            position: Offset added to the indexes into sentence_1 so that they
                refer to the concatenated input text.
            color: Highlight color attached to every common phrase.

        Returns:
            A tuple (common_phrases, position), where common_phrases is a list
            of dictionaries, one per common phrase, each containing:
                - "phrase": The common phrase (string).
                - "start_1": The starting index of the phrase in sentence_1 (int).
                - "end_1": The ending index of the phrase in sentence_1 (int).
                - "start_2": The starting index of the phrase in sentence_2 (int).
                - "end_2": The ending index of the phrase in sentence_2 (int).
                - "color": The highlight color.
            The list is empty if no common phrases are found or if either
            sentence is empty.
        """
        if not sentence_1 or not sentence_2:  # Handle empty strings.
            return [], position

        s = difflib.SequenceMatcher(None, sentence_1, sentence_2)

        common_phrases = []
        for block in s.get_matching_blocks():
            if block.size > 0:  # Ignore zero-length matches.
                start_1 = block.a
                end_1 = block.a + block.size
                start_2 = block.b
                end_2 = block.b + block.size
                # sentence_2[start_2:end_2] would give the same phrase.
                phrase = sentence_1[start_1:end_1]
                common_phrases.append({
                    "phrase": phrase,
                    "start_1": start_1 + position,
                    "end_1": end_1 + position,
                    "start_2": start_2,
                    "end_2": end_2,
                    "color": color,
                })
        position += len(sentence_1)
        return common_phrases, position
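    # Sketch of how compare_sentences threads `position` across calls (the
    # strings and color below are made-up placeholders, and `verifier` stands
    # for a NewsVerification instance):
    #
    #   phrases, position = verifier.compare_sentences("same text", "same text", 0, "#aabbcc")
    #   # phrases  -> [{"phrase": "same text", "start_1": 0, "end_1": 9,
    #   #               "start_2": 0, "end_2": 9, "color": "#aabbcc"}]
    #   # position -> 9; the next call adds this offset to its start_1/end_1 so
    #   # that the indexes refer to the concatenated input text.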
    def create_table(self):
        # table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
        # Build one formatted row per analyzed sentence pair, preceded by the image row.
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_row(max_length))
        for index, row in enumerate(self.analyzed_table):
            formatted_row = self.format_text_row(row, index, max_length)
            rows.append(formatted_row)
        table = "\n".join(rows)

        # Wrap the formatted rows in an HTML table with the report's four columns.
        return f"""
        <table border="1" style="width:100%; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source content</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """
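    # How the pieces above might be driven end to end (an illustrative sketch;
    # `title`, `content`, and `image_path` are placeholders, and rendering the
    # returned HTML is left to the caller):
    #
    #   verifier = NewsVerification()
    #   verifier.load_news(title, content, image_path)
    #   verifier.generate_analysis_report()      # fills the text/image predictions
    #   html_table = verifier.analyze_details()  # "Input news | Source content |
    #                                            #  Forensic | Originality" table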