import string
import warnings
from difflib import SequenceMatcher

import nltk
import numpy as np
import torch
from sentence_transformers import (
    SentenceTransformer,
    util,
)

from src.application.text.helper import extract_equal_text
from src.application.text.preprocessing import split_into_paragraphs
from src.application.text.search import (
    generate_search_phrases,
    search_by_google,
)
from src.application.url_reader import URLReader

# Silence FutureWarning messages for the whole process.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Download the NLTK data files at import time (quiet mode, no console output).
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)

# Load the sentence-embedding model once at import time and place it on the
# GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHASE_MODEL.to(DEVICE)

# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed.
BATCH_SIZE = 8

# Cosine similarity an input passage must exceed to count as a paraphrase of
# a page paragraph.
PARAPHRASE_THRESHOLD = 0.8
# NOTE(review): not referenced anywhere in the visible code.
PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
# Word-count minimums for check_sentence(): exact-match length and
# shared-phrase length - presumably supplied by its callers.
MIN_SAME_SENTENCE_LEN = 6
MIN_PHRASE_SENTENCE_LEN = 10
# NOTE(review): not referenced anywhere in the visible code.
MIN_RATIO_PARAPHRASE_NUM = 0.7
# Pages whose title + text exceed this many characters are skipped.
MAX_CHAR_SIZE = 30000


def detect_text_by_relative_search(
    input_text,
    index,
    is_support_opposite=False,
):
    """
    Searches the web for the source of ``input_text[index]`` and, once a
        page matches, keeps consuming the following entries of
        ``input_text`` for as long as they match the same page.

    Only the first three results of each search are considered, and only
    URLs containing "bbc.com". The function returns as soon as one page has
    been extracted and compared, whatever the verdict.

    Args:
        input_text: Indexable collection of text passages.
        index: Position of the passage to start from.
        is_support_opposite: Currently unused.

    Returns:
        A tuple ``(paraphrase, url, aligned_sentences, images, index)``.
        When the first passage matches, ``aligned_sentences`` accumulates
        the following matching passages (joined with "<br>") and ``index``
        is the position of the first passage that did not match (or
        ``len(input_text)``). When no usable page is found the result is
        ``(False, None, [], [], index)``.
    """
    seen_urls = set()

    for phrase in generate_search_phrases(input_text[index]):
        response = search_by_google(phrase)
        links = [entry["link"] for entry in response.get("items", [])]

        for url in links[:3]:
            # Skip pages already visited and anything outside bbc.com.
            if url in seen_urls or "bbc.com" not in url:
                continue

            seen_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            page = URLReader(url)
            if page.is_extracted is not True:
                continue

            if page.title is None or page.text is None:
                print("\t\t\t↑↑↑ Title or text not found")
                continue

            page_text = page.title + "\n" + page.text
            if len(page_text) > MAX_CHAR_SIZE:
                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                continue
            print(f"\t\t\t↑↑↑ Title: {page.title}")

            paraphrase, merged = check_paraphrase(
                input_text[index],
                page_text,
                url,
            )
            if paraphrase is False:
                return paraphrase, url, merged, page.images, index

            # Extend the match over the passages that follow, stopping at
            # the first one that is not found on this page.
            while True:
                index += 1
                print(f"----search {index} < {len(input_text)}----")
                if index >= len(input_text):
                    print(f"input_text_last: {input_text[-1]}")
                    break

                passage = input_text[index]
                print(f"input_text: {passage}")
                follows, extra = check_paraphrase(passage, page_text, url)
                print(f"sub_paraphrase: {follows}")
                print(f"sub_sentences: {extra}")
                if follows is not True:
                    break

                merged["input_sentence"] += "<br>" + extra["input_sentence"]
                merged["matched_sentence"] += (
                    "<br>" + extra["matched_sentence"]
                )
                # NOTE(review): pairwise running average - later passages
                # weigh more than earlier ones; not an arithmetic mean.
                merged["similarity"] = (
                    merged["similarity"] + extra["similarity"]
                ) / 2

            print(f"paraphrase: {paraphrase}")
            print(f"aligned_first_sentences: {merged}")
            return paraphrase, url, merged, page.images, index

    return False, None, [], [], index


def _store_alignment(sentences_df, row_index, aligned_sentence):
    """Copy the reportable fields of one alignment into a row of the frame."""
    for column in (
        "input_sentence",
        "matched_sentence",
        "label",
        "similarity",
        "url",
    ):
        sentences_df.loc[row_index, column] = aligned_sentence[column]


def _has_source_url(value):
    """Return True only for a non-empty string (rejects None, NaN and "")."""
    return isinstance(value, str) and value != ""


def find_text_source(text, text_index, sentences_df):
    """
    Searches the web for the source of ``text[text_index]`` and records the
        result in ``sentences_df``; when a page matches, the other passages
        of ``text`` are also checked against that page.

    Only the first three results of each search are considered, and only
    URLs containing "bbc.com".

    Args:
        text: Indexable collection of text passages.
        text_index: Position of the passage to look up. It is also used as
            the row label in ``sentences_df``.
        sentences_df: pandas DataFrame, updated in place, with one row per
            passage and the columns "input_sentence", "matched_sentence",
            "label", "similarity" and "url".

    Returns:
        One of three tuples.
        NOTE(review): the shapes are inconsistent and are kept as found;
        confirm the intended contract with the callers.
        - ``(paraphrase, sentences_df)`` when a page was compared but the
          passage is not a paraphrase of it.
        - ``(sentences_df, images)`` when the passage matched a page.
        - ``(sentence, [])`` when no usable page was found, where
          ``sentence`` is a placeholder dict for the passage.
    """
    sentence = {
        "input_sentence": text[text_index],
        "matched_sentence": "",
        "label": "",
        "similarity": None,
        "paraphrase": None,
        "url": "",
        "group": None,
    }
    checked_urls = set()
    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        search_results = search_by_google(candidate)
        urls = [item["link"] for item in search_results.get("items", [])]

        for url in urls[:3]:
            # Skip pages already visited and anything outside bbc.com.
            if url in checked_urls or "bbc.com" not in url:
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)
            if content.is_extracted is not True:
                continue

            if content.title is None or content.text is None:
                print("\t\t\t↑↑↑ Title or text not found")
                continue

            page_text = content.title + "\n" + content.text
            if len(page_text) > MAX_CHAR_SIZE:
                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                continue
            print(f"\t\t\t↑↑↑ Title: {content.title}")

            # Compare the single passage, not the whole collection:
            # check_paraphrase only accepts strings.
            paraphrase, aligned_sentence = check_paraphrase(
                text[text_index],
                page_text,
                url,
            )
            if not aligned_sentence:
                # No alignment could be produced for this page (e.g. it has
                # no paragraphs); try the next URL.
                continue

            _store_alignment(sentences_df, text_index, aligned_sentence)

            # Truthiness instead of ``is False``: the flag may be a NumPy
            # boolean, for which identity comparison never succeeds.
            if not aligned_sentence["paraphrase"]:
                return paraphrase, sentences_df

            # The page is a source for this passage; see whether it also
            # covers the passages that have no source yet.
            for other_index, other_text in enumerate(text):
                if other_index == text_index:
                    continue
                if other_index in sentences_df.index and _has_source_url(
                    sentences_df.loc[other_index, "url"],
                ):
                    continue

                is_match, other_alignment = check_paraphrase(
                    other_text,
                    page_text,
                    url,
                )
                if not is_match or not other_alignment:
                    continue

                _store_alignment(sentences_df, other_index, other_alignment)

            return sentences_df, content.images

    return sentence, []


def longest_common_subsequence(arr1, arr2):
    """
    Measures the longest run of consecutive elements shared by two lists.

    Despite the name, only contiguous runs count (i.e. the longest common
    sub-array), not arbitrary subsequences.

    Args:
        arr1: The first list.
        arr2: The second list.

    Returns:
        The length of the longest shared contiguous run.
        Returns 0 if either argument is not a list or is empty.
    """
    if not (isinstance(arr1, list) and isinstance(arr2, list)):
        return 0
    if not arr1 or not arr2:
        return 0

    best = 0
    # previous_row[k] = length of the shared run ending at the previous
    # element of arr1 and the k-th element of arr2 (1-based, slot 0 is pad).
    previous_row = [0] * (len(arr2) + 1)

    for left in arr1:
        current_row = [0]
        for column, right in enumerate(arr2, start=1):
            # A mismatch resets the run because elements must be adjacent.
            run = previous_row[column - 1] + 1 if left == right else 0
            current_row.append(run)
            if run > best:
                best = run
        previous_row = current_row

    return best


def check_sentence(
    input_sentence,
    source_sentence,
    min_same_sentence_len,
    min_phrase_sentence_len,
    verbose=False,
):
    """
    Decides whether two sentences share enough text to be considered similar.

    Two tests are applied to the whitespace-stripped sentences, in order:
    an exact match that is long enough, then a long enough run of
    consecutive identical words.

    Args:
        input_sentence: The input sentence.
        source_sentence: The source sentence.
        min_same_sentence_len: Minimum word count for an exact match to
            be accepted.
        min_phrase_sentence_len: Minimum number of consecutive shared
            words to be accepted.
        verbose: If True, print debug information.

    Returns:
        True if either test succeeds, False otherwise.
        Returns False if an argument is not a string or is blank.
    """
    both_are_text = isinstance(input_sentence, str) and isinstance(
        source_sentence,
        str,
    )
    if not both_are_text:
        return False

    candidate = input_sentence.strip()
    reference = source_sentence.strip()
    if candidate == "" or reference == "":
        return False

    candidate_words = candidate.split()
    reference_words = reference.split()

    # Test 1: identical sentences, provided they are not too short.
    is_identical = candidate == reference
    if is_identical and len(candidate_words) >= min_same_sentence_len:
        if verbose:
            print("Exact match found.")
        return True

    # Test 2: longest run of consecutive words found in both sentences.
    shared_run = longest_common_subsequence(candidate_words, reference_words)
    if verbose:
        print(f"Max overlap length: {shared_run}")

    return True if shared_run >= min_phrase_sentence_len else False


def check_paraphrase(input_text, page_text, url):
    """
    Checks if the input text is paraphrased in the content at the given URL.

    The input is treated as one passage and compared with every paragraph
    of the page. Two signals decide the verdict:

    1. The cosine similarity between the passage embedding and its
       best-matching paragraph is strictly greater than
       ``PARAPHRASE_THRESHOLD``.
    2. The total length of the equal spans reported by
       ``extract_equal_text`` for the passage and that paragraph is more
       than half of the passage's word count (punctuation removed).

    ``check_sentence`` is not consulted; the verdict depends only on the
    two signals above.

    Args:
        input_text: The text to check for paraphrase.
        page_text: The text of the web page to compare with.
        url: Address of the page; copied into the alignment on a match.

    Returns:
        A tuple ``(is_paraphrase, alignment)``:
        - is_paraphrase: True when either signal fires, otherwise False.
        - alignment: dict with the keys "input_sentence",
          "matched_sentence", "similarity", "label", "paraphrase" and
          "url". "matched_sentence" and "url" are empty strings and
          "paraphrase" is False when nothing matched. An empty list is
          returned instead when an argument is not a string or the page
          yields no paragraphs.
    """
    if not isinstance(input_text, str) or not isinstance(page_text, str):
        return False, []
    if not page_text:
        return False, []

    page_sentences = split_into_paragraphs(page_text)
    if not page_sentences:
        return False, []

    # Also compare against a copy of each paragraph with the ", external"
    # marker removed (presumably link-annotation text - TODO confirm).
    # The copies are collected first so the list is not extended while it
    # is being iterated.
    cleaned_copies = [
        sentence.replace(", external", "")
        for sentence in page_sentences
        if ", external" in sentence
    ]
    page_sentences.extend(cleaned_copies)

    # Encode the passage and the page paragraphs into embeddings.
    input_embedding = PARAPHASE_MODEL.encode(
        [input_text],
        convert_to_tensor=True,
        device=DEVICE,
    )
    page_embeddings = PARAPHASE_MODEL.encode(
        page_sentences,
        convert_to_tensor=True,
        device=DEVICE,
    )

    # One row of similarities: the passage against every paragraph.
    similarities = (
        util.cos_sim(input_embedding, page_embeddings).cpu().numpy()[0]
    )
    best_index = int(np.argmax(similarities))
    max_similarity = similarities[best_index]
    best_matched_sentence = page_sentences[best_index]

    # Cast to a plain bool: NumPy comparisons return numpy.bool_, for which
    # identity checks such as ``flag is False`` are never true.
    is_paraphrase_sentence = bool(max_similarity > PARAPHRASE_THRESHOLD)

    alignment = {
        "input_sentence": input_text,
        "matched_sentence": (
            best_matched_sentence if is_paraphrase_sentence else ""
        ),
        "similarity": max_similarity,
        "label": "",
        "paraphrase": is_paraphrase_sentence,
        "url": url if is_paraphrase_sentence else "",
    }
    is_paraphrase_text = is_paraphrase_sentence

    # Method 2: overlap between the passage and its best-matching paragraph.
    # NOTE(review): assumes the "start"/"end" values are word positions,
    # since the total is compared with a word count - confirm against
    # extract_equal_text.
    equal_spans, _ = extract_equal_text(input_text, best_matched_sentence)
    matched_count = sum(span["end"] - span["start"] for span in equal_spans)
    words = input_text.translate(
        str.maketrans("", "", string.punctuation),
    ).split()
    if matched_count > len(words) / 2:
        is_paraphrase_text = True
        # Keep the alignment consistent with a positive verdict so callers
        # still receive the matched paragraph and its source.
        alignment["matched_sentence"] = best_matched_sentence
        alignment["paraphrase"] = True
        alignment["url"] = url

    return is_paraphrase_text, alignment


def similarity_ratio(a, b):
    """
    Scores how alike two strings are with difflib's SequenceMatcher.

    Args:
        a: The first string.
        b: The second string.

    Returns:
        A float between 0.0 and 1.0 (1.0 means identical).
        Returns 0.0 if either argument is not a string, which includes None.
    """
    # isinstance() already rejects None, so no separate None test is needed.
    if isinstance(a, str) and isinstance(b, str):
        return SequenceMatcher(None, a, b).ratio()
    return 0.0


def check_human(alligned_sentences):
    """
    Reports whether an alignment is a near-exact match.

    Args:
        alligned_sentences: Alignment mapping with a "similarity" entry,
            or an empty/None value when there is no alignment.

    Returns:
        bool: True if the alignment is non-empty and its "similarity" is
            at least 0.99, False otherwise.
    """
    has_alignment = bool(alligned_sentences)
    return has_alignment and bool(alligned_sentences["similarity"] >= 0.99)


# Running this module directly does nothing; the guard is a placeholder.
if __name__ == "__main__":
    pass