|
import re
|
|
import numpy as np
|
|
import faiss
|
|
import torch
|
|
from sentence_transformers import SentenceTransformer
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
|
import gradio as gr
|
|
|
|
|
|
|
|
def preprocess_text(text):
    """Normalise the whitespace of a raw script.

    Runs of consecutive newlines become a single newline, runs of two or
    more space characters become a single space, and leading/trailing
    whitespace is stripped from the result. Tabs and other whitespace
    characters are left untouched.
    """
    single_newlines = re.sub(r'\n+', '\n', text)
    single_spaces = re.sub(r'[ ]{2,}', ' ', single_newlines)
    return single_spaces.strip()
|
|
|
|
|
|
def chunk_text(text, max_chunk_size=500, overlap=100):
    """Split *text* into fixed-size character windows that overlap.

    Each chunk holds at most ``max_chunk_size`` characters and starts
    ``max_chunk_size - overlap`` characters after the previous one, so
    neighbouring chunks share ``overlap`` characters. The last chunk may be
    shorter than ``max_chunk_size``. (A negative ``overlap`` is not rejected;
    it widens the stride and leaves gaps between chunks.)

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``max_chunk_size``.

    Returns:
        A list of chunk strings in document order; empty when *text* is empty.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive, or if ``overlap``
            is not smaller than ``max_chunk_size``. Either case would make
            the stride non-positive, so the window would never advance.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if overlap >= max_chunk_size:
        raise ValueError("overlap must be smaller than max_chunk_size")

    stride = max_chunk_size - overlap
    return [
        text[start:start + max_chunk_size]
        for start in range(0, len(text), stride)
    ]
|
|
|
|
|
|
def retrieve_relevant_chunks(query, k=3, return_score=False):
    """Retrieve the script chunks closest to *query* in embedding space.

    The query is embedded with the module-level ``embedding_model`` and
    searched against the module-level FAISS ``index``; matching chunk texts
    are read from ``chunk_lookup``.

    Args:
        query: The question text to embed.
        k: Number of nearest neighbours to request from the index.
        return_score: When true, also return a similarity score for the
            best match.

    Returns:
        The retrieved chunks joined with newlines, best match first. When
        ``return_score`` is true, a ``(context, top_score)`` tuple where
        ``top_score`` is ``1 / (1 + d)`` for the closest hit's distance ``d``
        as reported by the index, or ``0.0`` when nothing was retrieved.
    """
    # FAISS expects float32 input; asarray avoids a copy when it already is.
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_embedding, k)

    # When the index holds fewer than k vectors, FAISS pads the result with
    # label -1; those slots are not real chunks and must not be looked up.
    hits = [
        (int(idx), float(dist))
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

    context = "\n".join(chunk_lookup[idx] for idx, _ in hits)
    if not return_score:
        return context

    # Results come back closest-first, so the first hit carries the top score.
    top_score = 1 / (1 + hits[0][1]) if hits else 0.0
    return context, top_score
|
|
|
|
|
|
def build_prompt(query, context):
    """Assemble the instruction prompt sent to the language model.

    The prompt states the assistant's role, embeds *context* under a
    ``Context:`` heading and *query* after ``Question:``, tells the model to
    answer only from the context (with a fixed refusal sentence otherwise),
    and ends with ``Answer:`` so the completion continues from there.
    """
    prompt_lines = [
        "You are a helpful assistant that answers questions based only on the movie script context provided below.",
        "",
        "Context:",
        f"{context}",
        "",
        f"Question: {query}",
        "",
        'Do not answer using your own knowledge. Only use the context. If unsure or if the answer is not in the context, reply: "I cannot answer that as the information is not in the script"',
        "Answer:",
    ]
    return "\n".join(prompt_lines)
|
|
|
|
|
|
|
|
# Read the script once at import time and normalise its whitespace.
with open("LOTR_script.txt", "r", encoding="utf-8") as file:
    movie_script = preprocess_text(file.read())

# Overlapping character windows are the units that get embedded and retrieved.
chunks = chunk_text(movie_script)

# Embed every chunk with the sentence-transformers model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(chunks, show_progress_bar=True)

# FAISS L2 index whose dimensionality matches the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

# Maps a row position in the index back to the chunk text it was built from.
chunk_lookup = dict(enumerate(chunks))
|
|
|
|
|
|
# Hugging Face checkpoint used for answer generation.
model_name = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally — confirm it is still required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Text-generation pipeline wrapping the model and tokenizer loaded above.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)
|
|
|
|
|
|
|
|
def answer_question(query, custom_query=None):
    """Answer a question about the script using retrieved context.

    The UI wires two input components (a dropdown and a free-text box) to
    this function, so it accepts both values. Free text, when provided,
    takes precedence over the dropdown selection.

    Args:
        query: The selected predefined question (may be ``None`` or empty).
        custom_query: Optional free-text question; overrides *query* when it
            contains non-whitespace text.

    Returns:
        A display string: the generated answer (or "I don't know." when the
        best retrieval score is below the threshold) followed by the
        similarity score, or a short prompt asking for a question when both
        inputs are blank.
    """
    question = (custom_query or "").strip() or (query or "").strip()
    if not question:
        # Nothing to embed or generate from; avoid running the models.
        return "Please select a predefined question or enter your own."

    # NOTE: the score returned here is that of the single best match, even
    # though the user-facing label below says "Avg".
    context, top_score = retrieve_relevant_chunks(question, k=3, return_score=True)

    threshold = 0.4
    if top_score < threshold:
        return f"I don't know.\n\n📊 Avg Similarity Score: {round(top_score, 2)} (Below threshold)"

    # Cap the context length so the prompt stays bounded.
    context_str = context[:1500]
    prompt = build_prompt(question, context_str)
    response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)[0]["generated_text"]

    if response.startswith(prompt):
        # The pipeline normally echoes the prompt; dropping that exact prefix
        # is safer than splitting on "Answer:", which may also occur inside
        # the script context or the completion itself.
        answer = response[len(prompt):].strip()
    elif "Answer:" in response:
        answer = response.split("Answer:")[-1].strip()
    else:
        answer = response.strip()

    return f"{answer}\n\n📊 Avg Similarity Score: {round(top_score, 2)}"
|
|
|
|
|
|
|
|
# Canned questions offered in the dropdown.
predefined_questions = [
    "What is the main goal of the Fellowship?",
    "What is the relationship between Gandalf and Saruman?",
    "How do the hobbits react when they first see the world outside the Shire?",
    "What does the city of Isengard represent in Saruman’s betrayal?",
]

# Gradio UI: a dropdown of canned questions plus a free-text box, with a
# plain-text output.
# NOTE(review): two input components are declared, so the wrapped function is
# presumably called with two arguments — confirm its signature accepts both.
interface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Dropdown(choices=predefined_questions, label="Select a predefined question"),
        gr.Textbox(lines=2, placeholder="Or enter your own question..."),
    ],
    outputs="text",
    title="🧝 LOTR Sage (Movie Q&A Bot)",
    description="Ask questions about The Lord of the Rings (Fellowship of the Ring) movie script. Powered by FAISS + Falcon-7B.",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|
|
|