import gradio as gr
import pdfplumber
import pytesseract
import faiss
import nltk
import re
import tempfile
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from nltk.corpus import stopwords
from PIL import Image
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Download NLTK stopwords and build the stopword set
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Load AI models from Hugging Face
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
summarizer = pipeline("summarization", model="t5-small")
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
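# (the first run downloads these models from the Hugging Face Hub)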

# FAISS index for fast vector search (built here but not used by the app itself;
# see the sketch after recommend_documents for how it could serve the same lookup)
dimension = 384  # all-MiniLM-L6-v2 embedding size
index = faiss.IndexFlatL2(dimension)

# Dummy database of documents (for recommendations)
document_database = {
    "Machine Learning Basics": "Introduction to ML, Supervised vs Unsupervised, Algorithms",
    "Deep Learning Advanced": "Neural Networks, CNN, RNN, Transformers",
    "Data Science Fundamentals": "Data Preprocessing, Feature Engineering, Statistics",
    "AI in Healthcare": "Medical Image Analysis, AI in Diagnosis, Predictive Analytics",
    "Blockchain Technology": "Decentralized Networks, Smart Contracts, Cryptography"
}

# Function to recommend relevant documents
def recommend_documents(query):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    doc_embeddings = embedder.encode(list(document_database.values()), convert_to_tensor=True)
    
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings).cpu().numpy()
    top_indices = np.argsort(scores[0])[-3:][::-1]  # Top 3 recommendations
    
    recommended_docs = [list(document_database.keys())[i] for i in top_indices]
    return recommended_docs
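
# A minimal sketch, illustrative only, of serving the same top-k lookup from the
# FAISS index declared above (the app keeps using brute-force cosine similarity;
# this helper and its lazy index population are assumptions, not original code).
# Note: L2 distance over unnormalized embeddings is not identical to cosine ranking.
def recommend_documents_faiss(query, k=3):
    titles = list(document_database.keys())
    if index.ntotal == 0:  # populate the index on first use
        doc_vectors = embedder.encode(list(document_database.values()))
        index.add(np.asarray(doc_vectors, dtype="float32"))
    query_vector = np.asarray(embedder.encode([query]), dtype="float32")
    _, neighbor_ids = index.search(query_vector, k)  # distances ascending
    return [titles[i] for i in neighbor_ids[0]]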

# Preprocess text: lowercase, strip noise, drop stopwords.
# Sentence-ending punctuation is kept so the ". " split in
# document_processor can still find sentence boundaries.
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s.?!]", "", text)  # Remove special characters, keep sentence enders
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Extract text from PDF (extract_text() returns None for image-only pages)
def extract_text_from_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text += (page.extract_text() or "") + "\n"
    return preprocess_text(text)

# Extract text from image using OCR
def extract_text_from_image(image_file):
    image = Image.open(image_file)
    return preprocess_text(pytesseract.image_to_string(image))

# Convert speech to text
def voice_to_text(audio_file):
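    # sr.AudioFile accepts WAV/AIFF/FLAC; Gradio's microphone input records WAV by default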
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return "Could not understand the audio."
    except sr.RequestError:
        return "Speech recognition service unavailable."

# Convert text to speech; a unique temp file avoids two concurrent
# requests overwriting each other's audio
def text_to_speech(answer_text):
    tts = gTTS(text=answer_text, lang="en")
    out_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
    tts.save(out_path)
    return out_path

# Process document and answer questions
def document_processor(uploaded_file, query):
    filename = uploaded_file.name.lower()

    # File type handling (gr.File hands us a temp file whose .name is its path)
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(uploaded_file.name)
    elif filename.endswith((".png", ".jpg", ".jpeg")):
        text = extract_text_from_image(uploaded_file.name)
    else:
        with open(uploaded_file.name, "r", encoding="utf-8") as f:
            text = preprocess_text(f.read())

    # If the user asks for a summary
    if query.strip().lower() == "summarize":
        summary_text = summarizer(text, max_length=200, min_length=50,
                                  do_sample=False, truncation=True)[0]["summary_text"]
        return summary_text, text_to_speech(summary_text), recommend_documents(summary_text)

    # Multi-question processing: questions separated by ";", empties skipped
    queries = [q.strip() for q in query.split(";") if q.strip()]
    responses = {}

    # Encode the document's sentences once; every question scores against them
    sentences = text.split(". ")
    sentence_embeddings = embedder.encode(sentences, convert_to_tensor=True)

    for q in queries:
        query_embedding = embedder.encode(q, convert_to_tensor=True)

        # Find the most relevant sentence
        scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)
        best_sentence = sentences[int(np.argmax(scores.cpu().numpy()))]

        # Extractive QA over that sentence as context
        answer = qa_pipeline(question=q, context=best_sentence)
        responses[q] = answer["answer"]

    # Convert answer to speech
    combined_answers = " ".join(responses.values())
    speech_output = text_to_speech(combined_answers)

    return responses, speech_output, recommend_documents(query)
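
# t5-small truncates its input around 512 tokens, so "summarize" above only covers
# the head of a long document even with truncation enabled. A minimal sketch of
# chunked (map-style) summarization; this helper is an assumption and is not wired
# into the UI:
def summarize_long_text(text, chunk_words=400):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_words]) for i in range(0, len(words), chunk_words)]
    partial_summaries = [
        summarizer(chunk, max_length=80, min_length=20, do_sample=False,
                   truncation=True)[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(partial_summaries)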

# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("# πŸ“„ Smart Document Explorer πŸš€")
    
    with gr.Row():
        uploaded_file = gr.File(label="πŸ“‚ Upload Document (PDF, Image, or Text)")
    
    with gr.Row():
        query = gr.Textbox(label="πŸ’¬ Ask Questions (Separate with ';') or Type 'summarize'", placeholder="e.g. What is the topic?; Who wrote it?")
    
    with gr.Row():
        voice_input = gr.Audio(label="🎀 Speak Your Query", type="filepath")
        voice_btn = gr.Button("πŸŽ™οΈ Convert Speech to Text")

    with gr.Row():
        output_text = gr.JSON(label="🧠 AI Response")
        output_audio = gr.Audio(label="πŸ”Š AI Voice Answer", type="filepath")

    with gr.Row():
        recommendations = gr.JSON(label="πŸ“Œ Recommended Topics")

    submit_btn = gr.Button("πŸš€ Process Document")
    
    # Button Actions
    voice_btn.click(voice_to_text, inputs=voice_input, outputs=query)
    submit_btn.click(document_processor, inputs=[uploaded_file, query], outputs=[output_text, output_audio, recommendations])

app.launch()
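
# Rough dependency list for this demo (unpinned; adjust to your environment):
#   pip install gradio pdfplumber pytesseract faiss-cpu nltk numpy \
#       SpeechRecognition gTTS Pillow transformers sentence-transformers torch
# pytesseract additionally requires the Tesseract OCR binary on the system PATH.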