import os
import io
import re
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from heapq import nlargest
from collections import defaultdict

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Set NLTK data path to a directory included in the project
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

# Ensure NLTK data is available (pre-downloaded)
try:
    stopwords.words('english')  # Test if stopwords are accessible
except LookupError:
    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
    # Fallback will be used if this fails

# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route("/", methods=["GET"])
def index():
    return "Document Summarizer API is running! Use /summarize endpoint for POST requests."


@app.route("/summarize", methods=["POST"])
def summarize():
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Unsupported file format"}), 400

    filename = secure_filename(file.filename)
    file_content = file.read()

    # Process file based on type
    text = None
    file_ext = filename.rsplit(".", 1)[1].lower()

    try:
        if file_ext == "pdf":
            text = extract_text_from_pdf(file_content)
        elif file_ext == "docx":
            text = extract_text_from_docx(file_content)
        elif file_ext == "pptx":
            text = extract_text_from_pptx(file_content)
        elif file_ext == "txt":
            text = extract_text_from_txt(file_content)

        # Generate a summary of the text
        try:
            summary = generate_summary(text)
        except LookupError as e:
            print(f"NLTK summarization failed: {e}. Using fallback.")
            summary = simple_summarize(text)
        except Exception as e:
            print(f"Summarization error: {e}")
            summary = text[:1000] + "..." if len(text) > 1000 else text

        # Include metadata
        word_count = len(text.split())

        return jsonify({
            "filename": filename,
            "summary": summary,
            "original_word_count": word_count,
            "summary_word_count": len(summary.split()) if summary else 0
        })
    except Exception as e:
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500


# Text extraction functions
def extract_text_from_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n\n"
    return clean_text(text)


def extract_text_from_docx(file_content):
    doc = Document(io.BytesIO(file_content))
    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    return clean_text(text)


def extract_text_from_pptx(file_content):
    ppt = Presentation(io.BytesIO(file_content))
    text = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text.append(shape.text)
    return clean_text("\n".join(text))


def extract_text_from_txt(file_content):
    text = file_content.decode("utf-8", errors="ignore")
    return clean_text(text)


def clean_text(text):
    # Collapse whitespace and drop characters other than word characters and basic punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
    return text.strip()


def generate_summary(text, sentence_count=5):
    # Frequency-based extractive summary: score sentences by normalized word frequency
    if len(text.split()) < 100:
        return text

    sentences = sent_tokenize(text)
    if len(sentences) <= sentence_count:
        return text

    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
    stop_words = set(stopwords.words('english'))

    # Count how often each non-stopword appears across the document
    word_frequencies = defaultdict(int)
    for sentence in clean_sentences:
        for word in word_tokenize(sentence):
            if word not in stop_words:
                word_frequencies[word] += 1

    # Normalize frequencies to [0, 1]
    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    # Score each sentence as the sum of its words' normalized frequencies
    sentence_scores = defaultdict(float)
    for i, sentence in enumerate(clean_sentences):
        for word in word_tokenize(sentence):
            if word in word_frequencies:
                sentence_scores[i] += word_frequencies[word]

    # Keep the top-scoring sentences, restored to their original order
    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    top_indices.sort()

    return ' '.join([sentences[i] for i in top_indices])


def simple_summarize(text, max_chars=1000):
    # Fallback used when NLTK data is unavailable: take leading paragraphs and sentences up to max_chars
    paragraphs = text.split('\n\n')
    base_summary = ' '.join(paragraphs[:3])

    if len(text) <= max_chars:
        return text

    if len(base_summary) < max_chars:
        remaining_text = ' '.join(paragraphs[3:])
        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
        for sentence in sentences:
            if len(base_summary) + len(sentence) + 1 <= max_chars:
                base_summary += ' ' + sentence
            else:
                break

    if len(base_summary) > max_chars:
        base_summary = base_summary[:max_chars] + "..."

    return base_summary


if __name__ == "__main__":
    # For local testing only
    app.run(host="0.0.0.0", port=7860)
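
# --- Illustrative client example (a sketch, not part of the service) ---
# Assuming the server above is running locally on port 7860, a supported
# document can be submitted to /summarize as a multipart upload, for example
# with the `requests` library. The file name "report.pdf" is only a placeholder.
#
#     import requests
#
#     with open("report.pdf", "rb") as f:
#         resp = requests.post("http://localhost:7860/summarize", files={"file": f})
#     print(resp.json()["summary"])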