Spaces:

Awell00
/

book_summarizer

Sleeping

File size: 9,575 Bytes

import gradio as gr
import warnings
import requests
from bs4 import BeautifulSoup
import subprocess
import io
import ebooklib
from ebooklib import epub
from huggingface_hub import InferenceClient
from epubsplit import SplitEpub
import re
import os
import spaces

def install_calibre():
    """Install calibre through apt and register the EpubSplit plugin.

    Runs ``apt-get update``, ``apt-get install -y calibre`` and
    ``calibre-customize -a EpubSplit.zip`` in that order, stopping at the
    first command that fails. This is a best-effort step: a failure is
    printed and swallowed rather than raised.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "calibre"],
        ["calibre-customize", "-a", "EpubSplit.zip"],
    )
    try:
        for command in commands:
            subprocess.run(command, check=True)
        print("Calibre installed successfully.")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary (for example no
        # apt-get on the host), which would otherwise abort the module import.
        print(f"Error installing calibre: {e}")

# Runs at import time: installs calibre and registers the EpubSplit plugin
# that the calibre-debug calls further down rely on.
install_calibre()

# Suppress specific warnings
# The filters match on the exact message text (presumably emitted by the
# EPUB-reading dependencies -- TODO confirm which library raises them).
warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")

# Constants
# EPUB_PATH: where the downloaded book is written and where calibre-debug reads it from.
EPUB_PATH = 'book.epub'
# OUTPUT_EPUB_PATH: scratch file holding the sections split out by extract_chapter_text.
OUTPUT_EPUB_PATH = 'output.epub'
# OUTPUT_PDF_PATH: not referenced by the code visible in this file.
OUTPUT_PDF_PATH = 'output.pdf'
# LIBRARY_URL: page that fetch_library_search_url scrapes for the search site link;
# None when the environment variable is unset.
LIBRARY_URL = os.getenv("LIBRARY_URL")
# COOKIE_CONFIG: cookies sent with the book-page and download requests.
# NOTE(review): 'remix_userid' is hard-coded while the matching key comes from
# the environment -- confirm this is intentional.
COOKIE_CONFIG = {
    'remix_userkey': os.getenv("LIBRARY_KEY"),
    'remix_userid': '14009766',
    'selectedSiteMode': 'books',
    'domainsNotWorking': os.getenv("NOT_WORKING")
}

def fetch_library_search_url():
    """Scrape ``LIBRARY_URL`` for the first https link to the search site.

    The page is expected to contain a ``<div class="plainlist">`` whose
    ``<a class="external text">`` anchors point at candidate sites.

    Returns:
        The ``href`` of the first anchor that starts with ``https``, or an
        empty string when ``LIBRARY_URL`` is unset, the request or parsing
        fails, or no such anchor exists.
    """
    if not LIBRARY_URL:
        print("Error fetching library URL: LIBRARY_URL is not configured.")
        return ""

    try:
        # This runs at import time, so bound the wait instead of hanging startup.
        response = requests.get(LIBRARY_URL, timeout=30)
        soup = BeautifulSoup(response.content, 'html5lib')
        library_div = soup.find('div', attrs={'class': 'plainlist'})
        if library_div:
            for link in library_div.find_all('a', class_='external text'):
                href = link.get('href')
                # An anchor without an href yields None; skip it so one bad
                # anchor does not abort the whole lookup.
                if href and href.startswith('https'):
                    return href
    except Exception as e:
        print(f"Error fetching library URL: {e}")
    return ""

# Resolved once at import time; an empty string means the lookup failed and
# fetch_book_details will refuse to search.
SEARCH_URL = fetch_library_search_url()

def fetch_book_details(isbn):
    """Search the library for *isbn* and download the first matching book.

    Looks up ``{SEARCH_URL}/s/<isbn>``, takes the first ``<z-bookcard>``
    element that carries an ``href`` and hands the resulting book page URL to
    ``download_book``. Problems are printed, never raised.

    Args:
        isbn: The ISBN (or other search text) typed by the user.

    Returns:
        None. The download is performed as a side effect.
    """
    from urllib.parse import quote

    if not SEARCH_URL:
        print("Search URL not available.")
        return

    query = "" if isbn is None else str(isbn).strip()
    if not query:
        # Nothing to search for; avoid hitting the bare "/s/" endpoint.
        print("No ISBN provided.")
        return

    # Escape the user-supplied text so it stays one path segment.
    search_endpoint = f"{SEARCH_URL}/s/{quote(query, safe='')}"
    try:
        response = requests.get(search_endpoint, timeout=30)
        soup = BeautifulSoup(response.content, 'html5lib')
        bookcards = soup.find_all('z-bookcard')

        book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
        if not book_url:
            print("No book URL found.")
            return

        download_book(book_url)
    except Exception as e:
        print(f"Error fetching book details: {e}")

def download_book(book_url):
    """Open a book page and start the download behind its download link.

    Fetches *book_url* with the configured cookies, looks for the
    ``<a class="addDownloadedBook">`` anchor and passes its absolute URL to
    ``download_and_convert_epub``. Failures are printed, never raised.

    Args:
        book_url: Absolute URL of the book's detail page.
    """
    try:
        page = requests.get(book_url, cookies=COOKIE_CONFIG)
        parsed = BeautifulSoup(page.content, 'html5lib')
        anchor = parsed.find('a', class_='addDownloadedBook')

        # Guard clause: bail out when the page offers no usable link.
        if not (anchor and anchor.has_attr('href')):
            print("Download link not found or invalid.")
            return

        download_and_convert_epub(SEARCH_URL + anchor['href'])
    except Exception as exc:
        print(f"Error downloading book: {exc}")

def download_and_convert_epub(download_url):
    """Download the EPUB at *download_url* and store it at ``EPUB_PATH``.

    The file is only written when the server answers with status 200; any
    other status, or any exception, is reported with ``print``.

    Args:
        download_url: Absolute URL of the EPUB download.
    """
    try:
        reply = requests.get(download_url, cookies=COOKIE_CONFIG)
        if reply.status_code != 200:
            print(f"Failed to download EPUB. Status code: {reply.status_code}")
            return

        with open(EPUB_PATH, 'wb') as target:
            target.write(reply.content)
        print("EPUB downloaded successfully.")
    except Exception as exc:
        print(f"Error downloading EPUB: {exc}")
def extract_chapter_text(input_epub_path, chapter_indices):
    """Split the given sections out of an EPUB and return their text.

    The selected sections are written to ``OUTPUT_EPUB_PATH`` as a new EPUB,
    which is then read back through ``read_text_from_epub``.

    Args:
        input_epub_path: Path of the source EPUB file.
        chapter_indices: Section line numbers handed to ``SplitEpub``.

    Returns:
        The extracted text, or an empty string if anything goes wrong.
    """
    print(f"Extracting chapter text for indices: {chapter_indices}")
    try:
        with open(input_epub_path, 'rb') as source:
            splitter = SplitEpub(source)
            buffer = io.BytesIO()
            splitter.write_split_epub(buffer, chapter_indices)
            # Persist the in-memory EPUB so it can be re-read from disk.
            with open(OUTPUT_EPUB_PATH, 'wb') as sink:
                sink.write(buffer.getvalue())

        extracted = read_text_from_epub(OUTPUT_EPUB_PATH)
        return extracted
    except Exception as exc:
        print(f"Error extracting chapter text: {exc}")
        return ""

def read_text_from_epub(output_epub_path):
    """Return the paragraph text of every document item in an EPUB.

    Args:
        output_epub_path: Path of the EPUB file to read.

    Returns:
        The text of all ``<p>`` elements joined with newlines, or an empty
        string if the file cannot be read or parsed.
    """
    try:
        lines = []
        for entry in epub.read_epub(output_epub_path).get_items():
            # Only document items carry body markup; skip everything else.
            if entry.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            markup = BeautifulSoup(entry.get_body_content(), 'html.parser')
            for paragraph in markup.find_all('p'):
                lines.append(paragraph.get_text())
        return '\n'.join(lines)
    except Exception as exc:
        print(f"Error reading text from EPUB: {exc}")
        return ""
def generate_table_of_contents():
    """Build a line-number -> title mapping from the EpubSplit listing.

    Runs the EpubSplit plugin on ``EPUB_PATH`` through ``calibre-debug`` and
    scans its standard output for sections that carry a ``toc`` entry.

    Returns:
        A dict mapping each matching section's line number (int) to the raw
        text captured between the ``toc`` brackets; empty on any error.
    """
    try:
        completed = subprocess.run(
            ['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH],
            capture_output=True,
            text=True,
        )
        toc_regex = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        print(completed)

        contents = {}
        for number, heading in toc_regex.findall(completed.stdout):
            contents[int(number)] = heading
        return contents
    except Exception as exc:
        print(f"Error generating table of contents: {exc}")
        return {}

def _collect_summary(text):
    """Return the final summary text produced by ``generate_summary``."""
    produced = generate_summary(text)
    if isinstance(produced, str):
        return produced
    # generate_summary streams cumulative text; the last item is the full summary.
    final = ""
    for partial in produced:
        final = partial
    return final


def summarize_chapter(chapter_index):
    """Summarize one chapter of the downloaded book without streaming.

    The chapter spans the EpubSplit sections from *chapter_index* up to (but
    not including) the next section that has a table-of-contents entry. For
    the last chapter it runs to the end of the section listing.

    Args:
        chapter_index: Line number of the chapter's first section, as listed
            by the EpubSplit plugin.

    Returns:
        The section summaries, each followed by a blank line, or a short
        message when the selection is invalid or no content is found.
    """
    if chapter_index is None or chapter_index < 0:
        return "Invalid chapter selection."

    try:
        result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
        listing = result.stdout or ""
    except OSError as e:
        print(f"Error listing EPUB sections: {e}")
        listing = ""

    pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
    chapter = [int(line_number) for line_number, _title in pattern.findall(listing)]
    if chapter_index not in chapter:
        return "No content found for the selected chapter."

    position = chapter.index(chapter_index)
    if position + 1 < len(chapter):
        end = chapter[position + 1]
    else:
        # Last chapter: there is no following toc entry, so run to the final
        # section reported in the listing.
        all_lines = [int(n) for n in re.findall(r'Line Number: (\d+)', listing)]
        end = max(all_lines) + 1

    summaries = []
    for i in range(chapter_index, end):
        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
        if chapter_to_summarize and len(chapter_to_summarize) > 50:
            summaries.append(_collect_summary(chapter_to_summarize))

    if not summaries:
        # Fall back to the section right after the chapter heading.
        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index+1])
        if chapter_to_summarize and len(chapter_to_summarize) > 50:
            summaries.append(_collect_summary(chapter_to_summarize))

    chapter_text = "".join(summary + "\n\n" for summary in summaries)
    return chapter_text if chapter_text else "No content found for the selected chapter."

@spaces.GPU(duration=100)
def generate_summary(text):
    """Stream a model-written summary of a chapter.

    Sends *text* to the chat-completions endpoint of ``MODEL`` and yields the
    summary as it grows, so a UI can display partial output.

    Args:
        text: Plain text of the chapter (or section) to summarize.

    Yields:
        The cumulative summary after each received chunk that has content.
        If the request fails, the single string ``"Error generating summary."``
        is yielded instead, so consumers actually see the failure.
    """
    try:
        client = InferenceClient(api_key=TOKEN)

        user_prompt = (
            "Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
            "Do not include introductory or concluding remarks, just focus on the main points."
            f"\n\nChapter Text:\n{text}"
        )

        system_message = {
            "role": "system",
            "content": (
                "You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
                "informative summary that highlights the most important events, themes, and character developments. "
                "Avoid filler and ensure the summary is succinct yet comprehensive."
            )
        }

        messages = [
            system_message,
            {"role": "user", "content": user_prompt}
        ]

        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.5,
            max_tokens=2048,
            top_p=0.7,
            stream=True
        )

        out = ""

        for chunk in stream:
            if not chunk.choices:
                continue
            new_content = chunk.choices[0].delta.content
            # Some chunks (typically the closing one) carry no text; their
            # content is None and must not be concatenated.
            if not new_content:
                continue
            out += new_content
            yield out
    except Exception as e:
        print(f"Error generating summary: {e}")
        # A `return <value>` in a generator is invisible to a for-loop
        # consumer, so the error message has to be yielded.
        yield "Error generating summary."

# Model Initialization
# Defined after generate_summary, which only looks these names up when called.
MODEL = "meta-llama/Llama-3.3-70B-Instruct"  # model id passed to chat.completions.create
TOKEN = os.getenv("TOKEN")  # API key given to InferenceClient; None when the variable is unset

# Gradio App
with gr.Blocks() as app:
    isbn_input = gr.Textbox(label="Enter ISBN")
    chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
    summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)

    def update_chapter_dropdown(isbn):
        """Download the book for *isbn* and refresh the chapter choices.

        Returns:
            A ``gr.update`` whose choices are ``(title, line_number)`` pairs
            built from ``generate_table_of_contents()``.
        """
        fetch_book_details(isbn)
        chapters = generate_table_of_contents()
        print(chapters)
        # Strip leading/trailing single quotes from each captured title.
        return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])

    def stream_summarize_chapter(chapter_index):
        """Stream the summary of the selected chapter into the output box.

        The chapter spans the EpubSplit sections from *chapter_index* up to
        (but not including) the next section with a table-of-contents entry.
        The last chapter runs to the end of the section listing.

        Args:
            chapter_index: Line number of the chapter's first section, or
                None when nothing is selected.

        Yields:
            The text to display: the summaries of sections already finished
            followed by the growing summary of the current section, or a
            short message when there is nothing to summarize.
        """
        # The dropdown holds no value until a chapter is selected.
        if chapter_index is None or chapter_index < 0:
            yield "Invalid chapter selection."
            return

        try:
            result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
            listing = result.stdout or ""
        except OSError as e:
            print(f"Error listing EPUB sections: {e}")
            listing = ""

        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        chapter = [int(line_number) for line_number, _title in pattern.findall(listing)]

        if chapter_index not in chapter:
            yield "No content found for the selected chapter."
            return

        position = chapter.index(chapter_index)
        if position + 1 < len(chapter):
            end = chapter[position + 1]
        else:
            # Last chapter: no following toc entry, so run to the final
            # section reported in the listing.
            all_lines = [int(n) for n in re.findall(r'Line Number: (\d+)', listing)]
            end = max(all_lines) + 1

        finished = ""
        for i in range(chapter_index, end):
            chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
            if not chapter_to_summarize or len(chapter_to_summarize) <= 50:
                continue
            latest = ""
            for latest in generate_summary(chapter_to_summarize):
                # Each yield replaces the textbox content, so prepend the
                # sections that are already complete.
                yield finished + latest
            if latest:
                finished += latest + "\n\n"

        if not finished:
            yield "No significant content found for this chapter."

    isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
    chapter_dropdown.change(
        stream_summarize_chapter, inputs=[chapter_dropdown], outputs=[summary_output]
    )

    app.launch()