import gradio as gr
import warnings
import requests
from bs4 import BeautifulSoup
import subprocess
import io
import ebooklib
from ebooklib import epub
from huggingface_hub import InferenceClient
from epubsplit import SplitEpub
import re
import os
import spaces


def install_calibre():
    """Install Calibre and register the EpubSplit plugin (required by the TOC/split steps below)."""
    try:
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "calibre"], check=True)
        subprocess.run(["calibre-customize", "-a", "EpubSplit.zip"], check=True)
        print("Calibre installed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error installing calibre: {e}")


install_calibre()

# Suppress specific warnings
warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")

# Constants
EPUB_PATH = 'book.epub'
OUTPUT_EPUB_PATH = 'output.epub'
OUTPUT_PDF_PATH = 'output.pdf'  # currently unused
LIBRARY_URL = os.getenv("LIBRARY_URL")
COOKIE_CONFIG = {
    'remix_userkey': os.getenv("LIBRARY_KEY"),
    'remix_userid': '14009766',
    'selectedSiteMode': 'books',
    'domainsNotWorking': os.getenv("NOT_WORKING")
}


def fetch_library_search_url():
    """Scrape LIBRARY_URL for the first https mirror link in the 'plainlist' div."""
    try:
        response = requests.get(LIBRARY_URL)
        soup = BeautifulSoup(response.content, 'html5lib')
        library_div = soup.find('div', attrs={'class': 'plainlist'})
        if library_div:
            links = library_div.find_all('a', class_='external text')
            return next((link.get('href') for link in links if link.get('href', '').startswith('https')), "")
    except Exception as e:
        print(f"Error fetching library URL: {e}")
    return ""


SEARCH_URL = fetch_library_search_url()


def fetch_book_details(isbn):
    """Search the library mirror by ISBN and download the first matching book."""
    if not SEARCH_URL:
        print("Search URL not available.")
        return
    search_endpoint = f"{SEARCH_URL}/s/{isbn}"
    try:
        response = requests.get(search_endpoint)
        soup = BeautifulSoup(response.content, 'html5lib')
        bookcards = soup.find_all('z-bookcard')
        book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
        if not book_url:
            print("No book URL found.")
            return
        download_book(book_url)
    except Exception as e:
        print(f"Error fetching book details: {e}")


def download_book(book_url):
    try:
        response = requests.get(book_url, cookies=COOKIE_CONFIG)
        soup = BeautifulSoup(response.content, 'html5lib')
        download_link = soup.find('a', class_='addDownloadedBook')
        if download_link and download_link.has_attr('href'):
            download_url = SEARCH_URL + download_link['href']
            download_and_convert_epub(download_url)
        else:
            print("Download link not found or invalid.")
    except Exception as e:
        print(f"Error downloading book: {e}")


def download_and_convert_epub(download_url):
    try:
        response = requests.get(download_url, cookies=COOKIE_CONFIG)
        if response.status_code == 200:
            with open(EPUB_PATH, 'wb') as epub_file:
                epub_file.write(response.content)
            print("EPUB downloaded successfully.")
        else:
            print(f"Failed to download EPUB. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading EPUB: {e}")
def extract_chapter_text(input_epub_path, chapter_indices):
    """Split the given section indices into OUTPUT_EPUB_PATH and return their plain text."""
    print(f"Extracting chapter text for indices: {chapter_indices}")
    try:
        with open(input_epub_path, 'rb') as epub_file:
            split_epub = SplitEpub(epub_file)
            output_io = io.BytesIO()
            split_epub.write_split_epub(output_io, chapter_indices)
        with open(OUTPUT_EPUB_PATH, 'wb') as output_file:
            output_file.write(output_io.getvalue())
        return read_text_from_epub(OUTPUT_EPUB_PATH)
    except Exception as e:
        print(f"Error extracting chapter text: {e}")
        return ""


def read_text_from_epub(output_epub_path):
    try:
        book = epub.read_epub(output_epub_path)
        text_content = []
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                soup = BeautifulSoup(item.get_body_content(), 'html.parser')
                paragraphs = soup.find_all('p')
                text_content.extend(para.get_text() for para in paragraphs)
        return '\n'.join(text_content)
    except Exception as e:
        print(f"Error reading text from EPUB: {e}")
        return ""


def generate_table_of_contents():
    """Map EpubSplit line numbers to TOC titles by parsing the plugin's debug output."""
    try:
        result = subprocess.run(
            ['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH],
            capture_output=True, text=True
        )
        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        print(result)
        return {int(line_number): title for line_number, title in pattern.findall(result.stdout)}
    except Exception as e:
        print(f"Error generating table of contents: {e}")
        return {}


def summarize_chapter(chapter_index):
    """Non-streaming variant: summarize every split section between this TOC entry and the next."""
    if chapter_index < 0:
        return "Invalid chapter selection."
    result = subprocess.run(
        ['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH],
        capture_output=True, text=True
    )
    pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
    chapters = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
    chapter_text = ""
    if chapter_index in chapters and chapters.index(chapter_index) + 1 < len(chapters):
        for i in range(chapter_index, chapters[chapters.index(chapter_index) + 1]):
            chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
            if chapter_to_summarize and len(chapter_to_summarize) > 50:
                # generate_summary is a generator yielding cumulative text;
                # drain it and keep only the final value.
                summary = ""
                for summary in generate_summary(chapter_to_summarize):
                    pass
                chapter_text += summary + "\n\n"
    if not chapter_text:
        # Fall back to the single section right after the selected line number.
        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index + 1])
        if chapter_to_summarize and len(chapter_to_summarize) > 50:
            summary = ""
            for summary in generate_summary(chapter_to_summarize):
                pass
            chapter_text += summary + "\n\n"
    return chapter_text if chapter_text else "No content found for the selected chapter."
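# A minimal sanity check for the TOC pattern used above. The sample string is an
# illustrative guess at the EpubSplit plugin's output shape, inferred from the
# regex itself rather than copied from real plugin output:
_sample_toc_output = "Line Number: 3\n\tid: chap01.xhtml\n\ttoc: ['Chapter One']\n"
assert re.findall(
    r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]',
    _sample_toc_output,
) == [('3', "'Chapter One'")]  # titles keep their quotes; the dropdown strips them later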
@spaces.GPU(duration=100)
def generate_summary(text):
    """Stream a chapter summary from the inference API, yielding the cumulative text so far."""
    try:
        client = InferenceClient(api_key=TOKEN)
        user_prompt = (
            "Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
            "Do not include introductory or concluding remarks, just focus on the main points."
            f"\n\nChapter Text:\n{text}"
        )
        system_message = {
            "role": "system",
            "content": (
                "You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
                "informative summary that highlights the most important events, themes, and character developments. "
                "Avoid filler and ensure the summary is succinct yet comprehensive."
            )
        }
        messages = [system_message, {"role": "user", "content": user_prompt}]
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.5,
            max_tokens=2048,
            top_p=0.7,
            stream=True
        )
        out = ""
        for chunk in stream:
            if chunk.choices and len(chunk.choices) > 0:
                new_content = chunk.choices[0].delta.content
                if new_content:  # the final delta can be None
                    out += new_content
                    yield out
    except Exception as e:
        print(f"Error generating summary: {e}")
        # Yield (not return) so the caller actually sees the error message.
        yield "Error generating summary."


# Model Initialization
MODEL = "meta-llama/Llama-3.3-70B-Instruct"
TOKEN = os.getenv("TOKEN")

# Gradio App
with gr.Blocks() as app:
    isbn_input = gr.Textbox(label="Enter ISBN")
    chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
    summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)

    def update_chapter_dropdown(isbn):
        fetch_book_details(isbn)
        chapters = generate_table_of_contents()
        print(chapters)
        return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])

    def stream_summarize_chapter(chapter_index):
        if chapter_index < 0:
            yield "Invalid chapter selection."
            return
        result = subprocess.run(
            ['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH],
            capture_output=True, text=True
        )
        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
        chapters = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
        if chapter_index not in chapters or chapters.index(chapter_index) + 1 >= len(chapters):
            yield "No content found for the selected chapter."
            return
        for i in range(chapter_index, chapters[chapters.index(chapter_index) + 1]):
            chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
            if chapter_to_summarize and len(chapter_to_summarize) > 50:
                # Re-yield the cumulative stream so the textbox updates live;
                # each section's summary replaces the previous one in the textbox.
                for text_chunk in generate_summary(chapter_to_summarize):
                    yield text_chunk
            else:
                yield "No significant content found for this chapter."

    isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
    chapter_dropdown.change(
        stream_summarize_chapter,
        inputs=[chapter_dropdown],
        outputs=[summary_output]
    )

app.launch()
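# Note: on Gradio 4.x the event queue is enabled by default, which is what lets
# the generator handler above stream into the textbox. If this is ever run on an
# older Gradio 3.x install, streaming needs the queue enabled explicitly, roughly:
#
#   app.queue().launch()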