"""Gradio app that adds generated title and summary cells to a Jupyter notebook."""

import functools
import re
from tokenize import tokenize

import gradio as gr
import nbformat
import nltk
import spacy
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"

# Matches list-enumeration markers such as "1." that the summarizer emits.
_ENUMERATION_RE = re.compile(r"[0-9]+\.")


class NotebookEnhancer:
    """Generate markdown titles and summaries for the code cells of a notebook."""

    def __init__(self):
        """Load the title model, the code-explainer pipeline and the spaCy model."""
        # models + tokenizer for generating titles from code summaries
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )

        # models + tokenizer for generating summaries from Python code
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        # Hand the already-loaded model to the pipeline; passing the model
        # *name* here would load the same weights a second time.
        self.python_pipeline = pipeline(
            "summarization",
            model=self.python_model,
            tokenizer=self.python_tokenizer,
        )

        # spaCy pipeline used for sentence splitting and part-of-speech tags
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise markdown title from a code summary.

        Args:
            summary: Natural-language summary of a code cell.

        Returns:
            The first sentence produced by the title model, prefixed with
            ``"# "``. The title part is empty if the model produced no text.

        Note:
            ``nltk.sent_tokenize`` is used for sentence splitting; it
            presumably needs the NLTK punkt data to be installed.
        """
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],
            max_length=1024,
            truncation=True,  # make max_length actually cap the input
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]
        sentences = nltk.sent_tokenize(decoded_output.strip())
        # An empty generation yields no sentences; avoid indexing into [].
        predicted_title = sentences[0] if sentences else ""
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Return the number of single-space-separated chunks in ``code``."""
        words = code.split(" ")
        return len(words)

    def generate_summary(self, code):
        """Generate a title and a detailed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            A ``(title, summary)`` tuple of strings; the title is prefixed
            with ``"# "``.
        """
        result = self.python_pipeline(code, min_length=5, max_length=64)
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
        """Add title and summary markdown cells before each code cell.

        Args:
            notebook: The notebook to enhance. Outputs of its code cells are
                cleared in place, since the same cell objects are reused.

        Returns:
            A new notebook that shares ``notebook.metadata`` and contains,
            for every non-empty code cell, a title cell and a summary cell
            followed by the original cell. All other cells are copied over
            unchanged.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata

        for cell in notebook.cells:
            is_code = cell.cell_type == "code"

            if is_code and cell.source.strip():
                title, summary = self.generate_summary(cell.source)
                # Markdown cells must not carry an ``outputs`` field, and
                # cell ids must be strings, so keep whatever
                # new_markdown_cell produced instead of overriding it.
                enhanced_notebook.cells.append(nbformat.v4.new_markdown_cell(title))
                enhanced_notebook.cells.append(
                    nbformat.v4.new_markdown_cell(summary)
                )

            # Only code cells have outputs to clear.
            if is_code:
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True if the tokens contain both a noun-like word and a verb.

        Args:
            words: Iterable of spaCy tokens (anything exposing ``pos_``).
        """
        tags = [word.pos_ for word in words]
        has_noun = any(tag in ("NOUN", "PROPN", "PRON") for tag in tags)
        has_verb = "VERB" in tags
        return has_noun and has_verb

    def _postprocess_summary(self, summary: str):
        """Split a raw model summary into a title and the remaining summary.

        Sentences lacking a noun or a verb are dropped, and enumeration
        markers such as ``"1."`` are removed from the sentences that remain.

        Args:
            summary: Raw summary text produced by the code-explainer model.

        Returns:
            A ``(title, summary)`` tuple: the first kept sentence, and the
            other kept sentences joined by single spaces. If no sentence is
            kept, the cleaned input is returned as the title with an empty
            summary instead of raising ``IndexError``.
        """
        doc = self.nlp(summary)
        kept = [
            _ENUMERATION_RE.sub("", sentence.text)
            for sentence in doc.sents
            if self.is_valid(sentence)
        ]

        if not kept:
            return _ENUMERATION_RE.sub("", summary).strip(), ""

        title, *rest = kept
        return title, " ".join(rest)


@functools.lru_cache(maxsize=1)
def _get_enhancer():
    """Return a shared NotebookEnhancer so the models are loaded only once."""
    return NotebookEnhancer()


def process_notebook(file_path):
    """Process an uploaded notebook file.

    Args:
        file_path: Path of the uploaded ``.ipynb`` file, or ``None`` when
            nothing has been uploaded.

    Returns:
        Path of the enhanced notebook written to disk.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file_path is None:
        raise gr.Error("Please upload a Jupyter notebook (.ipynb) file first.")

    enhancer = _get_enhancer()

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # NOTE(review): this is a fixed path in the working directory, so
    # concurrent requests overwrite each other's result.
    output_path = "enhanced_notebook.ipynb"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path


def build_gradio_interface():
    """Create the Gradio interface and return it without launching it."""
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
            Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.

            This tool uses Hugging Face models to:
            1. Generate concise titles for code cells
            2. Create explanatory summaries of what the code does
            """
        )

        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                process_btn = gr.Button("Enhance Notebook")

            with gr.Column():
                output = gr.File(label="Enhanced Notebook")

        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)

    return demo


# This will be the entry point when running the script
if __name__ == "__main__":
    demo = build_gradio_interface()
    demo.launch(share=True)