# mynotebooksummary / notebook_enhancer.py
# irmchek's picture
# use public link for Gradio
# d062c90
import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoConfig,
pipeline,
)
import re
import nltk
# Hugging Face model identifier used to generate summaries from Python code.
PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
# Hugging Face model identifier used to generate titles from code summaries.
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"
class NotebookEnhancer:
    """Insert generated title and summary markdown cells into a notebook.

    Two seq2seq models are loaded on construction: one that explains Python
    code (``PYTHON_CODE_MODEL``) and one that turns a summary into a title
    (``TITLE_SUMMARIZE_MODEL``). A spaCy pipeline is used to split generated
    text into sentences and to filter out fragments.
    """

    # Stand-alone list-enumeration markers such as "1." or "12.". The
    # look-around keeps version numbers and decimals ("3.8", "v2.") intact.
    _ENUMERATION_MARKER = re.compile(r"(?<![\w.])[0-9]+\.(?=\s|$)")

    def __init__(self):
        # Model + tokenizer for generating titles from code summaries.
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )

        # Model + tokenizer for generating summaries from Python code.
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        # Hand the already-loaded model object to the pipeline; passing the
        # model name again would load a second copy of the same weights.
        self.python_pipeline = pipeline(
            "summarization",
            model=self.python_model,
            tokenizer=self.python_tokenizer,
        )

        # spaCy language model used for sentence splitting and POS tagging.
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise markdown title (``"# ..."``) from a summary.

        Args:
            summary: Natural-language description of a code cell.

        Returns:
            A level-1 markdown heading built from the first sentence of the
            generated text; ``"# "`` if the model produced no text.
        """
        inputs = self.title_tokenizer(
            ["summarize: " + summary],
            max_length=1024,
            truncation=True,  # without this, max_length is not enforced
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]
        title_sentences = nltk.sent_tokenize(decoded_output.strip())
        predicted_title = title_sentences[0] if title_sentences else ""
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated tokens in ``code``."""
        # split() without arguments treats runs of spaces, tabs and newlines
        # as one separator, so blank padding does not inflate the count.
        return len(code.split())

    def generate_summary(self, code):
        """Generate a title and a detailed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            A ``(title, summary)`` tuple: ``title`` is a level-1 markdown
            heading made from the first usable sentence of the generated
            explanation, ``summary`` is the remaining sentences joined by
            spaces (possibly empty).
        """
        result = self.python_pipeline(code, min_length=5, max_length=64)
        raw_summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(raw_summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Add title and summary markdown cells before each code cell.

        Non-empty code cells are preceded by a generated title cell and a
        generated summary cell. All original cells are kept in order. The
        outputs of code cells are cleared; note that this mutates the cells
        of the ``notebook`` passed in, which are reused in the result.

        Args:
            notebook: The parsed notebook to enhance.

        Returns:
            A new v4 notebook sharing the input's metadata.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata

        for cell in notebook.cells:
            is_code = cell.cell_type == "code"

            if is_code and cell.source.strip():
                title, summary = self.generate_summary(cell.source)
                # Markdown cells must not carry an ``outputs`` field, and
                # cell ids must be strings, so the generated cells are left
                # exactly as nbformat builds them.
                enhanced_notebook.cells.append(nbformat.v4.new_markdown_cell(title))
                enhanced_notebook.cells.append(
                    nbformat.v4.new_markdown_cell(summary)
                )

            # Only code cells have outputs to clear.
            if is_code:
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True if ``words`` contains both a noun-like token and a verb.

        Args:
            words: Iterable of tokens exposing a ``pos_`` attribute (a spaCy
                sentence span in practice).
        """
        has_noun = False
        has_verb = False
        for word in words:
            if word.pos_ in ("NOUN", "PROPN", "PRON"):
                has_noun = True
            elif word.pos_ == "VERB":
                has_verb = True
        return has_noun and has_verb

    def _strip_enumeration(self, text: str) -> str:
        """Remove list-enumeration markers and normalise whitespace."""
        without_markers = self._ENUMERATION_MARKER.sub("", text)
        return " ".join(without_markers.split())

    def _postprocess_summary(self, summary: str):
        """Split a generated summary into a title sentence and a body.

        Sentences lacking a noun or a verb are dropped, and enumeration
        markers ("1.", "2.", ...) are removed from the ones that remain.

        Args:
            summary: Raw text produced by the summarisation pipeline.

        Returns:
            A ``(title, body)`` tuple of plain strings. When no sentence
            passes the filter, the cleaned raw summary is returned as the
            title with an empty body instead of raising ``IndexError``.
        """
        doc = self.nlp(summary)
        kept_sentences = [
            self._strip_enumeration(sentence.text)
            for sentence in doc.sents
            if self.is_valid(sentence)
        ]
        if not kept_sentences:
            return self._strip_enumeration(summary), ""

        title, *body = kept_sentences
        return title, " ".join(body)
# Lazily-created NotebookEnhancer shared by every request, so the models are
# loaded once per process instead of once per button click.
_ENHANCER = None


def _get_enhancer():
    """Return the shared NotebookEnhancer, creating it on first use."""
    global _ENHANCER
    if _ENHANCER is None:
        _ENHANCER = NotebookEnhancer()
    return _ENHANCER


def process_notebook(file_path):
    """Process an uploaded notebook file.

    Reads the notebook at ``file_path``, enhances it, and writes the result
    to ``enhanced_notebook.ipynb`` inside a freshly created temporary
    directory, so simultaneous requests never overwrite each other's output.
    The temporary directory is not removed by this function.

    Args:
        file_path: Path of the uploaded ``.ipynb`` file, or ``None`` when
            nothing was uploaded.

    Returns:
        Path of the enhanced notebook file.

    Raises:
        gr.Error: If no file was provided.
    """
    import os
    import tempfile

    if file_path is None:
        raise gr.Error("Please upload a Jupyter notebook (.ipynb) first.")

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # Save to a per-request temp directory, keeping a stable file name.
    output_dir = tempfile.mkdtemp(prefix="notebook_enhancer_")
    output_path = os.path.join(output_dir, "enhanced_notebook.ipynb")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)
    return output_path
def build_gradio_interface():
    """Create the Gradio interface and return it without launching it.

    The layout has an upload column (file input plus the "Enhance Notebook"
    button) and an output column offering the enhanced notebook. Clicking
    the button runs ``process_notebook`` on the uploaded file.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        calling ``launch()`` on it.
    """
    description = (
        "\n"
        "Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.\n"
        "This tool uses Hugging Face models to:\n"
        "1. Generate concise titles for code cells\n"
        "2. Create explanatory summaries of what the code does\n"
    )

    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(description)

        with gr.Row():
            with gr.Column():
                # Only notebooks can be processed, so restrict the upload
                # to the .ipynb extension.
                file_input = gr.File(
                    label="Upload Jupyter Notebook (.ipynb)",
                    file_types=[".ipynb"],
                )
                process_btn = gr.Button("Enhance Notebook")
            with gr.Column():
                output = gr.File(label="Enhanced Notebook")

        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)

    return demo
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = build_gradio_interface()
    # share=True asks Gradio to launch with sharing enabled.
    demo.launch(share=True)