# mynotebooksummary / notebook_enhancer.py
# Author: irmchek
# Commit 462fea8: prototype notebook summarizer
# File size: 6.03 kB
import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoConfig,
pipeline,
SummarizationPipeline,
)
import re
# Hugging Face Hub identifier of the checkpoint that NotebookEnhancer loads
# (config, tokenizer and seq2seq model all come from this name).
MODEL_NAME = "sagard21/python-code-explainer"
class NotebookEnhancer:
    """Insert generated title and summary markdown cells before code cells.

    The text is produced by a Hugging Face ``summarization`` pipeline built
    from ``MODEL_NAME``; a spaCy English model is used to filter the
    generated sentences.
    """

    # Coarse part-of-speech tags that count as a "noun" in ``is_valid``.
    _NOUN_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the config, tokenizer, model, pipeline and spaCy model."""
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Reuse the model instance loaded above. Passing MODEL_NAME here would
        # make the pipeline load a second copy of the same weights.
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")

    def _run_pipeline(self, text):
        """Run the summarization pipeline on ``text`` and return its raw result."""
        # truncation=True asks the tokenizer to clip inputs that exceed the
        # model's maximum length instead of feeding them through unchanged.
        return self.pipeline(text, min_length=5, max_length=30, truncation=True)

    def generate_title(self, code):
        """Generate a concise markdown title for ``code``.

        Args:
            code: Text to summarize (source code or an existing summary).

        Returns:
            A level-1 markdown heading, e.g. ``"# Loads a csv file"``.
        """
        title = self._run_pipeline(code)[0]["summary_text"].strip()
        print("Result title", title)
        # Format as a markdown title
        return f"# {title.capitalize()}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated words in ``code``."""
        # split() with no argument collapses runs of whitespace and yields an
        # empty list for an empty string, unlike split(" ").
        return len(code.split())

    def generate_summary(self, code):
        """Generate a post-processed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            The filtered summary text; may be an empty string when no
            generated sentence survives ``_postprocess_summary``.
        """
        print("Code", code)
        result = self._run_pipeline(code)
        print(result)
        summary = self._postprocess_summary(result[0]["summary_text"].strip())
        print("Result summary", summary)
        return f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Add title and summary markdown cells before each code cell.

        Args:
            notebook: The notebook to enhance. Outputs of its code cells are
                cleared in place.

        Returns:
            A new v4 notebook that shares ``notebook.metadata`` and contains,
            for every non-empty code cell, a title cell, a summary cell and
            the original cell; every other cell is carried over as is.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        print(len(notebook.cells))

        for cell in notebook.cells:
            if cell.cell_type == "code":
                if cell.source.strip():
                    summary = self.generate_summary(cell.source)
                    # The title is derived from the summary; fall back to the
                    # code itself when post-processing left nothing to title.
                    title = self.generate_title(summary or cell.source)
                    # Cell ids are left to nbformat: the v4.5 schema requires
                    # string ids, and markdown cells may not carry "outputs".
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(summary)
                    )
                # Only code cells have an "outputs" field to clear.
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True when ``words`` contains both a noun-like token and a verb.

        Args:
            words: Iterable of tokens exposing a ``pos_`` attribute (a spaCy
                sentence span when called from ``_postprocess_summary``).
        """
        tags = {word.pos_ for word in words}
        return bool(tags & self._NOUN_TAGS) and "VERB" in tags

    def _postprocess_summary(self, summary: str):
        """Drop the first sentence of ``summary`` and keep only valid ones.

        The text is split into sentences with spaCy; the first sentence is
        always discarded, and each remaining sentence is kept only if
        ``is_valid`` accepts it. The kept sentences are joined with spaces.
        """
        doc = self.nlp(summary)
        # ignore the first sentence
        remaining = list(doc.sents)[1:]
        return " ".join(
            sentence.text for sentence in remaining if self.is_valid(sentence)
        )
# Shared NotebookEnhancer, created on first use so the models are loaded once
# per process instead of once per processed notebook.
_ENHANCER = None


def _get_enhancer():
    """Return the shared NotebookEnhancer, constructing it on first use."""
    global _ENHANCER
    if _ENHANCER is None:
        _ENHANCER = NotebookEnhancer()
    return _ENHANCER


def process_notebook(file_path, output_path="enhanced_notebook.ipynb"):
    """Enhance the notebook at ``file_path`` and write the result to disk.

    Args:
        file_path: Path of the notebook file to read (parsed as nbformat v4).
        output_path: Where the enhanced notebook is written. Defaults to
            ``enhanced_notebook.ipynb`` in the current working directory.

    Returns:
        ``output_path``, the path of the written notebook.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    print(enhanced_notebook)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # Save the serialized notebook to the output file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)
    return output_path
def build_gradio_interface():
    """Build the Gradio Blocks UI for enhancing an uploaded notebook.

    The layout is a heading, an introductory blurb, and one row with two
    columns: an upload field plus button on one side and the resulting file
    on the other. Clicking the button calls ``process_notebook``.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    intro_text = (
        "\n"
        "Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.\n"
        "This tool uses Hugging Face models to:\n"
        "1. Generate concise titles for code cells\n"
        "2. Create explanatory summaries of what the code does\n"
    )

    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(intro_text)

        with gr.Row():
            with gr.Column():
                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                enhance_button = gr.Button("Enhance Notebook")
            with gr.Column():
                enhanced_download = gr.File(label="Enhanced Notebook")

        # Wire the button: uploaded file in, enhanced notebook file out.
        enhance_button.click(
            fn=process_notebook,
            inputs=notebook_upload,
            outputs=enhanced_download,
        )

    return demo
# Script entry point: enhance the notebook named on the command line, or
# "my_notebook.json" when no argument is given.
if __name__ == "__main__":
    import sys

    notebook_path = sys.argv[1] if len(sys.argv) > 1 else "my_notebook.json"
    process_notebook(notebook_path)
    # To serve the web UI instead of processing a single file:
    # demo = build_gradio_interface()
    # demo.launch()