Spaces:

irmchek
/

mynotebooksummary

Sleeping

App Files Files Community

mynotebooksummary / notebook_enhancer.py

irmchek

prototype notebook summarizer

462fea8 18 days ago

raw

history blame

6.03 kB

	import nbformat
	import spacy
	import gradio as gr
	from transformers import pipeline
	from tokenize import tokenize
	from transformers import (
	AutoModelForSeq2SeqLM,
	AutoTokenizer,
	AutoConfig,
	pipeline,
	SummarizationPipeline,
	)
	import re

	# Model identifier handed to the `from_pretrained` loaders in NotebookEnhancer
	# (config, tokenizer and seq2seq model all come from this one checkpoint).
	MODEL_NAME = "sagard21/python-code-explainer"


	class NotebookEnhancer:
	    """Annotate notebook code cells with generated markdown titles and summaries.

	    Text is generated by a Hugging Face ``summarization`` pipeline built from
	    ``MODEL_NAME``; a spaCy ``en_core_web_sm`` model splits the generated text
	    into sentences and filters them.
	    """

	    # Generation length bounds passed to every pipeline call.
	    _MIN_LENGTH = 5
	    _MAX_LENGTH = 30
	    # spaCy coarse part-of-speech tags accepted as the "noun" of a sentence.
	    _NOUN_POS = ("NOUN", "PROPN", "PRON")

	    def __init__(self):
	        """Load the model configuration, tokenizer, model, pipeline and spaCy model."""
	        self.config = AutoConfig.from_pretrained(MODEL_NAME)
	        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
	        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
	        self.model.eval()
	        # Hand the already-loaded model object to the pipeline; passing the
	        # model name instead would load a second copy of the weights.
	        self.pipeline = pipeline(
	            "summarization",
	            model=self.model,
	            config=self.config,
	            tokenizer=self.tokenizer,
	        )
	        self.nlp = spacy.load("en_core_web_sm")

	    def generate_title(self, code):
	        """Generate a one-line markdown title for *code*.

	        Args:
	            code: Text to condense. ``enhance_notebook`` passes the generated
	                summary, or the cell source when the summary is empty.

	        Returns:
	            The generated text formatted as a level-1 markdown heading.
	        """
	        # truncation=True asks the pipeline's tokenizer to clip over-long
	        # inputs instead of feeding the model more tokens than it accepts.
	        result = self.pipeline(
	            code,
	            min_length=self._MIN_LENGTH,
	            max_length=self._MAX_LENGTH,
	            truncation=True,
	        )
	        title = result[0]["summary_text"].strip()

	        print("Result title", title)
	        # NOTE: str.capitalize() also lower-cases everything after the first
	        # character, so acronyms in the generated text are flattened.
	        return f"# {title.capitalize()}"

	    def _count_num_words(self, code):
	        """Return the number of single-space-separated chunks in *code*.

	        Splitting is on a literal space only, so consecutive spaces produce
	        empty chunks that are counted as well.
	        """
	        return len(code.split(" "))

	    def generate_summary(self, code):
	        """Generate a post-processed natural-language summary for *code*.

	        Args:
	            code: Source text of a code cell.

	        Returns:
	            The filtered summary; may be an empty string when no generated
	            sentence survives ``_postprocess_summary``.
	        """
	        print("Code", code)
	        result = self.pipeline(
	            code,
	            min_length=self._MIN_LENGTH,
	            max_length=self._MAX_LENGTH,
	            truncation=True,
	        )
	        print(result)
	        summary = self._postprocess_summary(result[0]["summary_text"].strip())
	        print("Result summary", summary)
	        return summary

	    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
	        """Return a new notebook with a title and summary before each code cell.

	        Every non-empty code cell is preceded by a markdown title cell and a
	        markdown summary cell. All original cells are carried over in order.
	        Outputs of code cells are cleared; this modifies the cells of the
	        *notebook* argument in place, and its metadata object is shared with
	        the returned notebook.

	        Args:
	            notebook: The parsed notebook to enhance.

	        Returns:
	            A new v4 notebook containing the generated and original cells.
	        """
	        enhanced_notebook = nbformat.v4.new_notebook()
	        enhanced_notebook.metadata = notebook.metadata
	        print(len(notebook.cells))

	        for cell in notebook.cells:
	            is_code = cell.cell_type == "code"
	            if is_code and cell.source.strip():
	                summary = self.generate_summary(cell.source)
	                # Post-processing can discard every sentence; fall back to the
	                # code itself so the title is not generated from empty text.
	                title = self.generate_title(summary or cell.source)

	                # The markdown cells are used exactly as nbformat builds them:
	                # no integer ids and no `outputs` field, neither of which is
	                # valid for a markdown cell in the notebook schema.
	                enhanced_notebook.cells.append(nbformat.v4.new_markdown_cell(title))
	                if summary:
	                    enhanced_notebook.cells.append(
	                        nbformat.v4.new_markdown_cell(summary)
	                    )

	            # Only code cells have outputs; adding the field to markdown or raw
	            # cells would make the written notebook fail schema validation.
	            if is_code:
	                cell.outputs = []
	            enhanced_notebook.cells.append(cell)
	        return enhanced_notebook

	    def is_valid(self, words: "spacy.tokens.Span") -> bool:
	        """Return True when *words* holds both a noun-like token and a verb.

	        Args:
	            words: Iterable of tokens exposing a ``pos_`` attribute (a spaCy
	                sentence span when called from ``_postprocess_summary``).

	        Returns:
	            True if at least one token is tagged NOUN, PROPN or PRON and at
	            least one token is tagged VERB; False otherwise.
	        """
	        has_noun = False
	        has_verb = False
	        for word in words:
	            if word.pos_ in self._NOUN_POS:
	                has_noun = True
	            if word.pos_ == "VERB":
	                has_verb = True
	            if has_noun and has_verb:
	                return True
	        return False

	    def _postprocess_summary(self, summary: str) -> str:
	        """Filter the generated *summary* down to its well-formed sentences.

	        The first sentence is always dropped. Of the rest, only sentences that
	        pass ``is_valid`` are kept, which removes fragments such as trailing
	        list enumerations.

	        Returns:
	            The kept sentences joined by single spaces; empty if none remain.
	        """
	        doc = self.nlp(summary)
	        remaining = list(doc.sents)[1:]
	        return " ".join(
	            sentence.text for sentence in remaining if self.is_valid(sentence)
	        )


	def process_notebook(file_path, output_path="enhanced_notebook.ipynb", enhancer=None):
	    """Enhance the notebook stored at *file_path* and write the result to disk.

	    Args:
	        file_path: Path of the notebook file to read (opened as UTF-8 text).
	        output_path: Destination of the enhanced notebook. The default is a
	            fixed name relative to the current working directory, so each
	            call overwrites the previous result.
	        enhancer: Optional ``NotebookEnhancer`` to reuse. Building one loads
	            the models, so callers that process several notebooks can create
	            it once and pass it in. A new one is created when omitted.

	    Returns:
	        The path the enhanced notebook was written to (*output_path*).
	    """
	    # Read the input first so an unreadable or invalid file fails before the
	    # models are loaded.
	    with open(file_path, "r", encoding="utf-8") as f:
	        nb = nbformat.read(f, as_version=4)

	    if enhancer is None:
	        enhancer = NotebookEnhancer()

	    enhanced_notebook = enhancer.enhance_notebook(nb)
	    print(enhanced_notebook)
	    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write(enhanced_notebook_str)

	    return output_path


	def build_gradio_interface():
	    """Assemble the Gradio Blocks UI for the notebook enhancer.

	    The layout is a heading, an introduction, and a row with two columns:
	    an upload widget plus a button on the left, the resulting file on the
	    right. Clicking the button runs ``process_notebook`` on the upload.

	    Returns:
	        The constructed ``gr.Blocks`` app. It is not launched here; the
	        caller is expected to call ``.launch()`` on it.
	    """
	    intro_markdown = """
	Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.

	This tool uses Hugging Face models to:
	1. Generate concise titles for code cells
	2. Create explanatory summaries of what the code does
	"""

	    with gr.Blocks(title="Notebook Enhancer") as demo:
	        gr.Markdown("# Jupyter Notebook Enhancer")
	        gr.Markdown(intro_markdown)

	        with gr.Row():
	            # Left column: notebook upload and the trigger button.
	            with gr.Column():
	                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
	                enhance_button = gr.Button("Enhance Notebook")

	            # Right column: download slot for the processed notebook.
	            with gr.Column():
	                enhanced_file = gr.File(label="Enhanced Notebook")

	        enhance_button.click(
	            fn=process_notebook,
	            inputs=notebook_upload,
	            outputs=enhanced_file,
	        )

	    return demo


	# Script entry point: only runs when the file is executed directly.
	if __name__ == "__main__":
	    # Process one hard-coded local notebook file; the returned output path
	    # is kept in `test`.
	    file_input = "my_notebook.json"
	    test = process_notebook(file_input)
	    # The Gradio UI is currently disabled; to serve it, uncomment:
	    # demo = build_gradio_interface()
	    # demo.launch()