# Hugging Face Spaces export artifact (page status text, not code):
# "Spaces: Sleeping / Sleeping"
import re
import tempfile
from tokenize import tokenize  # NOTE(review): appears unused — confirm before removing

import gradio as gr
import nbformat
import nltk
import spacy
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
PYTHON_CODE_MODEL = "sagard21/python-code-explainer" | |
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation" | |
class NotebookEnhancer:
    """Insert generated markdown title + summary cells before each code cell
    of a Jupyter notebook, using two Hugging Face seq2seq models."""

    def __init__(self):
        # Model + tokenizer that turn a prose summary into a short title.
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )
        # Model + tokenizer that summarize raw Python code.
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        self.python_pipeline = pipeline(
            "summarization",
            model=PYTHON_CODE_MODEL,
            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
            tokenizer=self.python_tokenizer,
        )
        # spaCy pipeline used for sentence splitting and POS filtering.
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str) -> str:
        """Generate a concise markdown title ("# ...") from *summary*."""
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],  # batch of one
            max_length=1024,
            return_tensors="pt",
            padding=True,
        )
        # NOTE(review): min_length == max_length pins titles to ~10 tokens, and
        # do_sample=True combined with beam search makes output
        # non-deterministic — confirm both are intended.
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]
        # Keep only the first sentence of whatever the model produced.
        predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Rough whitespace-delimited word count (currently unused helper)."""
        words = code.split(" ")
        return len(words)

    def generate_summary(self, code):
        """Return (markdown title, summary text) for a code cell's source."""
        result = self.python_pipeline(code, min_length=5, max_length=64)
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Return a new notebook with a title cell and a summary cell inserted
        before every non-empty code cell; code-cell outputs are cleared.

        The annotation is quoted so the class can be defined without nbformat
        having been imported yet (lazy evaluation).
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        # Start fresh ids above the original cell count so they cannot collide
        # with ids already present.  (Renamed from `id`, which shadowed the
        # builtin; nbformat requires string cell ids, not ints.)
        next_id = len(notebook.cells) + 1
        for cell in notebook.cells:
            if cell.cell_type == "code" and cell.source.strip():
                title, summary = self.generate_summary(cell.source)
                title_cell = nbformat.v4.new_markdown_cell(title)
                title_cell.id = str(next_id)
                next_id += 1
                summary_cell = nbformat.v4.new_markdown_cell(summary)
                summary_cell.id = str(next_id)
                next_id += 1
                enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)
            # Clear stale outputs on code cells only: markdown cells have no
            # "outputs" field in the nbformat schema (the original set one,
            # which produces an invalid notebook).
            if cell.cell_type == "code":
                cell.outputs = []
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words):
        """Return True if *words* (an iterable of spaCy tokens) contains at
        least one noun-like token (NOUN/PROPN/PRON) and at least one VERB."""
        has_noun = any(w.pos_ in ("NOUN", "PROPN", "PRON") for w in words)
        has_verb = any(w.pos_ == "VERB" for w in words)
        return has_noun and has_verb

    def _postprocess_summary(self, summary: str):
        """Split *summary* into sentences, keep only noun+verb sentences,
        strip list enumerations ("1.", "2.", ...), and return
        (title, remaining summary joined by spaces)."""
        doc = self.nlp(summary)
        postprocessed_sentences = []
        for sentence in doc.sents:
            if self.is_valid(sentence):
                # Raw string for the regex (the original "[0-9]+\." relied on
                # an invalid escape sequence).
                cleaned = re.sub(r"[0-9]+\.", "", sentence.text)
                postprocessed_sentences.append(cleaned)
        if not postprocessed_sentences:
            # Nothing survived filtering — fall back to the raw summary as the
            # title instead of raising IndexError.
            return summary, ""
        return postprocessed_sentences[0], " ".join(postprocessed_sentences[1:])
def process_notebook(file_path):
    """Enhance an uploaded notebook and write the result to disk.

    Args:
        file_path: Path to a .ipynb file readable as nbformat v4.

    Returns:
        Path to the enhanced notebook file (handed back to Gradio's File
        output component).
    """
    enhancer = NotebookEnhancer()
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Write to a unique temp file so concurrent Gradio sessions don't clobber
    # each other (the previous fixed name "enhanced_notebook.ipynb" raced).
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix="enhanced_notebook_",
        suffix=".ipynb",
        delete=False,
    ) as f:
        f.write(enhanced_notebook_str)
        output_path = f.name
    return output_path
def build_gradio_interface():
    """Create the Gradio Blocks UI for the notebook enhancer.

    Returns:
        The (unlaunched) gr.Blocks app; the caller is responsible for
        calling .launch().
    """
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
    Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
    This tool uses Hugging Face models to:
    1. Generate concise titles for code cells
    2. Create explanatory summaries of what the code does
    """
        )
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                # (Removed stray debug `print(file_input)` left in the UI builder.)
                process_btn = gr.Button("Enhance Notebook")
            with gr.Column():
                output = gr.File(label="Enhanced Notebook")
        # Wire the button: enhance the uploaded file, return the result path.
        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)
    return demo
# Entry point when running the script directly.
if __name__ == "__main__":
    demo = build_gradio_interface()
    # NOTE(review): share=True opens a public Gradio tunnel — confirm intended.
    demo.launch(share=True)