# Hugging Face Spaces export artifact (page status text, not code):
# "Spaces: Sleeping / Sleeping"
import re
import tempfile
from tokenize import tokenize  # NOTE(review): appears unused — confirm before removing

import gradio as gr
import nbformat
import nltk
import spacy
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
PYTHON_CODE_MODEL = "sagard21/python-code-explainer" | |
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation" | |
class NotebookEnhancer:
    """Insert generated markdown title + summary cells before each code cell
    of a Jupyter notebook, using two Hugging Face seq2seq models."""

    def __init__(self):
        # Model + tokenizer that turn a prose summary into a short title.
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )
        # Model + tokenizer that summarize raw Python code.
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        self.python_pipeline = pipeline(
            "summarization",
            model=PYTHON_CODE_MODEL,
            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
            tokenizer=self.python_tokenizer,
        )
        # spaCy pipeline used for sentence splitting and POS filtering.
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str) -> str:
        """Generate a concise markdown title ("# ...") from *summary*."""
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],  # batch of one
            max_length=1024,
            return_tensors="pt",
            padding=True,
        )
        # NOTE(review): min_length == max_length pins titles to ~10 tokens, and
        # do_sample=True combined with beam search makes output
        # non-deterministic — confirm both are intended.
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]
        # Keep only the first sentence of whatever the model produced.
        predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Rough whitespace-delimited word count (currently unused helper)."""
        words = code.split(" ")
        return len(words)

    def generate_summary(self, code):
        """Return (markdown title, summary text) for a code cell's source."""
        result = self.python_pipeline(code, min_length=5, max_length=64)
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Return a new notebook with a title cell and a summary cell inserted
        before every non-empty code cell; code-cell outputs are cleared.

        The annotation is quoted so the class can be defined without nbformat
        having been imported yet (lazy evaluation).
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        # Start fresh ids above the original cell count so they cannot collide
        # with ids already present.  (Renamed from `id`, which shadowed the
        # builtin; nbformat requires string cell ids, not ints.)
        next_id = len(notebook.cells) + 1
        for cell in notebook.cells:
            if cell.cell_type == "code" and cell.source.strip():
                title, summary = self.generate_summary(cell.source)
                title_cell = nbformat.v4.new_markdown_cell(title)
                title_cell.id = str(next_id)
                next_id += 1
                summary_cell = nbformat.v4.new_markdown_cell(summary)
                summary_cell.id = str(next_id)
                next_id += 1
                enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)
            # Clear stale outputs on code cells only: markdown cells have no
            # "outputs" field in the nbformat schema (the original set one,
            # which produces an invalid notebook).
            if cell.cell_type == "code":
                cell.outputs = []
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words):
        """Return True if *words* (an iterable of spaCy tokens) contains at
        least one noun-like token (NOUN/PROPN/PRON) and at least one VERB."""
        has_noun = any(w.pos_ in ("NOUN", "PROPN", "PRON") for w in words)
        has_verb = any(w.pos_ == "VERB" for w in words)
        return has_noun and has_verb

    def _postprocess_summary(self, summary: str):
        """Split *summary* into sentences, keep only noun+verb sentences,
        strip list enumerations ("1.", "2.", ...), and return
        (title, remaining summary joined by spaces)."""
        doc = self.nlp(summary)
        postprocessed_sentences = []
        for sentence in doc.sents:
            if self.is_valid(sentence):
                # Raw string for the regex (the original "[0-9]+\." relied on
                # an invalid escape sequence).
                cleaned = re.sub(r"[0-9]+\.", "", sentence.text)
                postprocessed_sentences.append(cleaned)
        if not postprocessed_sentences:
            # Nothing survived filtering — fall back to the raw summary as the
            # title instead of raising IndexError.
            return summary, ""
        return postprocessed_sentences[0], " ".join(postprocessed_sentences[1:])
def process_notebook(file_path):
    """Enhance an uploaded notebook and write the result to disk.

    Args:
        file_path: Path to a .ipynb file readable as nbformat v4.

    Returns:
        Path to the enhanced notebook file (handed back to Gradio's File
        output component).
    """
    enhancer = NotebookEnhancer()
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Write to a unique temp file so concurrent Gradio sessions don't clobber
    # each other (the previous fixed name "enhanced_notebook.ipynb" raced).
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix="enhanced_notebook_",
        suffix=".ipynb",
        delete=False,
    ) as f:
        f.write(enhanced_notebook_str)
        output_path = f.name
    return output_path
def build_gradio_interface():
    """Create the Gradio Blocks UI for the notebook enhancer.

    Returns:
        The (unlaunched) gr.Blocks app; the caller is responsible for
        calling .launch().
    """
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
    Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
    This tool uses Hugging Face models to:
    1. Generate concise titles for code cells
    2. Create explanatory summaries of what the code does
    """
        )
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                # (Removed stray debug `print(file_input)` left in the UI builder.)
                process_btn = gr.Button("Enhance Notebook")
            with gr.Column():
                output = gr.File(label="Enhanced Notebook")
        # Wire the button: enhance the uploaded file, return the result path.
        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)
    return demo
# Entry point when running the script directly.
if __name__ == "__main__":
    demo = build_gradio_interface()
    # NOTE(review): share=True opens a public Gradio tunnel — confirm intended.
    demo.launch(share=True)