Spaces:
Sleeping
Sleeping
import nbformat | |
import spacy | |
import gradio as gr | |
from transformers import pipeline | |
from tokenize import tokenize | |
from transformers import ( | |
AutoModelForSeq2SeqLM, | |
AutoTokenizer, | |
AutoConfig, | |
pipeline, | |
SummarizationPipeline, | |
) | |
import re | |
# Hugging Face Hub checkpoint id; NotebookEnhancer passes it to the
# config, tokenizer, model and summarization-pipeline loaders.
MODEL_NAME = "sagard21/python-code-explainer"
class NotebookEnhancer:
    """Annotate notebook code cells with model-generated titles and summaries.

    The checkpoint named by ``MODEL_NAME`` is run through a Hugging Face
    ``summarization`` pipeline to describe each code cell, and a spaCy
    English pipeline is used to filter the generated sentences.
    """

    # Part-of-speech tags that count as a "noun" for sentence validation.
    _NOUN_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the tokenizer, model, summarization pipeline and spaCy pipeline."""
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Hand the already-loaded model to the pipeline; passing the
        # checkpoint name instead would load the weights a second time.
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            config=self.config,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, code):
        """Generate a concise markdown title for a piece of text.

        Args:
            code: Text to condense. ``enhance_notebook`` passes the generated
                summary of a cell here rather than the raw cell source.

        Returns:
            A level-1 markdown heading (``"# ..."``) built from the pipeline
            output. ``str.capitalize`` is applied, so only the first
            character stays upper-case.
        """
        # truncation=True asks the pipeline to clip over-long input to the
        # model's limit; the previous manual length arithmetic was never used.
        title = self.pipeline(code, min_length=5, max_length=30, truncation=True)[0][
            "summary_text"
        ].strip()
        print("Result title", title)
        # Format as a markdown title
        return f"# {title.capitalize()}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated words in *code*."""
        # split() without arguments ignores runs of whitespace, so an empty
        # string counts as zero words instead of one.
        return len(code.split())

    def generate_summary(self, code):
        """Generate an explanatory summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            The post-processed summary text. It may be an empty string when
            ``_postprocess_summary`` filters out every generated sentence.
        """
        print("Code", code)
        result = self.pipeline(code, min_length=5, max_length=30, truncation=True)
        print(result)
        summary = self._postprocess_summary(result[0]["summary_text"].strip())
        print("Result summary", summary)
        return summary

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Insert a title cell and a summary cell before each code cell.

        Args:
            notebook: The notebook to enhance. Its code cells are modified in
                place (their ``outputs`` are cleared) and reused in the result.

        Returns:
            A new v4 notebook sharing ``notebook.metadata``, in which every
            non-empty code cell is preceded by a markdown title cell and a
            markdown summary cell. All other cells are copied over unchanged.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        print(len(notebook.cells))

        for cell in notebook.cells:
            if cell.cell_type == "code":
                if cell.source.strip():
                    # The title is derived from the summary, not from the code.
                    summary = self.generate_summary(cell.source)
                    title = self.generate_title(summary)
                    # Markdown cells are left exactly as nbformat builds them:
                    # the v4 schema has no ``outputs`` field for them and
                    # requires cell ids to be strings, so neither an
                    # ``outputs`` list nor an integer id is attached here.
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(summary)
                    )
                # Only code cells carry outputs; drop any stored results.
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Report whether a sentence contains both a noun-like token and a verb.

        Args:
            words: Iterable of tokens exposing a ``pos_`` part-of-speech tag
                (for example a spaCy sentence span).

        Returns:
            True when at least one token is tagged NOUN, PROPN or PRON and at
            least one token is tagged VERB; False otherwise.
        """
        tags = {word.pos_ for word in words}
        return bool(tags & self._NOUN_TAGS) and "VERB" in tags

    def _postprocess_summary(self, summary: str):
        """Filter a generated summary down to its well-formed sentences.

        The first sentence is always discarded, and of the remaining ones only
        those accepted by ``is_valid`` are kept.

        Args:
            summary: Raw text produced by the summarization pipeline.

        Returns:
            The surviving sentences joined by single spaces; an empty string
            when none survive.
        """
        # NOTE(review): a single-sentence summary is dropped entirely by the
        # "skip the first sentence" rule - confirm this is intended.
        remaining = list(self.nlp(summary).sents)[1:]
        return " ".join(
            sentence.text for sentence in remaining if self.is_valid(sentence)
        )
def process_notebook(file_path, output_path="enhanced_notebook.ipynb"):
    """Enhance the notebook stored at *file_path* and write the result to disk.

    Args:
        file_path: Path of the notebook file to read (parsed as nbformat v4).
        output_path: Where the enhanced notebook is written. Defaults to
            ``enhanced_notebook.ipynb`` relative to the current working
            directory, which is the location previously hard-coded here.

    Returns:
        ``output_path``, i.e. the path of the written notebook.
    """
    # A fresh enhancer (and therefore a fresh model load) is created per call.
    enhancer = NotebookEnhancer()

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    enhanced_notebook = enhancer.enhance_notebook(nb)
    print(enhanced_notebook)

    # Serialize before opening the target so a serialization failure does not
    # leave a truncated output file behind.
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path
def build_gradio_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The page shows a heading and a short description, an upload widget with
    an "Enhance Notebook" button, and a download widget for the result. The
    button runs ``process_notebook`` on the uploaded file.
    """
    description = """
Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
This tool uses Hugging Face models to:
1. Generate concise titles for code cells
2. Create explanatory summaries of what the code does
"""

    with gr.Blocks(title="Notebook Enhancer") as app:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(description)

        with gr.Row():
            # Left column: upload + trigger.
            with gr.Column():
                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                enhance_button = gr.Button("Enhance Notebook")
            # Right column: the generated notebook.
            with gr.Column():
                enhanced_file = gr.File(label="Enhanced Notebook")

        enhance_button.click(
            fn=process_notebook,
            inputs=notebook_upload,
            outputs=enhanced_file,
        )

    return app
# Script entry point: enhance a local notebook file directly, without the UI.
if __name__ == "__main__":
    notebook_path = "my_notebook.json"
    enhanced_path = process_notebook(notebook_path)
    # Web UI entry point, currently disabled:
    # demo = build_gradio_interface()
    # demo.launch()