import re

import gradio as gr
import nbformat
import nltk
import spacy
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# nltk.sent_tokenize needs the "punkt" tokenizer data ("punkt_tab" on newer nltk)
nltk.download("punkt", quiet=True)

PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"
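# PYTHON_CODE_MODEL turns a Python snippet into a prose summary;
# TITLE_SUMMARIZE_MODEL condenses such a summary into a short title.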


class NotebookEnhancer:
    def __init__(self):
        # model + tokenizer for generating titles from code summaries
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )
        # model + tokenizer for generating summaries from Python code
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        self.python_pipeline = pipeline(
            "summarization",
            model=PYTHON_CODE_MODEL,
            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
            tokenizer=self.python_tokenizer,
        )
        # spaCy English pipeline for sentence segmentation and POS tagging
        # (requires the en_core_web_sm model to be installed)
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise title for a code cell"""
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],
            max_length=1024,
            return_tensors="pt",
            padding=True,
        )  # batch size 1
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0]
        predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        words = code.split(" ")
        return len(words)

    def generate_summary(self, code):
        """Generate a markdown title and a detailed summary for a code cell"""
        result = self.python_pipeline(code, min_length=5, max_length=64)
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
        """Add title and summary markdown cells before each code cell"""
        # Create a new notebook that reuses the original metadata
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata

        # Process each cell
        i = 0
        cell_id = len(notebook.cells) + 1  # cell ids must be strings in nbformat v4
        while i < len(notebook.cells):
            cell = notebook.cells[i]
            # For code cells, add title and summary markdown cells
            if cell.cell_type == "code" and cell.source.strip():
                # Generate a title and summary from the cell source
                title, summary = self.generate_summary(cell.source)
                summary_cell = nbformat.v4.new_markdown_cell(summary)
                summary_cell.id = str(cell_id)
                cell_id += 1
                title_cell = nbformat.v4.new_markdown_cell(title)
                title_cell.id = str(cell_id)
                cell_id += 1
                enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)
            # Add the original cell, clearing outputs of code cells
            if cell.cell_type == "code":
                cell.outputs = []
            enhanced_notebook.cells.append(cell)
            i += 1
        return enhanced_notebook
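
    # For every non-empty code cell the enhanced notebook thus contains three cells
    # in order: a markdown title, a markdown summary, and the original code cell
    # with its outputs cleared; markdown and raw cells are copied through unchanged.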

    def is_valid(self, sentence) -> bool:
        """Heuristic filter: keep a sentence only if it contains a noun and a verb."""
        has_noun = False
        has_verb = False
        for token in sentence:
            if token.pos_ in ["NOUN", "PROPN", "PRON"]:
                has_noun = True
            if token.pos_ == "VERB":
                has_verb = True
        return has_noun and has_verb

    def _postprocess_summary(self, summary: str):
        doc = self.nlp(summary)
        sentences = list(doc.sents)
        # keep only grammatical-looking sentences and strip list enumeration
        # markers such as "1.", "2.", ...
        postprocessed_sentences = []
        for sentence in sentences:
            if self.is_valid(sentence):
                sentence_text = sentence.text
                sentence_text = re.sub(r"[0-9]+\.", "", sentence_text)
                postprocessed_sentences.append(sentence_text)
        if not postprocessed_sentences:
            # fall back to the raw summary when no sentence survives the filter
            return summary, ""
        title = postprocessed_sentences[0]
        summary = postprocessed_sentences[1:]
        return title, " ".join(summary)
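
    # Illustrative only: for a model output like
    # "1. This function loads the dataset. 2. It then plots the results."
    # the split would be roughly title "This function loads the dataset." and
    # summary "It then plots the results." (exact behavior depends on the spaCy
    # sentence segmentation and POS tags).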


def process_notebook(file_path):
    """Process an uploaded notebook file"""
    enhancer = NotebookEnhancer()
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # Save to a temporary file
    output_path = "enhanced_notebook.ipynb"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)
    return output_path
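
# For example, process_notebook("analysis.ipynb") would write the enhanced copy to
# "enhanced_notebook.ipynb" in the working directory and return that path
# ("analysis.ipynb" is just a placeholder file name).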


def build_gradio_interface():
    """Create the Gradio interface"""
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
            Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.

            This tool uses Hugging Face models to:
            1. Generate concise titles for code cells
            2. Create explanatory summaries of what the code does
            """
        )
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                process_btn = gr.Button("Enhance Notebook")
            with gr.Column():
                output = gr.File(label="Enhanced Notebook")
        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)
    return demo


# This will be the entry point when running the script
if __name__ == "__main__":
    # file_input = "my_notebook.json"
    # test = process_notebook(file_input)
    demo = build_gradio_interface()
    demo.launch(share=True)