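"""Notebook Enhancer Gradio app.

Takes an uploaded Jupyter notebook and inserts automatically generated
markdown cells (a short title and an explanatory summary) before each code
cell, using a Hugging Face summarization model with spaCy post-processing.
"""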
import nbformat
import spacy
import gradio as gr
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
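# Summarization model used to generate natural-language explanations of Python code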
MODEL_NAME = "sagard21/python-code-explainer"
class NotebookEnhancer:
    """Adds generated titles and summaries to the code cells of a Jupyter notebook."""

    def __init__(self):
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Reuse the loaded model and tokenizer so the pipeline does not
        # instantiate them a second time
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            config=self.config,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")
    def generate_title(self, code):
        """Generate a concise title for a code cell"""
        # Truncate the input to keep it within the model's length constraints
        max_length = len(code) // 2
        truncated_code = code[:max_length] if len(code) > max_length else code
        title = self.pipeline(truncated_code, min_length=5, max_length=30)[0][
            "summary_text"
        ].strip()
        # Format as a markdown title
        return f"# {title.capitalize()}"
def _count_num_words(self, code):
words = code.split(" ")
return len(words)
    def generate_summary(self, code):
        """Generate a detailed summary for a code cell"""
        result = self.pipeline(code, min_length=5, max_length=30)
        summary = result[0]["summary_text"].strip()
        summary = self._postprocess_summary(summary)
        return summary
    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
        """Add title and summary markdown cells before each code cell"""
        # Create a new notebook that reuses the original metadata
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        # Process each cell
        for cell in notebook.cells:
            # For code cells, add title and summary markdown cells
            if cell.cell_type == "code" and cell.source.strip():
                # Generate a summary, then a title based on that summary
                summary = self.generate_summary(cell.source)
                summary_cell = nbformat.v4.new_markdown_cell(summary)
                title = self.generate_title(summary)
                title_cell = nbformat.v4.new_markdown_cell(title)
                enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)
                # Strip outputs from the original code cell
                cell.outputs = []
            # Add the original cell
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook
    def is_valid(self, sentence):
        """Return True if the sentence contains at least one noun and one verb."""
        has_noun = False
        has_verb = False
        for token in sentence:
            if token.pos_ in ["NOUN", "PROPN", "PRON"]:
                has_noun = True
            if token.pos_ == "VERB":
                has_verb = True
        return has_noun and has_verb
    def _postprocess_summary(self, summary: str):
        doc = self.nlp(summary)
        sentences = list(doc.sents)
        # Ignore the first sentence, then keep only complete sentences,
        # dropping fragments such as trailing list enumerations
        sentences = sentences[1:]
        postprocessed_sentences = []
        for sentence in sentences:
            if self.is_valid(sentence):
                postprocessed_sentences.append(sentence.text)
        return " ".join(postprocessed_sentences)
def process_notebook(file_path):
    """Process an uploaded notebook file"""
    enhancer = NotebookEnhancer()
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # Enhance the notebook and serialize it back to the v4 format
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Save the result to a file and return its path for download
    output_path = "enhanced_notebook.ipynb"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)
    return output_path
def build_gradio_interface():
"""Create and launch the Gradio interface"""
with gr.Blocks(title="Notebook Enhancer") as demo:
gr.Markdown("# Jupyter Notebook Enhancer")
gr.Markdown(
"""
Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
This tool uses Hugging Face models to:
1. Generate concise titles for code cells
2. Create explanatory summaries of what the code does
"""
)
with gr.Row():
with gr.Column():
file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
process_btn = gr.Button("Enhance Notebook")
with gr.Column():
output = gr.File(label="Enhanced Notebook")
process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)
return demo
# Entry point when running the script directly: process a local notebook
# for testing; uncomment the lines below to launch the Gradio interface
if __name__ == "__main__":
    output_path = process_notebook("my_notebook.json")
    # demo = build_gradio_interface()
    # demo.launch()