File size: 6,025 Bytes
3658694
462fea8
3658694
 
462fea8
 
 
 
 
 
 
 
 
 
 
3658694
 
 
 
462fea8
 
 
 
 
 
 
 
 
3658694
462fea8
3658694
 
 
 
462fea8
 
 
3658694
462fea8
 
 
 
3658694
462fea8
3658694
462fea8
 
 
 
 
3658694
 
 
462fea8
 
 
 
 
 
 
 
 
 
 
 
3658694
 
 
 
462fea8
3658694
 
462fea8
3658694
 
 
 
 
 
 
462fea8
 
 
 
 
 
 
 
 
 
 
 
3658694
 
 
462fea8
3658694
 
462fea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3658694
 
462fea8
 
 
3658694
462fea8
 
 
3658694
 
 
462fea8
3658694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462fea8
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    pipeline,
    SummarizationPipeline,
)
import re

MODEL_NAME = "sagard21/python-code-explainer"


class NotebookEnhancer:
    """Annotate the code cells of a Jupyter notebook with generated markdown.

    A Hugging Face summarization pipeline built from ``MODEL_NAME`` produces
    the text, and the spaCy ``en_core_web_sm`` pipeline is used to filter the
    generated sentences by part of speech.
    """

    # Generation length bounds passed to the summarization pipeline.
    _MIN_LENGTH = 5
    _MAX_LENGTH = 30
    # Part-of-speech tags that count as a "noun" in ``is_valid``.
    _NOUN_TAGS = ("NOUN", "PROPN", "PRON")

    def __init__(self):
        """Load the model, tokenizer, summarization pipeline and spaCy model."""
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Hand the already-loaded model to the pipeline instead of its name,
        # so the weights are loaded once and the eval()-ed instance is the one
        # that is actually used for generation.
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")

    def _run_pipeline(self, text):
        """Run the summarization pipeline on *text* and return its raw result.

        ``truncation=True`` asks the pipeline to cut over-long inputs down to
        the model's maximum input length instead of passing them through
        unbounded.
        """
        return self.pipeline(
            text,
            min_length=self._MIN_LENGTH,
            max_length=self._MAX_LENGTH,
            truncation=True,
        )

    def generate_title(self, code):
        """Generate a concise markdown title for *code*.

        Args:
            code: Text to condense into a title. ``enhance_notebook`` passes
                the generated summary of a code cell here.

        Returns:
            A level-one markdown heading (``"# ..."``) whose first character
            is upper-cased.
        """
        title = self._run_pipeline(code)[0]["summary_text"].strip()

        print("Result title", title)
        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest and mangle identifiers such as "DataFrame".
        return f"# {title[:1].upper()}{title[1:]}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated tokens in *code*."""
        # split() without an argument collapses runs of whitespace (including
        # newlines and tabs) and yields no tokens for an empty string.
        return len(code.split())

    def generate_summary(self, code):
        """Generate a post-processed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            The generated summary after ``_postprocess_summary`` filtering.
            May be an empty string when no sentence survives the filter.
        """
        print("Code", code)
        result = self._run_pipeline(code)
        print(result)
        summary = self._postprocess_summary(result[0]["summary_text"].strip())
        print("Result summary", summary)
        return summary

    def enhance_notebook(self, notebook: "nbformat.NotebookNode"):
        """Build a notebook with a title and summary before each code cell.

        For every non-empty code cell a title markdown cell and a summary
        markdown cell are inserted in front of it, unless the generated
        summary is empty. Outputs of code cells are cleared.

        Note: the original cell objects are reused in the returned notebook,
        so clearing outputs also affects the cells of *notebook*; the metadata
        object is shared as well.

        Args:
            notebook: The notebook to enhance.

        Returns:
            A new v4 notebook containing the generated and original cells.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        print(len(notebook.cells))

        # Cell ids must be strings in the notebook format; integers make the
        # written notebook fail schema validation. Track ids already in use so
        # generated ones never collide with existing cells.
        taken_ids = {cell.get("id") for cell in notebook.cells}
        next_number = len(notebook.cells) + 1

        def new_cell_id():
            nonlocal next_number
            while True:
                candidate = f"enhancer-{next_number}"
                next_number += 1
                if candidate not in taken_ids:
                    taken_ids.add(candidate)
                    return candidate

        for cell in notebook.cells:
            if cell.cell_type == "code":
                if cell.source.strip():
                    summary = self.generate_summary(cell.source)
                    # An empty summary would yield an empty markdown cell and
                    # a title generated from nothing, so skip the annotation.
                    if summary:
                        summary_cell = nbformat.v4.new_markdown_cell(summary)
                        summary_cell.id = new_cell_id()

                        # The title is derived from the summary, not the code.
                        title_cell = nbformat.v4.new_markdown_cell(
                            self.generate_title(summary)
                        )
                        title_cell.id = new_cell_id()

                        enhanced_notebook.cells.append(title_cell)
                        enhanced_notebook.cells.append(summary_cell)

                # Only code cells have an ``outputs`` field; adding one to a
                # markdown or raw cell makes the notebook invalid.
                cell.outputs = []

            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True when *words* contains both a noun-like token and a verb.

        Args:
            words: Iterable of objects exposing a ``pos_`` attribute (for
                example the tokens of a spaCy sentence span).

        Returns:
            True if at least one token is tagged NOUN, PROPN or PRON and at
            least one token is tagged VERB; otherwise False.
        """
        has_noun = False
        has_verb = False
        for word in words:
            if word.pos_ in self._NOUN_TAGS:
                has_noun = True
            if word.pos_ == "VERB":
                has_verb = True
            if has_noun and has_verb:
                return True
        return False

    def _postprocess_summary(self, summary: str):
        """Filter a generated summary down to its well-formed sentences.

        The first sentence is always discarded; of the remaining sentences
        only those accepted by ``is_valid`` are kept.

        Returns:
            The kept sentences joined by single spaces (possibly empty).
        """
        doc = self.nlp(summary)
        # ignore the first sentence
        candidates = list(doc.sents)[1:]
        # keep only sentences that contain both a noun-like token and a verb
        return " ".join(
            sentence.text for sentence in candidates if self.is_valid(sentence)
        )


def process_notebook(file_path, *, output_path="enhanced_notebook.ipynb", enhancer=None):
    """Enhance the notebook stored at *file_path* and write the result to disk.

    Args:
        file_path: Path of the notebook file to read (UTF-8 JSON).
        output_path: Where the enhanced notebook is written. Defaults to
            ``"enhanced_notebook.ipynb"`` in the current working directory.
        enhancer: Optional, already constructed ``NotebookEnhancer`` to reuse.
            When omitted a new one is created, which loads the models again.

    Returns:
        The path the enhanced notebook was written to (*output_path*).
    """
    if enhancer is None:
        enhancer = NotebookEnhancer()

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path


def build_gradio_interface():
    """Assemble the Gradio Blocks UI for the notebook enhancer.

    The layout is a heading and description followed by one row with two
    columns: an upload widget plus trigger button on the left and the
    resulting file on the right. Clicking the button runs
    ``process_notebook`` on the uploaded file.

    Returns:
        The constructed ``gr.Blocks`` object; launching it is left to the
        caller.
    """
    description = """
        Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
        
        This tool uses Hugging Face models to:
        1. Generate concise titles for code cells
        2. Create explanatory summaries of what the code does
        """

    with gr.Blocks(title="Notebook Enhancer") as app:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(description)

        with gr.Row():
            # Left column: input notebook and the button that starts the run.
            with gr.Column():
                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                enhance_button = gr.Button("Enhance Notebook")

            # Right column: download slot for the enhanced notebook.
            with gr.Column():
                enhanced_file = gr.File(label="Enhanced Notebook")

        enhance_button.click(
            fn=process_notebook,
            inputs=notebook_upload,
            outputs=enhanced_file,
        )

    return app


# Script entry point: enhance the notebook named as the first command-line
# argument (default "my_notebook.json") and report where the result was
# written. To serve the web UI instead, call build_gradio_interface().launch().
if __name__ == "__main__":
    import sys

    file_input = sys.argv[1] if len(sys.argv) > 1 else "my_notebook.json"
    written_path = process_notebook(file_input)
    print(f"Enhanced notebook written to {written_path}")