# Spaces: irmchek/mynotebooksummary (status: Sleeping) - File size: 6,025 Bytes

import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    pipeline,
    SummarizationPipeline,
)
import re

# Hugging Face Hub model id; NotebookEnhancer loads its config, tokenizer and
# seq2seq weights from this checkpoint.
MODEL_NAME = "sagard21/python-code-explainer"


class NotebookEnhancer:
    """Annotate notebook code cells with model-generated titles and summaries.

    A seq2seq summarization pipeline built from ``MODEL_NAME`` produces the
    text; a spaCy pipeline is used to split that text into sentences and keep
    only the ones that look like complete sentences.
    """

    # Generation length bounds passed to every summarization call.
    _MIN_LENGTH = 5
    _MAX_LENGTH = 30
    # Coarse spaCy POS tags accepted as the "noun" part of a valid sentence.
    _NOUN_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the model, tokenizer, summarization pipeline and spaCy model."""
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        self.model.eval()
        # Hand the already-loaded model to the pipeline; passing the model
        # name here would load the same weights a second time.
        self.pipeline = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
        )
        self.nlp = spacy.load("en_core_web_sm")

    def _summarize(self, text):
        """Run the summarization pipeline on *text* and return its raw output.

        ``truncation=True`` lets the tokenizer cut over-long inputs down to
        the model's maximum input length instead of slicing characters by
        hand.
        """
        return self.pipeline(
            text,
            min_length=self._MIN_LENGTH,
            max_length=self._MAX_LENGTH,
            truncation=True,
        )

    def generate_title(self, code):
        """Generate a concise markdown title for a piece of text or code.

        Args:
            code: The text to summarize into a title.

        Returns:
            A level-1 markdown heading (``"# ..."``) whose text is the
            capitalized model output.
        """
        title = self._summarize(code)[0]["summary_text"].strip()

        print("Result title", title)
        # Format as a markdown title
        return f"# {title.capitalize()}"

    def _count_num_words(self, code):
        """Return the number of single-space-separated chunks in *code*."""
        return len(code.split(" "))

    def generate_summary(self, code):
        """Generate a post-processed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            The filtered summary text; may be an empty string when no
            generated sentence survives ``_postprocess_summary``.
        """
        print("Code", code)
        result = self._summarize(code)
        print(result)
        summary = self._postprocess_summary(result[0]["summary_text"].strip())
        print("Result summary", summary)
        return f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Return a new notebook with a title and summary before each code cell.

        For every non-empty code cell a summary is generated from the cell
        source and a title is generated from that summary; both are inserted
        as markdown cells (title first) ahead of the original cell. When the
        summary comes back empty, no annotation cells are added for that cell.

        Outputs of the original code cells are cleared in place; other cell
        types are passed through untouched.

        Args:
            notebook: The parsed notebook to annotate.

        Returns:
            A new notebook sharing the input's metadata and containing the
            annotation cells interleaved with the original cells.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        print(len(notebook.cells))

        for cell in notebook.cells:
            is_code = cell.cell_type == "code"

            if is_code and cell.source.strip():
                summary = self.generate_summary(cell.source)
                # An empty summary would yield a blank markdown cell and a
                # title generated from nothing, so skip the annotation.
                if summary:
                    title = self.generate_title(summary)
                    # The new cells keep whatever id nbformat assigns them:
                    # ids must be strings, and markdown cells must not carry
                    # an "outputs" field.
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(summary)
                    )

            # Only code cells have outputs in the notebook format.
            if is_code:
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True when *words* contains both a noun-like token and a verb.

        Args:
            words: An iterable of tokens exposing a ``pos_`` attribute
                (in this class, a spaCy sentence span).
        """
        tags = [word.pos_ for word in words]
        has_noun = any(tag in self._NOUN_TAGS for tag in tags)
        has_verb = "VERB" in tags
        return has_noun and has_verb

    def _postprocess_summary(self, summary: str):
        """Filter generated text down to its usable sentences.

        The first sentence is always discarded, and of the remaining ones
        only those accepted by ``is_valid`` are kept (which also drops
        fragments such as a trailing list enumeration).

        Returns:
            The kept sentences joined by single spaces; empty if none remain.
        """
        doc = self.nlp(summary)
        # ignore the first sentence
        candidates = list(doc.sents)[1:]
        return " ".join(
            sentence.text for sentence in candidates if self.is_valid(sentence)
        )


def process_notebook(file_path, output_path="enhanced_notebook.ipynb"):
    """Enhance the notebook at *file_path* and write the result to disk.

    Args:
        file_path: Path of the notebook file to read (UTF-8, parsed as
            notebook format version 4).
        output_path: Where the enhanced notebook is written. Defaults to
            ``enhanced_notebook.ipynb`` in the current working directory;
            an existing file at that path is overwritten.

    Returns:
        The path the enhanced notebook was written to.
    """
    enhancer = NotebookEnhancer()
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    print(enhanced_notebook)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path


def build_gradio_interface():
    """Build the Gradio Blocks app for the notebook enhancer and return it.

    The app is only constructed here; launching it is left to the caller.
    """
    # Introductory text shown below the page heading.
    intro_markdown = """
        Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
        
        This tool uses Hugging Face models to:
        1. Generate concise titles for code cells
        2. Create explanatory summaries of what the code does
        """

    with gr.Blocks(title="Notebook Enhancer") as app:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(intro_markdown)

        with gr.Row():
            # Left column: upload widget and trigger button.
            with gr.Column():
                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                enhance_button = gr.Button("Enhance Notebook")

            # Right column: the enhanced notebook offered as a file.
            with gr.Column():
                enhanced_download = gr.File(label="Enhanced Notebook")

        # Clicking the button runs process_notebook on the uploaded file.
        enhance_button.click(
            fn=process_notebook,
            inputs=notebook_upload,
            outputs=enhanced_download,
        )

    return app


# Script entry point: enhance a single notebook from the command line.
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default notebook.
    file_input = sys.argv[1] if len(sys.argv) > 1 else "my_notebook.json"
    test = process_notebook(file_input)
    # To serve the web UI instead, build and launch the Gradio app:
    # demo = build_gradio_interface()
    # demo.launch()