irmchek committed on
Commit 462fea8 · 1 Parent(s): 3658694

prototype notebook summarizer

Files changed (5)
  1. my_notebook.json +173 -0
  2. notebook_enhancer.py +99 -47
  3. requirements.txt +178 -29
  4. test.ipynb +156 -0
  5. test.json +0 -0
my_notebook.json ADDED
@@ -0,0 +1,173 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "id": 1,
+ "source": [
+ "# Data Science Analysis Notebook\n",
+ "\n",
+ "This notebook contains some example Python code for data analysis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 2,
+ "source": [
+ "# Import libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# Set visualization style\n",
+ "sns.set(style='whitegrid')\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 3,
+ "source": [
+ "# Load the dataset\n",
+ "df = pd.read_csv('housing_data.csv')\n",
+ "\n",
+ "# Display basic information\n",
+ "print(f\"Dataset shape: {df.shape}\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 4,
+ "source": [
+ "# Perform data cleaning\n",
+ "# Fill missing values with median\n",
+ "for column in df.columns:\n",
+ "    if df[column].dtype in ['float64', 'int64']:\n",
+ "        df[column].fillna(df[column].median(), inplace=True)\n",
+ "    else:\n",
+ "        df[column].fillna(df[column].mode()[0], inplace=True)\n",
+ "\n",
+ "# Check for remaining missing values\n",
+ "print(\"Missing values after cleaning:\")\n",
+ "print(df.isnull().sum())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 5,
+ "source": [
+ "# Exploratory data analysis\n",
+ "# Create correlation matrix\n",
+ "numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
+ "correlation_matrix = df[numeric_columns].corr()\n",
+ "\n",
+ "# Plot heatmap\n",
+ "plt.figure(figsize=(12, 10))\n",
+ "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)\n",
+ "plt.title('Correlation Matrix of Numeric Features', fontsize=18)\n",
+ "plt.xticks(rotation=45, ha='right')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 6,
+ "source": [
+ "# Feature engineering\n",
+ "# Create new features\n",
+ "if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n",
+ "    df['bedrooms_ratio'] = df['bedrooms'] / df['total_rooms']\n",
+ "\n",
+ "if 'total_rooms' in df.columns and 'households' in df.columns:\n",
+ "    df['rooms_per_household'] = df['total_rooms'] / df['households']\n",
+ "\n",
+ "# Scale numeric features\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "scaler = StandardScaler()\n",
+ "df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
+ "\n",
+ "# Display transformed data\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "id": 7,
+ "source": [
+ "# Build a simple prediction model\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.metrics import mean_squared_error, r2_score\n",
+ "\n",
+ "# Assume we're predicting median_house_value\n",
+ "if 'median_house_value' in df.columns:\n",
+ "    # Prepare features and target\n",
+ "    X = df.drop('median_house_value', axis=1)\n",
+ "    y = df['median_house_value']\n",
+ "    \n",
+ "    # Split the data\n",
+ "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "    \n",
+ "    # Train the model\n",
+ "    model = LinearRegression()\n",
+ "    model.fit(X_train, y_train)\n",
+ "    \n",
+ "    # Make predictions\n",
+ "    y_pred = model.predict(X_test)\n",
+ "    \n",
+ "    # Evaluate the model\n",
+ "    mse = mean_squared_error(y_test, y_pred)\n",
+ "    r2 = r2_score(y_test, y_pred)\n",
+ "    \n",
+ "    print(f\"Mean Squared Error: {mse:.2f}\")\n",
+ "    print(f\"R² Score: {r2:.2f}\")\n",
+ "    \n",
+ "    # Plot actual vs predicted values\n",
+ "    plt.figure(figsize=(10, 6))\n",
+ "    plt.scatter(y_test, y_pred, alpha=0.5)\n",
+ "    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n",
+ "    plt.xlabel('Actual Values')\n",
+ "    plt.ylabel('Predicted Values')\n",
+ "    plt.title('Actual vs Predicted Values')\n",
+ "    plt.tight_layout()\n",
+ "    plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
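For reference, a minimal sketch of loading this sample notebook with nbformat, the same call path process_notebook below uses; the cell count and types come straight from the JSON above:

import nbformat

# Read the committed sample notebook; as_version=4 normalizes it to nbformat 4.
with open("my_notebook.json", "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

print(len(nb.cells))          # 7 cells: 1 markdown cell followed by 6 code cells
print(nb.cells[0].cell_type)  # "markdown"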
notebook_enhancer.py CHANGED
@@ -1,90 +1,140 @@
import nbformat
+ import spacy
import gradio as gr
from transformers import pipeline
+ from tokenize import tokenize
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+     AutoConfig,
+     pipeline,
+     SummarizationPipeline,
+ )
+ import re
+
+ MODEL_NAME = "sagard21/python-code-explainer"


class NotebookEnhancer:
    def __init__(self):
-         # Initialize Hugging Face models
-         self.title_generator = pipeline(
-             "summarization", model="facebook/bart-large-cnn"
-         )
-         self.summary_generator = pipeline(
-             "summarization", model="sshleifer/distilbart-cnn-12-6"
+         self.config = AutoConfig.from_pretrained(MODEL_NAME)
+         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+         self.model.eval()
+         self.pipeline = pipeline(
+             "summarization",
+             model=MODEL_NAME,
+             config=self.config,
+             tokenizer=self.tokenizer,
        )
+         self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, code):
        """Generate a concise title for a code cell"""
        # Limit input length to match model constraints
-         max_length = 1024
+         max_length = len(code) // 2
+         print("Title Max length", max_length)
+
        truncated_code = code[:max_length] if len(code) > max_length else code
+         max_length = len(truncated_code) // 2
+         title = self.pipeline(code, min_length=5, max_length=30)[0][
+             "summary_text"
+         ].strip()

-         result = self.title_generator(
-             truncated_code, max_length=10, min_length=3, do_sample=False
-         )
-         title = result[0]["summary_text"].strip()
+         print("Result title", title)
        # Format as a markdown title
-         return f"## {title.capitalize()}"
+         return f"# {title.capitalize()}"
+
+     def _count_num_words(self, code):
+         words = code.split(" ")
+         return len(words)

    def generate_summary(self, code):
        """Generate a detailed summary for a code cell"""
-         # Limit input length to match model constraints
-         max_length = 1024
-         truncated_code = code[:max_length] if len(code) > max_length else code
-
-         result = self.summary_generator(
-             truncated_code, max_length=100, min_length=30, do_sample=True
-         )
-         return result[0]["summary_text"].strip()
-
-     def enhance_notebook(self, notebook_content):
+         # result = self.pipeline([code], min_length=3, max_length=len(code // 2))
+         print("Code", code)
+         result = self.pipeline(code, min_length=5, max_length=30)
+         print(result)
+         summary = result[0]["summary_text"].strip()
+         summary = self._postprocess_summary(summary)
+         print("Result summary", summary)
+         # print(self._is_valid_sentence_nlp(summary))
+         # summary = result[0]["summary_text"].strip()
+         return f"{summary}"
+
+     def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
        """Add title and summary markdown cells before each code cell"""
-         # Load the notebook
-         notebook = nbformat.reads(notebook_content, as_version=4)
-
        # Create a new notebook
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
-
+         print(len(notebook.cells))
        # Process each cell
        i = 0
+         id = len(notebook.cells) + 1
        while i < len(notebook.cells):
            cell = notebook.cells[i]
-
            # For code cells, add title and summary markdown cells
            if cell.cell_type == "code" and cell.source.strip():
-                 # Generate title
-                 title = self.generate_title(cell.source)
-                 title_cell = nbformat.v4.new_markdown_cell(title)
-                 enhanced_notebook.cells.append(title_cell)
-
                # Generate summary
                summary = self.generate_summary(cell.source)
                summary_cell = nbformat.v4.new_markdown_cell(summary)
+                 summary_cell.outputs = []
+                 summary_cell.id = id
+                 id += 1
+
+                 # Generate title based on the summary cell
+                 title = self.generate_title(summary)
+                 title_cell = nbformat.v4.new_markdown_cell(title)
+                 title_cell.outputs = []
+                 title_cell.id = id
+                 id += 1
+
+                 enhanced_notebook.cells.append(title_cell)
                enhanced_notebook.cells.append(summary_cell)

            # Add the original cell
+             cell.outputs = []
            enhanced_notebook.cells.append(cell)
            i += 1
-
-         # Convert back to string
-         return nbformat.writes(enhanced_notebook)
-
-
- def process_notebook(file):
+         return enhanced_notebook
+
+     def is_valid(self, words: list[str]):
+         has_noun = False
+         has_verb = False
+         for word in words:
+             if word.pos_ in ["NOUN", "PROPN", "PRON"]:
+                 has_noun = True
+             if word.pos_ == "VERB":
+                 has_verb = True
+         return has_noun and has_verb
+
+     def _postprocess_summary(self, summary: str):
+         doc = self.nlp(summary)
+         sentences = list(doc.sents)
+         # ignore the first sentence
+         sentences = sentences[1:]
+         # remove the trailing list enumeration
+         postprocessed_sentences = []
+         for sentence in sentences:
+             if self.is_valid(sentence):
+                 postprocessed_sentences.append(sentence.text)
+         return " ".join(postprocessed_sentences)
+
+
+ def process_notebook(file_path):
    """Process an uploaded notebook file"""
    enhancer = NotebookEnhancer()
-
-     # Read uploaded file
-     notebook_content = file.decode("utf-8")
-
+     nb = None
+     with open(file_path, "r", encoding="utf-8") as f:
+         nb = nbformat.read(f, as_version=4)
    # Process the notebook
-     enhanced_notebook = enhancer.enhance_notebook(notebook_content)
-
+     enhanced_notebook = enhancer.enhance_notebook(nb)
+     print(enhanced_notebook)
+     enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Save to temp file
    output_path = "enhanced_notebook.ipynb"
    with open(output_path, "w", encoding="utf-8") as f:
-         f.write(enhanced_notebook)
+         f.write(enhanced_notebook_str)

    return output_path

@@ -118,5 +168,7 @@ def build_gradio_interface():

# This will be the entry point when running the script
if __name__ == "__main__":
-     demo = build_gradio_interface()
-     demo.launch()
+     file_input = "my_notebook.json"
+     test = process_notebook(file_input)
+     # demo = build_gradio_interface()
+     # demo.launch()
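A minimal sketch of the new summarization path in isolation, mirroring the pipeline call in generate_summary above; it assumes the sagard21/python-code-explainer checkpoint is reachable and downloads on first use:

from transformers import pipeline

# Same call pattern as NotebookEnhancer.generate_summary in the diff above.
summarizer = pipeline("summarization", model="sagard21/python-code-explainer")

cell_source = "df = pd.read_csv('housing_data.csv')\nprint(df.shape)\ndf.head()"
result = summarizer(cell_source, min_length=5, max_length=30)
print(result[0]["summary_text"].strip())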
requirements.txt CHANGED
@@ -1,29 +1,178 @@
- # Core dependencies
- nbformat>=5.1.3
- gradio>=3.32.0
- transformers>=4.26.0
- torch>=1.13.1
- accelerate>=0.16.0
-
- # NLP model dependencies
- sentencepiece>=0.1.97
- protobuf>=3.20.0
-
- # Notebook dependencies
- jupyterlab>=3.5.0
- ipykernel>=6.21.0
-
- # Visualization (for example notebooks)
- matplotlib>=3.6.0
- seaborn>=0.12.0
- pandas>=1.5.0
- numpy>=1.23.0
-
- # ML components (for example notebooks)
- scikit-learn>=1.2.0
-
- # Development tools
- pytest>=7.2.0
- black>=23.1.0
- flake8>=6.0.0
- isort>=5.12.0
+ accelerate==1.6.0
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ appnope==0.1.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==3.0.0
+ async-lru==2.0.5
+ attrs==25.3.0
+ babel==2.17.0
+ beautifulsoup4==4.13.4
+ black==25.1.0
+ bleach==6.2.0
+ blis==1.3.0
+ catalogue==2.0.10
+ certifi==2025.1.31
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ click==8.1.8
+ cloudpathlib==0.21.0
+ comm==0.2.2
+ confection==0.1.5
+ contourpy==1.3.2
+ cycler==0.12.1
+ cymem==2.0.11
+ debugpy==1.8.14
+ decorator==5.2.1
+ defusedxml==0.7.1
+ executing==2.2.0
+ fastapi==0.115.12
+ fastjsonschema==2.21.1
+ ffmpy==0.5.0
+ filelock==3.18.0
+ flake8==7.2.0
+ fonttools==4.57.0
+ fqdn==1.5.1
+ fsspec==2025.3.2
+ gradio==5.25.2
+ gradio_client==1.8.0
+ groovy==0.1.2
+ h11==0.14.0
+ httpcore==1.0.8
+ httpx==0.28.1
+ huggingface-hub==0.30.2
+ idna==3.10
+ iniconfig==2.1.0
+ ipykernel==6.29.5
+ ipython==9.1.0
+ ipython_pygments_lexers==1.1.1
+ isoduration==20.11.0
+ isort==6.0.1
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.4.2
+ json5==0.12.0
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ jupyter_server==2.15.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.4.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ kiwisolver==1.4.8
+ langcodes==3.5.0
+ language_data==1.3.0
+ marisa-trie==1.2.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.1
+ matplotlib-inline==0.1.7
+ mccabe==0.7.0
+ mdurl==0.1.2
+ mistune==3.1.3
+ mpmath==1.3.0
+ murmurhash==1.0.12
+ mypy-extensions==1.0.0
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ notebook_shim==0.2.4
+ numpy==2.2.4
+ orjson==3.10.16
+ overrides==7.7.0
+ packaging==24.2
+ pandas==2.2.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pathspec==0.12.1
+ pexpect==4.9.0
+ pillow==11.2.1
+ platformdirs==4.3.7
+ pluggy==1.5.0
+ preshed==3.0.9
+ prometheus_client==0.21.1
+ prompt_toolkit==3.0.51
+ protobuf==6.30.2
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycodestyle==2.13.0
+ pycparser==2.22
+ pydantic==2.11.3
+ pydantic_core==2.33.1
+ pydub==0.25.1
+ pyflakes==3.3.2
+ Pygments==2.19.1
+ pyparsing==3.2.3
+ pytest==8.3.5
+ python-dateutil==2.9.0.post0
+ python-json-logger==3.3.0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==26.4.0
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==14.0.0
+ rpds-py==0.24.0
+ ruff==0.11.5
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ seaborn==0.13.2
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.17.0
+ smart-open==7.1.0
+ sniffio==1.3.1
+ soupsieve==2.6
+ spacy==3.8.5
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.5.1
+ stack-data==0.6.3
+ starlette==0.46.2
+ sympy==1.13.1
+ terminado==0.18.1
+ thinc==8.3.6
+ threadpoolctl==3.6.0
+ tinycss2==1.4.0
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.6.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.51.3
+ typer==0.15.2
+ types-python-dateutil==2.9.0.20241206
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ uri-template==1.3.0
+ urllib3==2.4.0
+ uvicorn==0.34.1
+ wasabi==1.1.3
+ wcwidth==0.2.13
+ weasel==0.4.1
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ wrapt==1.17.2
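Note that spacy is pinned above, but the en_core_web_sm model that notebook_enhancer.py loads is not a pip requirement. A setup sketch; the model-download step is an assumption about the intended workflow, since the commit itself does not add it:

import subprocess, sys

# Install the pinned dependencies, then fetch the spaCy model used by
# NotebookEnhancer (spacy.load("en_core_web_sm") fails without this step).
subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])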
test.ipynb ADDED
@@ -0,0 +1,156 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tokenize import tokenize\n",
+ "from io import BytesIO\n",
+ "\n",
+ "code = \"\"\"import nltk\n",
+ " from nltk.stem import PorterStemmer\n",
+ " porter_stemmer=PorterStemmer()\n",
+ " words=[\"connect\",\"connected\",\"connection\",\"connections\",\"connects\"]\n",
+ " stemmed_words=[porter_stemmer.stem(word) for word in words]\n",
+ " stemmed_words\"\"\"\n",
+ " \n",
+ "for tok in tokenize(BytesIO(code.encode('utf-8')).readline):\n",
+ "    print(f\"Type: {tok.type}\\nString: {tok.string}\\nStart: {tok.start}\\nEnd: {tok.end}\\nLine: {tok.line.strip()}\\n======\\n\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Create a function to summarize the data.', 'For each column in the dataframe, create a correlation matrix.', '3']\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "my_summary = '\\n1. Create a function to summarize the code.\\n2. At first, we will start by importing the pandas and numpy modules.'.strip()\n",
+ "my_summary = 'Create a function summarize and load the dataset.\\n1. To Load the dataset\\n2. To display the basic information\\n3.'.strip()\n",
+ "my_summary = '\\n1. Create a function to summarize the data.\\n2. For each column in the dataframe, create a correlation matrix.\\n3'\n",
+ "my_symmary = \"\\n1. Create a function to summarize the code.\\n2. At first, we will start by importing the pandas and numpy modules.\"\n",
+ "sentences = my_summary.split('\\n')[1:]\n",
+ "#remove the trailing list enumeration\n",
+ "new_sentences = []\n",
+ "for sentence in sentences:\n",
+ "    new_sentences.append(re.sub(\"[0-9]+\\.\\s\", \"\", sentence))\n",
+ "print(new_sentences)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "1. Create a function to summarize the data.\n",
+ "2.\n",
+ "the sentence is valid? True\n",
+ "\n",
+ " False SPACE\n",
+ "1 False X\n",
+ ". False PUNCT\n",
+ "Create True VERB\n",
+ "a True DET\n",
+ "function True NOUN\n",
+ "to True PART\n",
+ "summarize True VERB\n",
+ "the True DET\n",
+ "data True NOUN\n",
+ ". False PUNCT\n",
+ "\n",
+ " False SPACE\n",
+ "2 False X\n",
+ ". False PUNCT\n",
+ "For each column in the dataframe, create a correlation matrix.\n",
+ "\n",
+ "the sentence is valid? True\n",
+ "For True ADP\n",
+ "each True DET\n",
+ "column True NOUN\n",
+ "in True ADP\n",
+ "the True DET\n",
+ "dataframe True NOUN\n",
+ ", False PUNCT\n",
+ "create True VERB\n",
+ "a True DET\n",
+ "correlation True NOUN\n",
+ "matrix True NOUN\n",
+ ". False PUNCT\n",
+ "\n",
+ " False SPACE\n",
+ "3\n",
+ "the sentence is valid? False\n",
+ "3 False NUM\n"
+ ]
+ }
+ ],
+ "source": [
+ "import spacy\n",
+ "nlp = spacy.load(\"en_core_web_sm\")\n",
+ "\n",
+ "\n",
+ "def is_valid(words: list[str]):\n",
+ "    has_noun = False\n",
+ "    has_verb = False\n",
+ "    for word in words: \n",
+ "        if word.pos_ in ['NOUN', 'PROPN', 'PRON']:\n",
+ "            has_noun = True\n",
+ "        if word.pos_ == 'VERB':\n",
+ "            has_verb = True\n",
+ "    return has_noun and has_verb\n",
+ "\n",
+ "doc = nlp(my_summary)\n",
+ "sentences = list(doc.sents)\n",
+ "\n",
+ "for sentence in sentences:\n",
+ "    print(sentence)\n",
+ "    print(\"the sentence is valid?\", is_valid(sentence))\n",
+ "    for word in sentence:\n",
+ "        print(word, word.is_alpha, word.pos_)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
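The two experiments above (regex enumeration stripping and the POS-based validity check) can be folded into one helper. A sketch; clean_summary is a hypothetical name, not anything defined in this commit:

import re
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_summary(summary: str) -> str:
    # Hypothetical helper: drop leading list markers such as "1. ", then keep
    # only sentences containing both a noun-like token (NOUN/PROPN/PRON) and a VERB.
    text = re.sub(r"[0-9]+\.\s", "", summary)
    kept = [
        s.text.strip()
        for s in nlp(text).sents
        if any(t.pos_ in ("NOUN", "PROPN", "PRON") for t in s)
        and any(t.pos_ == "VERB" for t in s)
    ]
    return " ".join(kept)

print(clean_summary("\n1. Create a function to summarize the data.\n2. For each column in the dataframe, create a correlation matrix.\n3"))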
test.json ADDED
File without changes