Spaces: prototype notebook summarizer

Files changed:
- my_notebook.json +173 -0
- notebook_enhancer.py +99 -47
- requirements.txt +178 -29
- test.ipynb +156 -0
- test.json +0 -0
my_notebook.json
ADDED
@@ -0,0 +1,173 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "id": 1,
      "source": [
        "# Data Science Analysis Notebook\n",
        "\n",
        "This notebook contains some example Python code for data analysis."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 2,
      "source": [
        "# Import libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "\n",
        "# Set visualization style\n",
        "sns.set(style='whitegrid')\n",
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 3,
      "source": [
        "# Load the dataset\n",
        "df = pd.read_csv('housing_data.csv')\n",
        "\n",
        "# Display basic information\n",
        "print(f\"Dataset shape: {df.shape}\")\n",
        "df.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 4,
      "source": [
        "# Perform data cleaning\n",
        "# Fill missing values with median\n",
        "for column in df.columns:\n",
        "    if df[column].dtype in ['float64', 'int64']:\n",
        "        df[column].fillna(df[column].median(), inplace=True)\n",
        "    else:\n",
        "        df[column].fillna(df[column].mode()[0], inplace=True)\n",
        "\n",
        "# Check for remaining missing values\n",
        "print(\"Missing values after cleaning:\")\n",
        "print(df.isnull().sum())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 5,
      "source": [
        "# Exploratory data analysis\n",
        "# Create correlation matrix\n",
        "numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
        "correlation_matrix = df[numeric_columns].corr()\n",
        "\n",
        "# Plot heatmap\n",
        "plt.figure(figsize=(12, 10))\n",
        "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)\n",
        "plt.title('Correlation Matrix of Numeric Features', fontsize=18)\n",
        "plt.xticks(rotation=45, ha='right')\n",
        "plt.tight_layout()\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 6,
      "source": [
        "# Feature engineering\n",
        "# Create new features\n",
        "if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n",
        "    df['bedrooms_ratio'] = df['bedrooms'] / df['total_rooms']\n",
        "\n",
        "if 'total_rooms' in df.columns and 'households' in df.columns:\n",
        "    df['rooms_per_household'] = df['total_rooms'] / df['households']\n",
        "\n",
        "# Scale numeric features\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "scaler = StandardScaler()\n",
        "df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
        "\n",
        "# Display transformed data\n",
        "df.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "id": 7,
      "source": [
        "# Build a simple prediction model\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.linear_model import LinearRegression\n",
        "from sklearn.metrics import mean_squared_error, r2_score\n",
        "\n",
        "# Assume we're predicting median_house_value\n",
        "if 'median_house_value' in df.columns:\n",
        "    # Prepare features and target\n",
        "    X = df.drop('median_house_value', axis=1)\n",
        "    y = df['median_house_value']\n",
        "    \n",
        "    # Split the data\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
        "    \n",
        "    # Train the model\n",
        "    model = LinearRegression()\n",
        "    model.fit(X_train, y_train)\n",
        "    \n",
        "    # Make predictions\n",
        "    y_pred = model.predict(X_test)\n",
        "    \n",
        "    # Evaluate the model\n",
        "    mse = mean_squared_error(y_test, y_pred)\n",
        "    r2 = r2_score(y_test, y_pred)\n",
        "    \n",
        "    print(f\"Mean Squared Error: {mse:.2f}\")\n",
        "    print(f\"R² Score: {r2:.2f}\")\n",
        "    \n",
        "    # Plot actual vs predicted values\n",
        "    plt.figure(figsize=(10, 6))\n",
        "    plt.scatter(y_test, y_pred, alpha=0.5)\n",
        "    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n",
        "    plt.xlabel('Actual Values')\n",
        "    plt.ylabel('Predicted Values')\n",
        "    plt.title('Actual vs Predicted Values')\n",
        "    plt.tight_layout()\n",
        "    plt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.10"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
notebook_enhancer.py
CHANGED
@@ -1,90 +1,140 @@
 import nbformat
+import spacy
 import gradio as gr
 from transformers import pipeline
+from tokenize import tokenize
+from transformers import (
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    AutoConfig,
+    pipeline,
+    SummarizationPipeline,
+)
+import re
+
+MODEL_NAME = "sagard21/python-code-explainer"
 
 
 class NotebookEnhancer:
     def __init__(self):
+        self.config = AutoConfig.from_pretrained(MODEL_NAME)
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding=True)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+        self.model.eval()
+        self.pipeline = pipeline(
+            "summarization",
+            model=MODEL_NAME,
+            config=self.config,
+            tokenizer=self.tokenizer,
         )
+        self.nlp = spacy.load("en_core_web_sm")
 
     def generate_title(self, code):
         """Generate a concise title for a code cell"""
         # Limit input length to match model constraints
+        max_length = len(code) // 2
+        print("Title Max length", max_length)
+
         truncated_code = code[:max_length] if len(code) > max_length else code
-            truncated_code, max_length=10, min_length=3, do_sample=False
-        )
-        title = result[0]["summary_text"].strip()
+        max_length = len(truncated_code) // 2
+        title = self.pipeline(code, min_length=5, max_length=30)[0][
+            "summary_text"
+        ].strip()
 
+        print("Result title", title)
         # Format as a markdown title
+        return f"# {title.capitalize()}"
+
+    def _count_num_words(self, code):
+        words = code.split(" ")
+        return len(words)
 
     def generate_summary(self, code):
         """Generate a detailed summary for a code cell"""
+        # result = self.pipeline([code], min_length=3, max_length=len(code // 2))
+        print("Code", code)
+        result = self.pipeline(code, min_length=5, max_length=30)
+        print(result)
+        summary = result[0]["summary_text"].strip()
+        summary = self._postprocess_summary(summary)
+        print("Result summary", summary)
+        # print(self._is_valid_sentence_nlp(summary))
+        # summary = result[0]["summary_text"].strip()
+        return f"{summary}"
+
+    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
         """Add title and summary markdown cells before each code cell"""
-        # Load the notebook
-        notebook = nbformat.reads(notebook_content, as_version=4)
-
         # Create a new notebook
         enhanced_notebook = nbformat.v4.new_notebook()
         enhanced_notebook.metadata = notebook.metadata
+        print(len(notebook.cells))
         # Process each cell
         i = 0
+        id = len(notebook.cells) + 1
         while i < len(notebook.cells):
             cell = notebook.cells[i]
             # For code cells, add title and summary markdown cells
             if cell.cell_type == "code" and cell.source.strip():
-                # Generate title
-                title = self.generate_title(cell.source)
-                title_cell = nbformat.v4.new_markdown_cell(title)
-                enhanced_notebook.cells.append(title_cell)
-
                 # Generate summary
                 summary = self.generate_summary(cell.source)
                 summary_cell = nbformat.v4.new_markdown_cell(summary)
+                summary_cell.outputs = []
+                summary_cell.id = id
+                id += 1
+
+                # Generate title based on the summary cell
+                title = self.generate_title(summary)
+                title_cell = nbformat.v4.new_markdown_cell(title)
+                title_cell.outputs = []
+                title_cell.id = id
+                id += 1
+
+                enhanced_notebook.cells.append(title_cell)
                 enhanced_notebook.cells.append(summary_cell)
 
             # Add the original cell
+            cell.outputs = []
             enhanced_notebook.cells.append(cell)
             i += 1
+        return enhanced_notebook
+
+    def is_valid(self, words: list[str]):
+        has_noun = False
+        has_verb = False
+        for word in words:
+            if word.pos_ in ["NOUN", "PROPN", "PRON"]:
+                has_noun = True
+            if word.pos_ == "VERB":
+                has_verb = True
+        return has_noun and has_verb
+
+    def _postprocess_summary(self, summary: str):
+        doc = self.nlp(summary)
+        sentences = list(doc.sents)
+        # ignore the first sentence
+        sentences = sentences[1:]
+        # remove the trailing list enumeration
+        postprocessed_sentences = []
+        for sentence in sentences:
+            if self.is_valid(sentence):
+                postprocessed_sentences.append(sentence.text)
+        return " ".join(postprocessed_sentences)
+
+
+def process_notebook(file_path):
     """Process an uploaded notebook file"""
     enhancer = NotebookEnhancer()
+    nb = None
+    with open(file_path, "r", encoding="utf-8") as f:
+        nb = nbformat.read(f, as_version=4)
     # Process the notebook
-    enhanced_notebook = enhancer.enhance_notebook(
+    enhanced_notebook = enhancer.enhance_notebook(nb)
+    print(enhanced_notebook)
+    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
     # Save to temp file
     output_path = "enhanced_notebook.ipynb"
     with open(output_path, "w", encoding="utf-8") as f:
-        f.write(
+        f.write(enhanced_notebook_str)
 
     return output_path
 
@@ -118,5 +168,7 @@ def build_gradio_interface():
 
 # This will be the entry point when running the script
 if __name__ == "__main__":
-    demo = build_gradio_interface()
-    demo.launch()
+    file_input = "my_notebook.json"
+    test = process_notebook(file_input)
+    # demo = build_gradio_interface()
+    # demo.launch()
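For reference, a minimal driver sketch for the new entry point: it runs process_notebook on the sample notebook committed above, then re-reads the output so that each non-empty code cell should be preceded by a generated markdown title cell and summary cell. This assumes the pinned dependencies from requirements.txt are installed and the en_core_web_sm spaCy model has been downloaded; the inspection loop at the end is illustrative and not part of the commit.

import nbformat

from notebook_enhancer import process_notebook

# Enhance the sample notebook shipped with this commit.
output_path = process_notebook("my_notebook.json")

# Re-read the enhanced notebook and show the cell order:
# markdown title, markdown summary, then the original code cell.
with open(output_path, "r", encoding="utf-8") as f:
    enhanced = nbformat.read(f, as_version=4)

for cell in enhanced.cells:
    first_line = cell.source.splitlines()[0] if cell.source else ""
    print(f"{cell.cell_type:10} | {first_line}")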
requirements.txt
CHANGED
@@ -1,29 +1,178 @@
accelerate==1.6.0
aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.9.0
appnope==0.1.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.4
black==25.1.0
bleach==6.2.0
blis==1.3.0
catalogue==2.0.10
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
cloudpathlib==0.21.0
comm==0.2.2
confection==0.1.5
contourpy==1.3.2
cycler==0.12.1
cymem==2.0.11
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.0
fastapi==0.115.12
fastjsonschema==2.21.1
ffmpy==0.5.0
filelock==3.18.0
flake8==7.2.0
fonttools==4.57.0
fqdn==1.5.1
fsspec==2025.3.2
gradio==5.25.2
gradio_client==1.8.0
groovy==0.1.2
h11==0.14.0
httpcore==1.0.8
httpx==0.28.1
huggingface-hub==0.30.2
idna==3.10
iniconfig==2.1.0
ipykernel==6.29.5
ipython==9.1.0
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
isort==6.0.1
jedi==0.19.2
Jinja2==3.1.6
joblib==1.4.2
json5==0.12.0
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter-events==0.12.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.15.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.0
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
kiwisolver==1.4.8
langcodes==3.5.0
language_data==1.3.0
marisa-trie==1.2.1
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.10.1
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.1.3
mpmath==1.3.0
murmurhash==1.0.12
mypy-extensions==1.0.0
nbclient==0.10.2
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
notebook_shim==0.2.4
numpy==2.2.4
orjson==3.10.16
overrides==7.7.0
packaging==24.2
pandas==2.2.3
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==11.2.1
platformdirs==4.3.7
pluggy==1.5.0
preshed==3.0.9
prometheus_client==0.21.1
prompt_toolkit==3.0.51
protobuf==6.30.2
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycodestyle==2.13.0
pycparser==2.22
pydantic==2.11.3
pydantic_core==2.33.1
pydub==0.25.1
pyflakes==3.3.2
Pygments==2.19.1
pyparsing==3.2.3
pytest==8.3.5
python-dateutil==2.9.0.post0
python-json-logger==3.3.0
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.0.0
rpds-py==0.24.0
ruff==0.11.5
safehttpx==0.1.6
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.2
seaborn==0.13.2
semantic-version==2.10.0
Send2Trash==1.8.3
sentencepiece==0.2.0
shellingham==1.5.4
six==1.17.0
smart-open==7.1.0
sniffio==1.3.1
soupsieve==2.6
spacy==3.8.5
spacy-legacy==3.0.12
spacy-loggers==1.0.5
srsly==2.5.1
stack-data==0.6.3
starlette==0.46.2
sympy==1.13.1
terminado==0.18.1
thinc==8.3.6
threadpoolctl==3.6.0
tinycss2==1.4.0
tokenizers==0.21.1
tomlkit==0.13.2
torch==2.6.0
tornado==6.4.2
tqdm==4.67.1
traitlets==5.14.3
transformers==4.51.3
typer==0.15.2
types-python-dateutil==2.9.0.20241206
typing-inspection==0.4.0
typing_extensions==4.13.2
tzdata==2025.2
uri-template==1.3.0
urllib3==2.4.0
uvicorn==0.34.1
wasabi==1.1.3
wcwidth==0.2.13
weasel==0.4.1
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
websockets==15.0.1
wrapt==1.17.2
test.ipynb
ADDED
@@ -0,0 +1,156 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from tokenize import tokenize\n",
        "from io import BytesIO\n",
        "\n",
        "code = \"\"\"import nltk\n",
        "from nltk.stem import PorterStemmer\n",
        "porter_stemmer=PorterStemmer()\n",
        "words=[\"connect\",\"connected\",\"connection\",\"connections\",\"connects\"]\n",
        "stemmed_words=[porter_stemmer.stem(word) for word in words]\n",
        "stemmed_words\"\"\"\n",
        "\n",
        "for tok in tokenize(BytesIO(code.encode('utf-8')).readline):\n",
        "    print(f\"Type: {tok.type}\\nString: {tok.string}\\nStart: {tok.start}\\nEnd: {tok.end}\\nLine: {tok.line.strip()}\\n======\\n\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 40,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "['Create a function to summarize the data.', 'For each column in the dataframe, create a correlation matrix.', '3']\n"
          ]
        }
      ],
      "source": [
        "import re\n",
        "my_summary = '\\n1. Create a function to summarize the code.\\n2. At first, we will start by importing the pandas and numpy modules.'.strip()\n",
        "my_summary = 'Create a function summarize and load the dataset.\\n1. To Load the dataset\\n2. To display the basic information\\n3.'.strip()\n",
        "my_summary = '\\n1. Create a function to summarize the data.\\n2. For each column in the dataframe, create a correlation matrix.\\n3'\n",
        "my_symmary = \"\\n1. Create a function to summarize the code.\\n2. At first, we will start by importing the pandas and numpy modules.\"\n",
        "sentences = my_summary.split('\\n')[1:]\n",
        "# remove the trailing list enumeration\n",
        "new_sentences = []\n",
        "for sentence in sentences:\n",
        "    new_sentences.append(re.sub(\"[0-9]+\\.\\s\", \"\", sentence))\n",
        "print(new_sentences)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 42,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "1. Create a function to summarize the data.\n",
            "2.\n",
            "the sentence is valid? True\n",
            "\n",
            " False SPACE\n",
            "1 False X\n",
            ". False PUNCT\n",
            "Create True VERB\n",
            "a True DET\n",
            "function True NOUN\n",
            "to True PART\n",
            "summarize True VERB\n",
            "the True DET\n",
            "data True NOUN\n",
            ". False PUNCT\n",
            "\n",
            " False SPACE\n",
            "2 False X\n",
            ". False PUNCT\n",
            "For each column in the dataframe, create a correlation matrix.\n",
            "\n",
            "the sentence is valid? True\n",
            "For True ADP\n",
            "each True DET\n",
            "column True NOUN\n",
            "in True ADP\n",
            "the True DET\n",
            "dataframe True NOUN\n",
            ", False PUNCT\n",
            "create True VERB\n",
            "a True DET\n",
            "correlation True NOUN\n",
            "matrix True NOUN\n",
            ". False PUNCT\n",
            "\n",
            " False SPACE\n",
            "3\n",
            "the sentence is valid? False\n",
            "3 False NUM\n"
          ]
        }
      ],
      "source": [
        "import spacy\n",
        "nlp = spacy.load(\"en_core_web_sm\")\n",
        "\n",
        "\n",
        "def is_valid(words: list[str]):\n",
        "    has_noun = False\n",
        "    has_verb = False\n",
        "    for word in words:\n",
        "        if word.pos_ in ['NOUN', 'PROPN', 'PRON']:\n",
        "            has_noun = True\n",
        "        if word.pos_ == 'VERB':\n",
        "            has_verb = True\n",
        "    return has_noun and has_verb\n",
        "\n",
        "doc = nlp(my_summary)\n",
        "sentences = list(doc.sents)\n",
        "\n",
        "for sentence in sentences:\n",
        "    print(sentence)\n",
        "    print(\"the sentence is valid?\", is_valid(sentence))\n",
        "    for word in sentence:\n",
        "        print(word, word.is_alpha, word.pos_)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
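The scratch cells above distill into a single post-processing filter: strip the list enumeration with a regex, then keep only sentences that spaCy tags with both a noun-like token and a verb. A standalone sketch of that combination, assuming en_core_web_sm is installed; the function name clean_summary is hypothetical, not from the commit:

import re

import spacy

nlp = spacy.load("en_core_web_sm")


def clean_summary(summary: str) -> str:
    """Strip leading list enumeration ("1. ", "2. ") and keep only
    sentences containing both a noun-like token and a verb."""
    text = re.sub(r"[0-9]+\.\s", "", summary)
    kept = []
    for sent in nlp(text).sents:
        has_noun = any(tok.pos_ in ("NOUN", "PROPN", "PRON") for tok in sent)
        has_verb = any(tok.pos_ == "VERB" for tok in sent)
        if has_noun and has_verb:
            kept.append(sent.text.strip())
    return " ".join(kept)


print(clean_summary("\n1. Create a function to summarize the data.\n"
                    "2. For each column in the dataframe, create a correlation matrix.\n3"))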
test.json
ADDED
File without changes