bupa1018 commited on
Commit
c234df5
·
1 Parent(s): 50dd7e5

Delete chunking.py

Browse files
Files changed (1) hide show
  1. chunking.py +0 -303
chunking.py DELETED
@@ -1,303 +0,0 @@
1
- import ast
2
- from langchain.schema import Document
3
-
4
def chunk_pythoncode_and_add_metadata(code_files_content, code_files_path):
    """
    Chunk a batch of Python source files into metadata-tagged Documents.

    Custom AST-based splitter (see generate_code_chunks_with_metadata):
    it keeps the full body of each method/function together with its
    signature (decorators included) and attaches chunk metadata such as
    visibility ("public" / "internal"), type ("class", "method",
    "command" for CLI commands) and source path, with the intent of
    filtering when retrieving potentially useful snippets.

    Args:
        code_files_content: iterable of file contents (str), aligned
            pairwise with ``code_files_path``.
        code_files_path: iterable of file paths (str).

    Returns:
        list[Document]: all chunks from all files, in input order.
    """
    # Fixes vs. previous revision: removed a leftover debug print that
    # dumped every file's full content, and moved the loop-level stray
    # string (a no-op expression each iteration) into a real docstring.
    chunks = []
    for code_file_content, code_file_path in zip(code_files_content, code_files_path):
        document_chunks = generate_code_chunks_with_metadata(code_file_content, code_file_path)
        chunks.extend(document_chunks)
    return chunks
24
-
25
-
26
# Split text into chunks
def chunk_text_and_add_metadata(texts, references, chunk_size, chunk_overlap):
    """
    Split each text into overlapping character chunks wrapped as Documents.

    Each chunk carries its originating reference as "source" metadata and
    a fixed "directory" of "doc/".

    Args:
        texts: iterable of document texts, aligned pairwise with ``references``.
        references: iterable of source identifiers for the texts.
        chunk_size: maximum characters per chunk.
        chunk_overlap: characters shared between consecutive chunks.

    Returns:
        list[Document]: all chunks from all texts, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    for text, reference in zip(texts, references):
        for piece in splitter.split_text(text):
            documents.append(
                Document(
                    page_content=piece,
                    metadata={"source": reference, "directory": "doc/"},
                )
            )
    return documents
43
-
44
-
45
def generate_code_chunks_with_metadata(code_file_content, code_file_path):
    """
    Custom Python code splitter — entry point for one file.

    Chunks the file via its AST so that a chunk holds one full
    method/function (decorators included), or a class body cut off at its
    first method declaration, then stamps every chunk with "source",
    "directory" and "usage" metadata derived from the file path.

    Args:
        code_file_content: full source text of the Python file.
        code_file_path: path of the file, used for the metadata decisions.

    Returns:
        list[Document]: the metadata-tagged chunks for this file.
    """
    documents = []
    _iterate_ast(code_file_content, documents, code_file_path)

    # Classify the file's location to drive retrieval-time filtering.
    if not code_file_path.startswith("kadi_apy"):
        directory = "undefined"
        usage = "undefined"
    else:
        directory = "kadi_apy/"
        if code_file_path.startswith("kadi_apy/lib/"):
            usage = "kadi_apy/lib/"
        elif code_file_path.startswith("kadi_apy/cli/"):
            usage = "kadi_apy/cli/"
        else:
            usage = "kadi_apy/top_level_file.py"

    # Stamp the shared, file-level metadata onto every chunk.
    for document in documents:
        document.metadata.update(
            {"source": code_file_path, "directory": directory, "usage": usage}
        )
    return documents
80
-
81
-
82
def _iterate_ast(code_file_content, documents, code_file_path):
    """
    Parse the file into an AST and dispatch each first-level node to the
    matching chunking helper, appending the resulting Documents in place.

    Args:
        code_file_content: full source text of the file.
        documents: output list of Documents, mutated in place.
        code_file_path: path of the file (drives the __init__.py special
            cases in the fallback helpers).
    """
    tree = ast.parse(code_file_content, filename=code_file_path)
    first_level_nodes = list(ast.iter_child_nodes(tree))

    # Empty module (e.g. a bare __init__.py): keep the raw content as one chunk.
    if not first_level_nodes:
        documents.extend(
            _chunk_nodeless_code_file_content(code_file_content, code_file_path))
        return

    # Imports-only module: one chunk with the full content. Nothing below
    # could match, so return instead of walking the tree a second time.
    if all(isinstance(node, (ast.Import, ast.ImportFrom)) for node in first_level_nodes):
        documents.extend(
            _chunk_import_only_code_file_content(code_file_content, code_file_path))
        return

    # Reuse the node list built above instead of re-iterating the tree.
    for node in first_level_nodes:
        if isinstance(node, ast.ClassDef):
            documents.extend(
                _handle_first_level_class(node, code_file_content))
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # AsyncFunctionDef exposes the same lineno/end_lineno/
            # decorator_list/name attributes, so async defs chunk exactly
            # like plain functions (previously they were silently dropped).
            documents.extend(
                _chunk_first_level_func_node(node, code_file_content))
        elif isinstance(node, ast.Assign):
            documents.extend(
                _chunk_first_level_assign_node(node, code_file_content))
        # Other node types (expressions, constants, ...) are intentionally
        # skipped, matching the previous behavior.
117
-
118
-
119
-
120
def _handle_first_level_class(ast_node, code_file_content):
    """
    Chunk a top-level class into Documents.

    Produces one Document for the class header and body up to (excluding)
    its first method definition, plus one Document per method, with any
    decorators kept attached to the method source.

    Args:
        ast_node: the ast.ClassDef node.
        code_file_content: full source text the node was parsed from.

    Returns:
        list[Document]: class chunk first, then one chunk per method.
    """
    source_lines = code_file_content.splitlines()
    documents = []

    # The class chunk ends just before the first method, or at the class
    # end when no methods are defined.
    method_linenos = [
        child.lineno for child in ast_node.body if isinstance(child, ast.FunctionDef)
    ]
    cutoff = min(method_linenos, default=ast_node.end_lineno) - 1
    documents.append(
        Document(
            page_content='\n'.join(source_lines[ast_node.lineno - 1:cutoff]),
            metadata={
                "type": "class",
                "class": ast_node.name,
                "visibility": "public",
            },
        )
    )

    # One chunk per method, starting at its first decorator if any.
    for child in ast.iter_child_nodes(ast_node):
        if not isinstance(child, ast.FunctionDef):
            continue
        start = child.decorator_list[0].lineno if child.decorator_list else child.lineno
        method_source = '\n'.join(source_lines[start - 1:child.end_lineno])
        documents.append(
            Document(
                page_content=method_source,
                metadata={
                    "type": "method",
                    "method": child.name,
                    "visibility": "internal" if child.name.startswith("_") else "public",
                    "class": ast_node.name,
                },
            )
        )

    return documents
167
-
168
-
169
def _handle_not_defined_case(code_file_content):
    """Fallback: split unclassified content by character count."""
    return list(_chunk_code_file_content_by_character(code_file_content))
174
-
175
-
176
def _chunk_first_level_func_node(ast_node, code_file_content):
    """
    Chunk a top-level function (decorators included) into one Document.

    Functions decorated with a bare ``@apy_command`` are tagged as CLI
    "command" chunks (name stored under the "command" metadata key);
    everything else is tagged "function" (name under "method").

    Args:
        ast_node: the ast.FunctionDef node.
        code_file_content: full source text the node was parsed from.

    Returns:
        list[Document]: a single-element list with the function chunk.
    """
    start = ast_node.decorator_list[0].lineno if ast_node.decorator_list else ast_node.lineno
    function_source = '\n'.join(
        code_file_content.splitlines()[start - 1:ast_node.end_lineno]
    )

    # Only bare-name decorators have an .id attribute; attribute or call
    # style decorators fall through the getattr default and don't match.
    is_command = any(
        getattr(decorator, "id", None) == "apy_command"
        for decorator in ast_node.decorator_list
    )

    metadata = {
        "type": "command" if is_command else "function",
        "visibility": "internal" if ast_node.name.startswith("_") else "public",
    }
    metadata["command" if is_command else "method"] = ast_node.name

    return [Document(page_content=function_source, metadata=metadata)]
212
-
213
-
214
-
215
def _chunk_first_level_assign_node(ast_node, code_file_content):
    """
    Chunk a module-level assignment statement into one Document.

    Args:
        ast_node: the ast.Assign node.
        code_file_content: full source text the node was parsed from.

    Returns:
        list[Document]: a single-element list tagged {"type": "Assign"}.
    """
    lines = code_file_content.splitlines()
    snippet = '\n'.join(lines[ast_node.lineno - 1:ast_node.end_lineno])
    return [Document(page_content=snippet, metadata={"type": "Assign"})]
236
-
237
-
238
-
239
def _chunk_import_only_code_file_content(code_file_content, code_file_path):
    """
    Wrap an imports-only module as a single Document.

    ``__init__.py`` files are tagged "__init__-file" so they can be
    filtered separately; all other import-only modules get "undefined".

    Args:
        code_file_content: full source text of the file.
        code_file_path: path of the file (checked for __init__.py).

    Returns:
        list[Document]: a single Document holding the full file content.
    """
    # Renamed local from `type` to avoid shadowing the builtin.
    chunk_type = (
        "__init__-file" if code_file_path.endswith("__init__.py") else "undefined"
    )

    doc = Document(
        page_content=code_file_content,
        metadata={"type": chunk_type},
    )
    return [doc]
259
-
260
def _chunk_nodeless_code_file_content(code_file_content, code_file_path):
    """
    Wrap a module with no top-level AST nodes as a single Document.

    ``__init__.py`` files are tagged "__init__-file" so they can be
    filtered separately; all other node-less modules get "undefined".

    Args:
        code_file_content: full source text of the file.
        code_file_path: path of the file (checked for __init__.py).

    Returns:
        list[Document]: a single Document holding the full file content.
    """
    # Renamed local from `type` to avoid shadowing the builtin.
    chunk_type = (
        "__init__-file" if code_file_path.endswith("__init__.py") else "undefined"
    )

    doc = Document(
        page_content=code_file_content,
        metadata={"type": chunk_type},
    )
    return [doc]
281
-
282
-
283
-
284
- from langchain.text_splitter import RecursiveCharacterTextSplitter
285
-
286
-
287
def _chunk_code_file_content_by_character(code_file_content):
    """
    Character-based fallback splitter (512 chars per chunk, 128 overlap).

    NOTE(review): ``separators=[]`` is passed through unchanged — it looks
    like it may disable the splitter's default separator hierarchy; confirm
    against the langchain version in use before altering it.

    Args:
        code_file_content: raw text to split.

    Returns:
        list[Document]: one metadata-less Document per chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128,
        separators=[],
    )
    return [
        Document(page_content=piece)
        for piece in splitter.split_text(code_file_content)
    ]