bupa1018 committed on
Commit
781ec79
·
1 Parent(s): a490952

Create process_repo_zipfile.py

Browse files
Files changed (1) hide show
  1. process_repo_zipfile.py +67 -0
process_repo_zipfile.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def extract_files_and_filepath_from_dir(directory, folder_paths, file_paths):
    """Read selected ``.rst``/``.py`` files from the zip archive in *directory*.

    The first ``.zip`` file in *directory* (by sorted name) is extracted into a
    temporary directory. The requested folders and files are resolved relative
    to the first entry of the extracted tree, and every non-empty ``.rst`` or
    ``.py`` file among them is decoded as UTF-8.

    Args:
        directory: Directory that contains the ``.zip`` archive.
        folder_paths: Folder paths, relative to the extracted root, whose
            files are collected recursively. May be ``None`` or empty.
        file_paths: Individual file paths, relative to the extracted root.
            May be ``None`` or empty.

    Returns:
        A tuple ``(all_texts, file_references)`` of two index-aligned lists:
        the decoded file contents and each file's path relative to the
        extracted root.

    Raises:
        FileNotFoundError: If *directory* contains no ``.zip`` file.
    """
    # Imported locally because this module has no top-level imports.
    import os
    import tempfile
    import zipfile

    all_texts = []
    file_references = []

    # Sort so the choice is deterministic when several archives are present.
    zip_filename = next(
        (name for name in sorted(os.listdir(directory)) if name.endswith('.zip')),
        None,
    )
    if zip_filename is None:
        # Fail with a clear message instead of a TypeError from os.path.join.
        raise FileNotFoundError(f"No .zip file found in directory: {directory}")
    zip_file_path = os.path.join(directory, zip_filename)

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Unzip the file into the temporary directory
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(tmpdirname)

        files = []
        print("tmpdirname: " , tmpdirname)
        unzipped_root = os.listdir(tmpdirname)
        print("unzipped_root ", unzipped_root)

        # NOTE(review): assumes the archive unpacks to a single top-level
        # directory; only the first listed entry is used as the root.
        tmpsubdirpath = os.path.join(tmpdirname, unzipped_root[0])
        print("tempsubdirpath: ", tmpsubdirpath)

        if folder_paths:
            for folder_path in folder_paths:
                files += _get_all_files_in_folder(tmpsubdirpath, folder_path)
        if file_paths:
            files += [_get_file(tmpsubdirpath, file_path) for file_path in file_paths]

        print(f"Total number of files: {len(files)}")

        # Files must be read here, before the temporary directory is removed.
        for file_path in files:
            file_ext = os.path.splitext(file_path)[1]
            if os.path.getsize(file_path) == 0:
                print(f"Skipping an empty file: {file_path}")
                continue

            # Only text sources are recorded; other file types are ignored so
            # that no unbound or stale ``text`` value is ever appended.
            if file_ext not in ('.rst', '.py'):
                continue

            with open(file_path, 'rb') as f:
                text = f.read().decode('utf-8')

            all_texts.append(text)
            print("Filepaths brother:", file_path)
            relative_path = os.path.relpath(file_path, tmpsubdirpath)
            print("Relative Filepaths brother:", relative_path)
            file_references.append(relative_path)

    return all_texts, file_references
49
+
50
+
51
+
52
def _get_all_files_in_folder(temp_dir, folder_path):
    """Return the paths of all files below ``temp_dir/folder_path``, recursively.

    Args:
        temp_dir: Base directory that *folder_path* is joined onto.
        folder_path: Folder to walk, relative to *temp_dir*.

    Returns:
        A list of file paths, each built by joining the walked directory with
        the file name. The list is empty when the walk yields no files.
    """
    # Imported locally because this module has no top-level imports.
    import os

    all_files = []
    target_dir = os.path.join(temp_dir, folder_path)

    for root, _dirs, files in os.walk(target_dir):
        print(f"Files in current directory ({root}): {files}")
        for file in files:
            print(f"Processing file: {file}")
            all_files.append(os.path.join(root, file))

    return all_files
64
+
65
def _get_file(temp_dir, file_path):
    """Return *file_path* joined onto *temp_dir*.

    Args:
        temp_dir: Base directory.
        file_path: Path of the file relative to *temp_dir*.

    Returns:
        The joined path. The file's existence is not checked.
    """
    # Imported locally because this module has no top-level imports.
    import os

    return os.path.join(temp_dir, file_path)