Spaces:
Runtime error
Runtime error
Upload extract_imgs.ipynb
Browse files- extract_imgs.ipynb +231 -0
extract_imgs.ipynb
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"id": "1d6f0077-6e41-4b23-8f85-14dba8160036",
|
7 |
+
"metadata": {
|
8 |
+
"tags": []
|
9 |
+
},
|
10 |
+
"outputs": [],
|
11 |
+
"source": [
|
12 |
+
"import fitz\n",
|
13 |
+
"# from PIL import Image \n",
|
14 |
+
"import os\n",
|
15 |
+
"import json"
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"cell_type": "code",
|
20 |
+
"execution_count": 5,
|
21 |
+
"id": "efea9ac6-7e23-49ca-91e2-f589c94a0f6d",
|
22 |
+
"metadata": {
|
23 |
+
"tags": []
|
24 |
+
},
|
25 |
+
"outputs": [
|
26 |
+
{
|
27 |
+
"name": "stdout",
|
28 |
+
"output_type": "stream",
|
29 |
+
"text": [
|
30 |
+
"C:\\Users\\user\\Desktop\\app_open\\app\n",
|
31 |
+
"C:\\Users\\user\\Desktop\\app_open\\app\\LLaVA.pdf\n",
|
32 |
+
"C:\\Users\\user\\Desktop\\app_open\\app\\Interior.pdf\n"
|
33 |
+
]
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"source": [
|
37 |
+
"pwd = os.getcwd()\n",
|
38 |
+
"source = os.path.join(pwd, 'app')\n",
|
39 |
+
"print(source)\n",
|
40 |
+
"file_1= os.path.join(source,'LLaVA.pdf')\n",
|
41 |
+
"file_2= os.path.join(source,'Interior.pdf')\n",
|
42 |
+
"print(file_1)\n",
|
43 |
+
"print(file_2)"
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "code",
|
48 |
+
"execution_count": 32,
|
49 |
+
"id": "3fd7293a-b835-4803-8e2f-1b546d75adc3",
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"# source_files: base directory that contains the per-PDF output folders\n",
|
54 |
+
"# file_: path of the PDF to extract images from\n",
|
55 |
+
"def load_pdf(source_files, file_, pdf_name):\n",
|
56 |
+
"\t# open the file \n",
|
57 |
+
"\tpdf_file = fitz.open(file_) \n",
|
58 |
+
"\timage_counter = 0\n",
|
59 |
+
"\tmetadata = {}\n",
|
60 |
+
"\tfor page_index in range(0,len(pdf_file)): \n",
|
61 |
+
"\t\t# get the page itself \n",
|
62 |
+
"\t\tpage = pdf_file[page_index] \n",
|
63 |
+
"\t\t# get block details from the page\n",
|
64 |
+
"\t\tblocks = pdf_file[page_index].get_text(\"blocks\")\n",
|
65 |
+
"\t\t# get image info and title details \n",
|
66 |
+
"\t\timage_meta = [ (blocks[i][4], blocks[i+1][4]) for i in range(0,len(blocks)) if blocks[i][-1]==1 ] \t\n",
|
67 |
+
"\t\timage_info= [ image_meta[0][0] if image_meta else []] \n",
|
68 |
+
"\t\timage_title = [ image_meta[0][1] if image_meta else []] \n",
|
69 |
+
"\t\t# prepare image meta data from the page\n",
|
70 |
+
"\t\tfor image in page.get_images():\n",
|
71 |
+
"\t\t\timage_id = image[7] # img<no>\n",
|
72 |
+
"\t\t\timage_block_id = image[0] # block number \n",
|
73 |
+
"\t\t\timage_title_block_id = image_block_id+1 # image title block number\n",
|
74 |
+
"\t\t\timage_dim = image[2],image[3] # image dimension details\n",
|
75 |
+
"\t\t\t\n",
|
76 |
+
"\t\t\tprint(f\"[+] Page:{page_index}, Image : {image_id}, Block:{image_block_id}, Image Dim:{image_dim}\")\n",
|
77 |
+
"\t\t\timage_counter = image_counter+1\n",
|
78 |
+
"\n",
|
79 |
+
"\t\t\t# Update metadata dictionary with image information\n",
|
80 |
+
"\t\t\tmetadata[image_counter] = {\n",
|
81 |
+
"\t\t\t\t'page': page_index,\n",
|
82 |
+
"\t\t\t\t'image': image_id,\n",
|
83 |
+
"\t\t\t\t'block': image_block_id,\n",
|
84 |
+
"\t\t\t\t'image_dim': image_dim,\n",
|
85 |
+
"\t\t\t\t'image_info': str(image_info[0]),\n",
|
86 |
+
"\t\t\t\t'image_title': str(image_title[0]),\n",
|
87 |
+
"\t\t\t\t'image_file': f\"{image_id}_{image_block_id}.png\",\n",
|
88 |
+
"\t\t\t\t'image_path': os.path.join(source_files, f\"{image_id}_{image_block_id}.png\")\n",
|
89 |
+
"\t\t\t}\n",
|
90 |
+
"\t\t# save the images to the local file system\n",
|
91 |
+
"\t\t\tpix = fitz.Pixmap(pdf_file, image[0])\n",
|
92 |
+
"\t\t\t# image file name is the image's name (e.g. 'Im0') plus image[0], the xref passed to fitz.Pixmap above\n",
|
93 |
+
" #pix.save(os.path.join(source_files, f\"{image_id}_{image_block_id}.png\"))\n",
|
94 |
+
"\t\t\tpix.save(os.path.join(source_files+'/'+pdf_name, f\"{image_id}_{image_block_id}.png\"))\n",
|
95 |
+
"\tprint(f\"Total Images: {image_counter}\")\n",
|
96 |
+
"\t\n",
|
97 |
+
"\twith open(os.path.join(source, f'metadata.json'),'w') as f:\n",
|
98 |
+
"\t\tjson.dump(metadata,f)\n",
|
99 |
+
"\treturn metadata"
|
100 |
+
]
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"cell_type": "code",
|
104 |
+
"execution_count": 33,
|
105 |
+
"id": "256db277",
|
106 |
+
"metadata": {},
|
107 |
+
"outputs": [
|
108 |
+
{
|
109 |
+
"name": "stdout",
|
110 |
+
"output_type": "stream",
|
111 |
+
"text": [
|
112 |
+
"[+] Page:0, Image : Im0, Block:726, Image Dim:(663, 268)\n",
|
113 |
+
"[+] Page:0, Image : Im1, Block:727, Image Dim:(600, 400)\n",
|
114 |
+
"[+] Page:1, Image : Im0, Block:141, Image Dim:(660, 375)\n",
|
115 |
+
"[+] Page:1, Image : Im0, Block:132, Image Dim:(660, 375)\n",
|
116 |
+
"[+] Page:2, Image : Im0, Block:206, Image Dim:(596, 398)\n",
|
117 |
+
"[+] Page:2, Image : Im0, Block:164, Image Dim:(596, 398)\n",
|
118 |
+
"[+] Page:2, Image : Im0, Block:150, Image Dim:(596, 398)\n",
|
119 |
+
"[+] Page:2, Image : Im0, Block:155, Image Dim:(596, 398)\n",
|
120 |
+
"[+] Page:2, Image : Im0, Block:203, Image Dim:(596, 398)\n",
|
121 |
+
"[+] Page:2, Image : Im0, Block:12, Image Dim:(596, 398)\n",
|
122 |
+
"[+] Page:2, Image : Im0, Block:160, Image Dim:(596, 398)\n",
|
123 |
+
"[+] Page:4, Image : Im0, Block:20, Image Dim:(1621, 1080)\n",
|
124 |
+
"[+] Page:5, Image : Im0, Block:23, Image Dim:(1620, 1080)\n",
|
125 |
+
"[+] Page:6, Image : Im0, Block:26, Image Dim:(1620, 1080)\n",
|
126 |
+
"[+] Page:8, Image : Im0, Block:32, Image Dim:(600, 400)\n",
|
127 |
+
"[+] Page:8, Image : Im1, Block:33, Image Dim:(1620, 1080)\n",
|
128 |
+
"[+] Page:9, Image : Im0, Block:36, Image Dim:(1621, 1080)\n",
|
129 |
+
"[+] Page:10, Image : Im0, Block:39, Image Dim:(1621, 1080)\n",
|
130 |
+
"[+] Page:12, Image : Im0, Block:45, Image Dim:(1621, 1080)\n",
|
131 |
+
"[+] Page:13, Image : Im0, Block:48, Image Dim:(1620, 1080)\n",
|
132 |
+
"[+] Page:14, Image : Im0, Block:51, Image Dim:(1620, 1080)\n",
|
133 |
+
"[+] Page:16, Image : Im0, Block:57, Image Dim:(1620, 1080)\n",
|
134 |
+
"[+] Page:17, Image : Im0, Block:60, Image Dim:(1620, 1080)\n",
|
135 |
+
"[+] Page:18, Image : Im0, Block:63, Image Dim:(1621, 1080)\n",
|
136 |
+
"[+] Page:20, Image : Im0, Block:69, Image Dim:(1621, 1080)\n",
|
137 |
+
"[+] Page:21, Image : Im0, Block:72, Image Dim:(1620, 1080)\n",
|
138 |
+
"[+] Page:22, Image : Im0, Block:75, Image Dim:(1620, 1080)\n",
|
139 |
+
"[+] Page:24, Image : Im0, Block:81, Image Dim:(1621, 1080)\n",
|
140 |
+
"[+] Page:25, Image : Im0, Block:84, Image Dim:(1620, 1080)\n",
|
141 |
+
"[+] Page:26, Image : Im0, Block:87, Image Dim:(1620, 1080)\n",
|
142 |
+
"[+] Page:28, Image : Im0, Block:93, Image Dim:(1620, 1080)\n",
|
143 |
+
"[+] Page:29, Image : Im0, Block:96, Image Dim:(1620, 1080)\n",
|
144 |
+
"[+] Page:30, Image : Im0, Block:99, Image Dim:(1573, 1051)\n",
|
145 |
+
"[+] Page:32, Image : Im0, Block:109, Image Dim:(161, 159)\n",
|
146 |
+
"Total Images: 34\n",
|
147 |
+
"[+] Page:2, Image : Im1, Block:216, Image Dim:(2474, 1547)\n",
|
148 |
+
"[+] Page:5, Image : Im3, Block:354, Image Dim:(1657, 1112)\n",
|
149 |
+
"[+] Page:7, Image : Im4, Block:390, Image Dim:(550, 550)\n",
|
150 |
+
"[+] Page:7, Image : Im5, Block:391, Image Dim:(1432, 909)\n",
|
151 |
+
"[+] Page:14, Image : Im6, Block:572, Image Dim:(863, 1030)\n",
|
152 |
+
"[+] Page:15, Image : Image11, Block:596, Image Dim:(514, 514)\n",
|
153 |
+
"[+] Page:15, Image : Image12, Block:597, Image Dim:(782, 446)\n",
|
154 |
+
"[+] Page:15, Image : Image13, Block:598, Image Dim:(782, 446)\n",
|
155 |
+
"[+] Page:15, Image : Image16, Block:599, Image Dim:(119, 132)\n",
|
156 |
+
"[+] Page:15, Image : Image9, Block:595, Image Dim:(104, 104)\n",
|
157 |
+
"[+] Page:16, Image : Image11, Block:617, Image Dim:(547, 400)\n",
|
158 |
+
"[+] Page:16, Image : Image12, Block:618, Image Dim:(119, 132)\n",
|
159 |
+
"[+] Page:16, Image : Image9, Block:616, Image Dim:(104, 104)\n",
|
160 |
+
"[+] Page:16, Image : Image10, Block:628, Image Dim:(104, 104)\n",
|
161 |
+
"[+] Page:16, Image : Image12, Block:629, Image Dim:(119, 132)\n",
|
162 |
+
"[+] Page:16, Image : Image9, Block:627, Image Dim:(599, 400)\n",
|
163 |
+
"[+] Page:17, Image : Image11, Block:644, Image Dim:(355, 227)\n",
|
164 |
+
"[+] Page:17, Image : Image12, Block:645, Image Dim:(120, 132)\n",
|
165 |
+
"[+] Page:17, Image : Image9, Block:643, Image Dim:(104, 104)\n",
|
166 |
+
"[+] Page:17, Image : Image11, Block:656, Image Dim:(363, 315)\n",
|
167 |
+
"[+] Page:17, Image : Image12, Block:657, Image Dim:(531, 400)\n",
|
168 |
+
"[+] Page:17, Image : Image15, Block:658, Image Dim:(119, 132)\n",
|
169 |
+
"[+] Page:17, Image : Image9, Block:655, Image Dim:(104, 104)\n",
|
170 |
+
"[+] Page:18, Image : Im13, Block:580, Image Dim:(116, 177)\n",
|
171 |
+
"[+] Page:18, Image : Image13, Block:679, Image Dim:(119, 132)\n",
|
172 |
+
"[+] Page:18, Image : Image14, Block:680, Image Dim:(329, 329)\n",
|
173 |
+
"[+] Page:18, Image : Image15, Block:681, Image Dim:(333, 327)\n",
|
174 |
+
"[+] Page:18, Image : Image9, Block:678, Image Dim:(104, 104)\n",
|
175 |
+
"[+] Page:22, Image : Im1, Block:216, Image Dim:(2474, 1547)\n",
|
176 |
+
"Total Images: 29\n"
|
177 |
+
]
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"source": [
|
181 |
+
"# list the files in the source directory\n",
|
182 |
+
"file_list = os.listdir(source)\n",
|
183 |
+
"\n",
|
184 |
+
"# collect the file names that end in 'pdf'\n",
|
185 |
+
"pdf_list=[]\n",
|
186 |
+
"[pdf_list.append(x) for x in file_list if x[-3:]=='pdf']\n",
|
187 |
+
"\n",
|
188 |
+
"# create a folder named after each PDF (name without the '.pdf' suffix) unless it already exists\n",
|
189 |
+
"for x in pdf_list:\n",
|
190 |
+
" if x[-3:]=='pdf':\n",
|
191 |
+
" if os.path.isdir(source+\"/\"+str(x[:-4])) == True :\n",
|
192 |
+
" pass #파일이 존재하면 pass\n",
|
193 |
+
" else:\n",
|
194 |
+
" os.mkdir(source+\"/\"+str(x[:-4]))\n",
|
195 |
+
"\n",
|
196 |
+
"# extract the images from each PDF and save them in the folder named after that PDF\n",
|
197 |
+
"for i in pdf_list:\n",
|
198 |
+
" load_pdf(source, source+\"/\"+i,str(i[:-4]))"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"cell_type": "code",
|
203 |
+
"execution_count": null,
|
204 |
+
"id": "0f5c21f0-2a9f-49cc-8699-65af38e58ee3",
|
205 |
+
"metadata": {},
|
206 |
+
"outputs": [],
|
207 |
+
"source": []
|
208 |
+
}
|
209 |
+
],
|
210 |
+
"metadata": {
|
211 |
+
"kernelspec": {
|
212 |
+
"display_name": "Python 3 (ipykernel)",
|
213 |
+
"language": "python",
|
214 |
+
"name": "python3"
|
215 |
+
},
|
216 |
+
"language_info": {
|
217 |
+
"codemirror_mode": {
|
218 |
+
"name": "ipython",
|
219 |
+
"version": 3
|
220 |
+
},
|
221 |
+
"file_extension": ".py",
|
222 |
+
"mimetype": "text/x-python",
|
223 |
+
"name": "python",
|
224 |
+
"nbconvert_exporter": "python",
|
225 |
+
"pygments_lexer": "ipython3",
|
226 |
+
"version": "3.10.13"
|
227 |
+
}
|
228 |
+
},
|
229 |
+
"nbformat": 4,
|
230 |
+
"nbformat_minor": 5
|
231 |
+
}
|