Quentin Gallouédec committed on
Commit
94fb30f
·
1 Parent(s): d018cea

multiple pdfs

Browse files
Files changed (1) hide show
  1. app.py +32 -21
app.py CHANGED
@@ -68,28 +68,39 @@ def clean(text):
68
  return text
69
 
70
 
71
- def pdf2dataset(file, progress=gr.Progress()):
72
  progress(0, desc="Starting...")
73
- reader = PdfReader(file)
74
- num_pages = len(reader.pages)
75
- dataset_name = f"{random.getrandbits(128):x}"
 
 
 
 
 
 
76
  page_texts = []
77
-
78
- for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages"):
79
- page_text = page.extract_text()
80
- page_text = clean(page_text)
81
- page_texts.append(page_text)
82
-
 
 
 
 
 
83
  progress(0, desc="Uploading to Hugging Face...")
84
- dataset = Dataset.from_dict({"text": page_texts})
85
- dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
 
86
  progress(1, desc="Done!")
87
 
88
- instrctions = instructions_template.substitute(dataset_name=dataset_name)
89
- preview = dataset["text"][:10]
90
- preview = pd.DataFrame(preview, columns=["text"])
91
  print(f"Dataset {dataset_name} uploaded successfully.")
92
- return instrctions, preview, dataset_name
93
 
94
 
95
  def delete_dataset(dataset_name):
@@ -130,17 +141,17 @@ dataset = load_dataset("pdf2dataset/$dataset_name")
130
 
131
  with gr.Blocks() as demo:
132
  gr.Markdown("# PDF to 🤗 Dataset")
133
- gr.Markdown("## 1️⃣ Upload a PDF")
134
- file = gr.File(file_types=["pdf"], height=50)
135
  gr.Markdown(caution_text)
136
- gr.Markdown("## 2️⃣ Convert the PDF and upload")
137
  convert_button = gr.Button("🔄 Convert and upload")
138
  preview = gr.Dataframe(
139
- label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200
140
  )
141
  gr.Markdown("## 3️⃣ Use the dataset in your code")
142
  instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
143
- gr.Markdown("## 4️⃣ Delete the (optional)")
144
  dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
145
  delete_button = gr.Button("🗑️ Delete dataset")
146
 
 
68
  return text
69
 
70
 
71
+ def pdf2dataset(pathes, progress=gr.Progress()):
72
  progress(0, desc="Starting...")
73
+ readers = []
74
+ for path in pathes:
75
+ if not path.endswith(".pdf"):
76
+ raise gr.Error(f"Failed to read {path.split('/')[-1]}.")
77
+ readers = [PdfReader(path) for path in pathes]
78
+ num_pages = sum(len(reader.pages) for reader in readers)
79
+ filenames = [path.split("/")[-1] for path in pathes]
80
+
81
+ # Convert the PDFs to text
82
  page_texts = []
83
+ page_filenames = []
84
+ progress(0, desc="Converting pages...")
85
+ for reader, filename in zip(readers, filenames):
86
+ for page in reader.pages:
87
+ page_text = page.extract_text()
88
+ page_text = clean(page_text)
89
+ page_texts.append(page_text)
90
+ page_filenames.append(filename)
91
+ progress(len(page_texts) / num_pages, desc="Converting pages...")
92
+
93
+ # Upload the dataset to Hugging Face
94
  progress(0, desc="Uploading to Hugging Face...")
95
+ dataset = Dataset.from_dict({"text": page_texts, "source": page_filenames})
96
+ dataset_name = f"{random.getrandbits(128):x}"
97
+ dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("HF_TOKEN"))
98
  progress(1, desc="Done!")
99
 
100
+ instructions = instructions_template.substitute(dataset_name=dataset_name)
101
+ preview = pd.DataFrame(dataset[:10])
 
102
  print(f"Dataset {dataset_name} uploaded successfully.")
103
+ return instructions, preview, dataset_name
104
 
105
 
106
  def delete_dataset(dataset_name):
 
141
 
142
  with gr.Blocks() as demo:
143
  gr.Markdown("# PDF to 🤗 Dataset")
144
+ gr.Markdown("## 1️⃣ Upload PDFs")
145
+ file = gr.File(file_types=["pdf"], file_count="multiple")
146
  gr.Markdown(caution_text)
147
+ gr.Markdown("## 2️⃣ Convert the PDFs and upload")
148
  convert_button = gr.Button("🔄 Convert and upload")
149
  preview = gr.Dataframe(
150
+ label="Preview (first 10 rows)", headers=["text", "source"], datatype=["str", "str"], row_count=10, wrap=True, height=200
151
  )
152
  gr.Markdown("## 3️⃣ Use the dataset in your code")
153
  instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
154
+ gr.Markdown("## 4️⃣ Delete the dataset (optional)")
155
  dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
156
  delete_button = gr.Button("🗑️ Delete dataset")
157