Quentin Gallouédec committed on
Commit
e6fad20
·
1 Parent(s): 94fb30f

allow api token

Browse files
Files changed (1) hide show
  1. app.py +57 -24
app.py CHANGED
@@ -68,13 +68,24 @@ def clean(text):
68
  return text
69
 
70
 
71
- def pdf2dataset(pathes, progress=gr.Progress()):
 
 
 
 
 
 
 
 
 
 
72
  progress(0, desc="Starting...")
73
  readers = []
74
  for path in pathes:
75
- if not path.endswith(".pdf"):
 
 
76
  raise gr.Error(f"Failed to read {path.split('/')[-1]}.")
77
- readers = [PdfReader(path) for path in pathes]
78
  num_pages = sum(len(reader.pages) for reader in readers)
79
  filenames = [path.split("/")[-1] for path in pathes]
80
 
@@ -93,48 +104,57 @@ def pdf2dataset(pathes, progress=gr.Progress()):
93
  # Upload the dataset to Hugging Face
94
  progress(0, desc="Uploading to Hugging Face...")
95
  dataset = Dataset.from_dict({"text": page_texts, "source": page_filenames})
96
- dataset_name = f"{random.getrandbits(128):x}"
97
- dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("HF_TOKEN"))
98
  progress(1, desc="Done!")
99
 
100
- instructions = instructions_template.substitute(dataset_name=dataset_name)
101
  preview = pd.DataFrame(dataset[:10])
102
- print(f"Dataset {dataset_name} uploaded successfully.")
103
- return instructions, preview, dataset_name
 
104
 
105
 
106
- def delete_dataset(dataset_name):
107
- api = HfApi()
108
- if "/" in dataset_name:
109
- user_id, dataset_name = dataset_name.split("/")
 
110
  else:
111
  user_id = "pdf2dataset"
 
 
 
 
112
  if not user_id == "pdf2dataset":
113
- print(f"Invalid namespace deteced in {dataset_name}.")
114
- return f"❌ Invalid namespace deteced: {user_id}"
115
- repo_id = f"{user_id}/{dataset_name}"
 
 
116
  try:
117
  api.delete_repo(repo_id, repo_type="dataset")
118
- print(f"Dataset {dataset_name} deleted successfully.")
119
  return "✅ Dataset deleted successfully."
120
  except Exception as e:
121
- print(f"Error deleting dataset{dataset_name}: {e}")
122
  return f"❌ Error deleting dataset: {e}"
123
 
124
 
125
  caution_text = """⚠️ Caution:
126
  - This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
127
  - Anyone (including you) will be able to delete the dataset once it is uploaded.
 
 
128
  """
129
 
130
  instructions_template = Template(
131
  """
132
- 🔗: https://huggingface.co/datasets/pdf2dataset/$dataset_name.
133
 
134
  ```python
135
  from datasets import load_dataset
136
 
137
- dataset = load_dataset("pdf2dataset/$dataset_name")
138
  ```
139
  """
140
  )
@@ -144,20 +164,33 @@ with gr.Blocks() as demo:
144
  gr.Markdown("## 1️⃣ Upload PDFs")
145
  file = gr.File(file_types=["pdf"], file_count="multiple")
146
  gr.Markdown(caution_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  gr.Markdown("## 2️⃣ Convert the PDFs and upload")
148
  convert_button = gr.Button("🔄 Convert and upload")
149
  preview = gr.Dataframe(
150
  label="Preview (first 10 rows)", headers=["text", "source"], datatype=["str", "str"], row_count=10, wrap=True, height=200
151
  )
152
  gr.Markdown("## 3️⃣ Use the dataset in your code")
153
- instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
154
  gr.Markdown("## 4️⃣ Delete the dataset (optional)")
155
- dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
156
  delete_button = gr.Button("🗑️ Delete dataset")
157
 
158
  # Define the actions
159
- convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
160
- delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
161
- dataset_name_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])
162
 
163
  demo.launch()
 
68
  return text
69
 
70
 
71
+ def pdf2dataset(pathes, user_id, dataset_id, token, progress=gr.Progress()):
72
+ if any([user_id, dataset_id, token]) and not all([user_id, dataset_id, token]):
73
+ raise gr.Error("Please provide all three: User ID, Dataset ID, and API token.")
74
+
75
+ if user_id == "":
76
+ user_id = "pdf2dataset"
77
+ if dataset_id == "":
78
+ dataset_id = f"{random.getrandbits(128):x}"
79
+ if token == "":
80
+ token = os.getenv("HF_TOKEN")
81
+
82
  progress(0, desc="Starting...")
83
  readers = []
84
  for path in pathes:
85
+ try:
86
+ readers.append(PdfReader(path))
87
+ except Exception as e:
88
  raise gr.Error(f"Failed to read {path.split('/')[-1]}.")
 
89
  num_pages = sum(len(reader.pages) for reader in readers)
90
  filenames = [path.split("/")[-1] for path in pathes]
91
 
 
104
  # Upload the dataset to Hugging Face
105
  progress(0, desc="Uploading to Hugging Face...")
106
  dataset = Dataset.from_dict({"text": page_texts, "source": page_filenames})
107
+ dataset.push_to_hub(f"{user_id}/{dataset_id}", token=token)
 
108
  progress(1, desc="Done!")
109
 
110
+ instructions = instructions_template.substitute(user_id=user_id, dataset_id=dataset_id)
111
  preview = pd.DataFrame(dataset[:10])
112
+ print(f"Dataset {dataset_id} uploaded successfully.")
113
+ delete_dataset_id = dataset_id if user_id == "pdf2dataset" else ""
114
+ return instructions, preview, delete_dataset_id
115
 
116
 
117
+ def delete_dataset(repo_id_or_dataset_id):
118
+ # Get the user_id, dataset_id
119
+ if "/" in repo_id_or_dataset_id:
120
+ user_id, dataset_id = repo_id_or_dataset_id.split("/")
121
+ repo_id = repo_id_or_dataset_id
122
  else:
123
  user_id = "pdf2dataset"
124
+ dataset_id = repo_id_or_dataset_id
125
+ repo_id = f"{user_id}/{dataset_id}"
126
+
127
+ # Only allow the deletion of datasets in the pdf2dataset namespace
128
  if not user_id == "pdf2dataset":
129
+ print(f"Deleting datasets in the {user_id} namespace is not allowed.")
130
+ return f"❌ Deleting datasets in the {user_id} namespace is not allowed."
131
+
132
+ # Delete the dataset
133
+ api = HfApi()
134
  try:
135
  api.delete_repo(repo_id, repo_type="dataset")
136
+ print(f"Dataset {repo_id} deleted successfully.")
137
  return "✅ Dataset deleted successfully."
138
  except Exception as e:
139
+ print(f"Error deleting dataset{repo_id}: {e}")
140
  return f"❌ Error deleting dataset: {e}"
141
 
142
 
143
  caution_text = """⚠️ Caution:
144
  - This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
145
  - Anyone (including you) will be able to delete the dataset once it is uploaded.
146
+
147
+ To avoid this, you can push the dataset to your personal Hugging Face account ⬇️
148
  """
149
 
150
  instructions_template = Template(
151
  """
152
+ 🔗: https://huggingface.co/datasets/$user_id/$dataset_id.
153
 
154
  ```python
155
  from datasets import load_dataset
156
 
157
+ dataset = load_dataset("$user_id/$dataset_id")
158
  ```
159
  """
160
  )
 
164
  gr.Markdown("## 1️⃣ Upload PDFs")
165
  file = gr.File(file_types=["pdf"], file_count="multiple")
166
  gr.Markdown(caution_text)
167
+ with gr.Accordion("🔒 Pushing to my personal Hugging Face account", open=False):
168
+ gr.Markdown(
169
+ """Recommended for API token
170
+ - Go to https://huggingface.co/settings/tokens?new_token=true
171
+ - Choose "Fine-grained"
172
+ - Check only _**Repos**/Write access to contents/settings of all repos under your personal namespace_
173
+ - Revoke the token after use"""
174
+ )
175
+ with gr.Row():
176
+ user_id = gr.Textbox(label="User ID", placeholder="Enter your Hugging Face user ID")
177
+ dataset_id = gr.Textbox(label="Dataset ID", placeholder="Enter the desired dataset ID")
178
+ token = gr.Textbox(label="API token", placeholder="Enter a Hugging Face API token")
179
+
180
  gr.Markdown("## 2️⃣ Convert the PDFs and upload")
181
  convert_button = gr.Button("🔄 Convert and upload")
182
  preview = gr.Dataframe(
183
  label="Preview (first 10 rows)", headers=["text", "source"], datatype=["str", "str"], row_count=10, wrap=True, height=200
184
  )
185
  gr.Markdown("## 3️⃣ Use the dataset in your code")
186
+ instructions = gr.Markdown(instructions_template.substitute(user_id="pdf2dataset", dataset_id="generated_dataset_id"))
187
  gr.Markdown("## 4️⃣ Delete the dataset (optional)")
188
+ dataset_id_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
189
  delete_button = gr.Button("🗑️ Delete dataset")
190
 
191
  # Define the actions
192
+ convert_button.click(pdf2dataset, inputs=[file, user_id, dataset_id, token], outputs=[instructions, preview, dataset_id_to_delete])
193
+ delete_button.click(delete_dataset, inputs=[dataset_id_to_delete], outputs=[delete_button])
194
+ dataset_id_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])
195
 
196
  demo.launch()