broadfield-dev committed on
Commit
287b4ab
·
verified ·
1 Parent(s): 0a9dfc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -51
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from flask import Flask, render_template, request, jsonify, send_file
2
- from huggingface_hub import HfApi
3
  import requests
4
  import base64
5
  import markdown
@@ -7,7 +7,6 @@ import json
7
  import mimetypes
8
  import os
9
  import io
10
- import uuid
11
  from pathlib import Path
12
 
13
  app = Flask(__name__)
@@ -59,9 +58,11 @@ def get_all_files(owner, repo, path="", is_hf=False):
59
  print(f"Error fetching repository contents from {api_url}: {str(e)}")
60
  return None
61
 
62
- def get_hf_files(repo, name, path=""):
 
63
  api = HfApi()
64
  try:
 
65
  file_list = api.list_repo_files(repo_id=f'{repo}/{name}', repo_type="space")
66
  print(f"Files in {repo}/{name}: {file_list}")
67
  processed_files = []
@@ -70,30 +71,24 @@ def get_hf_files(repo, name, path=""):
70
  os.makedirs(name)
71
 
72
  for file_path in file_list:
73
- # Handle nested directories
74
- if "/" in file_path:
75
- dir_part, file_part = file_path.split("/", 1)
76
- dir_path = os.path.join(name, dir_part)
77
- if not os.path.exists(dir_path):
78
- os.makedirs(dir_path)
79
- if "/" in file_part:
80
- processed_files.extend(get_hf_files(repo, name, dir_part))
81
- continue
82
-
83
- # Fetch raw file content with authentication if needed (optional token)
84
  raw_url = f"https://huggingface.co/spaces/{repo}/{name}/raw/main/{file_path}"
85
  try:
86
  response = requests.get(raw_url, timeout=10)
87
  response.raise_for_status()
88
 
89
- # Ensure we get raw content, not HTML
90
- if response.headers.get('Content-Type', '').startswith('text/html'):
 
91
  print(f"Warning: Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
92
  continue
 
 
 
93
 
94
- # Check if the response is a valid file (non-HTML, non-JSON)
95
- if not response.headers.get('Content-Type', '').startswith(('text/plain', 'application/octet-stream', 'text/')):
96
- print(f"Unexpected content type for {file_path}: {response.headers.get('Content-Type', '')}")
97
  continue
98
 
99
  except requests.exceptions.RequestException as e:
@@ -121,16 +116,26 @@ def get_hf_files(repo, name, path=""):
121
  print(f"Processed files: {processed_files}")
122
  return processed_files
123
 
 
 
 
124
  except Exception as e:
125
  print(f"Error processing Hugging Face files for {repo}/{name}: {str(e)}")
126
  return []
127
 
128
  def get_repo_contents(url):
129
- """Parse URL and fetch repository contents."""
130
  try:
131
- if "huggingface.co" in url:
132
  parts = url.rstrip('/').split('/')
133
  owner, repo = parts[-2], parts[-1]
 
 
 
 
 
 
 
134
  files = get_hf_files(owner, repo)
135
  if not files: # Empty list is valid, but check for errors
136
  raise Exception("No files found in the Hugging Face Space")
@@ -158,9 +163,10 @@ def process_file_content(file_info, owner, repo, is_hf=False):
158
  response.raise_for_status()
159
 
160
  # Ensure we get raw content, not HTML or JSON
161
- if response.headers.get('Content-Type', '').startswith('text/html'):
 
162
  raise Exception(f"Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
163
- if response.headers.get('Content-Type', '').startswith('application/json'):
164
  raise Exception(f"Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
165
 
166
  content_raw = response.content
@@ -292,41 +298,46 @@ def index():
292
 
293
  @app.route('/process', methods=['POST'])
294
  def process():
295
- # Ensure consistent response structure
296
  response_data = {'markdown': '', 'html': '', 'filename': '', 'error': None}
297
 
298
- if 'files[]' in request.files:
299
- files = request.files.getlist('files[]')
300
- if not files:
301
- response_data['error'] = 'No files uploaded'
302
- return jsonify(response_data), 400
303
-
304
- markdown_content = create_markdown_document(files=files)
305
- response_data['markdown'] = "```markdown\n" + markdown_content + "\n```"
306
- response_data['html'] = markdown.markdown(markdown_content)
307
- response_data['filename'] = "uploaded_files_summary.md"
308
- else:
309
- repo_url = request.json.get('repo_url')
310
- if not repo_url:
311
- response_data['error'] = 'Please provide a repository URL or upload files'
312
- return jsonify(response_data), 400
313
-
314
- markdown_content = create_markdown_document(repo_url)
315
- owner, repo, contents, is_hf = get_repo_contents(repo_url)
316
- if not owner:
317
- response_data['error'] = markdown_content # Error message from get_repo_contents
318
- return jsonify(response_data), 400
319
-
320
- response_data['markdown'] = markdown_content
321
- response_data['html'] = markdown.markdown(markdown_content)
322
- response_data['filename'] = f"{owner}_{repo}_summary.md"
 
 
 
 
 
323
 
324
  return jsonify(response_data)
325
 
326
  @app.route('/download', methods=['POST'])
327
  def download():
328
- markdown_content = request.json.get('markdown')
329
- filename = request.json.get('filename')
330
 
331
  buffer = io.BytesIO()
332
  buffer.write(markdown_content.encode('utf-8'))
 
1
  from flask import Flask, render_template, request, jsonify, send_file
2
+ from huggingface_hub import HfApi, HfApiError
3
  import requests
4
  import base64
5
  import markdown
 
7
  import mimetypes
8
  import os
9
  import io
 
10
  from pathlib import Path
11
 
12
  app = Flask(__name__)
 
58
  print(f"Error fetching repository contents from {api_url}: {str(e)}")
59
  return None
60
 
61
+ def get_hf_files(repo, name):
62
+ """Fetch all files from a Hugging Face Space with robust error handling."""
63
  api = HfApi()
64
  try:
65
+ # Use HfApi to list files, which is more reliable for Spaces
66
  file_list = api.list_repo_files(repo_id=f'{repo}/{name}', repo_type="space")
67
  print(f"Files in {repo}/{name}: {file_list}")
68
  processed_files = []
 
71
  os.makedirs(name)
72
 
73
  for file_path in file_list:
74
+ # Fetch raw file content with strict validation
 
 
 
 
 
 
 
 
 
 
75
  raw_url = f"https://huggingface.co/spaces/{repo}/{name}/raw/main/{file_path}"
76
  try:
77
  response = requests.get(raw_url, timeout=10)
78
  response.raise_for_status()
79
 
80
+ # Ensure we get raw content, not HTML or JSON
81
+ content_type = response.headers.get('Content-Type', '').lower()
82
+ if content_type.startswith('text/html'):
83
  print(f"Warning: Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
84
  continue
85
+ if content_type.startswith('application/json'):
86
+ print(f"Warning: Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
87
+ continue
88
 
89
+ # Verify it's a valid file (e.g., text/plain or binary)
90
+ if not content_type.startswith(('text/plain', 'application/octet-stream', 'text/')) and 'text/' not in content_type:
91
+ print(f"Unexpected content type for {file_path}: {content_type}")
92
  continue
93
 
94
  except requests.exceptions.RequestException as e:
 
116
  print(f"Processed files: {processed_files}")
117
  return processed_files
118
 
119
+ except HfApiError as e:
120
+ print(f"Hugging Face API error for {repo}/{name}: {str(e)}")
121
+ return []
122
  except Exception as e:
123
  print(f"Error processing Hugging Face files for {repo}/{name}: {str(e)}")
124
  return []
125
 
126
  def get_repo_contents(url):
127
+ """Parse URL and fetch repository contents with robust error handling."""
128
  try:
129
+ if "huggingface.co" in url.lower():
130
  parts = url.rstrip('/').split('/')
131
  owner, repo = parts[-2], parts[-1]
132
+ # Ensure the Space exists and is accessible
133
+ try:
134
+ api = HfApi()
135
+ api.list_repo_files(repo_id=f'{owner}/{repo}', repo_type="space") # Pre-check
136
+ except HfApiError as e:
137
+ raise Exception(f"Hugging Face Space not found or inaccessible: {str(e)}")
138
+
139
  files = get_hf_files(owner, repo)
140
  if not files: # Empty list is valid, but check for errors
141
  raise Exception("No files found in the Hugging Face Space")
 
163
  response.raise_for_status()
164
 
165
  # Ensure we get raw content, not HTML or JSON
166
+ content_type = response.headers.get('Content-Type', '').lower()
167
+ if content_type.startswith('text/html'):
168
  raise Exception(f"Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
169
+ if content_type.startswith('application/json'):
170
  raise Exception(f"Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
171
 
172
  content_raw = response.content
 
298
 
299
  @app.route('/process', methods=['POST'])
300
  def process():
301
+ # Ensure consistent response structure as JSON, even for errors
302
  response_data = {'markdown': '', 'html': '', 'filename': '', 'error': None}
303
 
304
+ try:
305
+ if 'files[]' in request.files:
306
+ files = request.files.getlist('files[]')
307
+ if not files:
308
+ response_data['error'] = 'No files uploaded'
309
+ return jsonify(response_data), 400
310
+
311
+ markdown_content = create_markdown_document(files=files)
312
+ response_data['markdown'] = "```markdown\n" + markdown_content + "\n```"
313
+ response_data['html'] = markdown.markdown(markdown_content)
314
+ response_data['filename'] = "uploaded_files_summary.md"
315
+ else:
316
+ repo_url = request.json.get('repo_url', '').strip()
317
+ if not repo_url:
318
+ response_data['error'] = 'Please provide a repository URL or upload files'
319
+ return jsonify(response_data), 400
320
+
321
+ markdown_content = create_markdown_document(repo_url)
322
+ owner, repo, contents, is_hf = get_repo_contents(repo_url)
323
+ if not owner:
324
+ response_data['error'] = markdown_content
325
+ return jsonify(response_data), 400
326
+
327
+ response_data['markdown'] = markdown_content
328
+ response_data['html'] = markdown.markdown(markdown_content)
329
+ response_data['filename'] = f"{owner}_{repo}_summary.md"
330
+
331
+ except Exception as e:
332
+ response_data['error'] = f"Server error processing request: {str(e)}"
333
+ return jsonify(response_data), 500
334
 
335
  return jsonify(response_data)
336
 
337
  @app.route('/download', methods=['POST'])
338
  def download():
339
+ markdown_content = request.json.get('markdown', '')
340
+ filename = request.json.get('filename', 'document.md')
341
 
342
  buffer = io.BytesIO()
343
  buffer.write(markdown_content.encode('utf-8'))