Spaces:

broadfield-dev
/

repo_to_md

Running

App Files Community

broadfield-dev committed on Feb 26

Commit

287b4ab

verified ·

1 Parent(s): 0a9dfc8

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -51

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from flask import Flask, render_template, request, jsonify, send_file
-from huggingface_hub import HfApi
 import requests
 import base64
 import markdown
@@ -7,7 +7,6 @@ import json
 import mimetypes
 import os
 import io
-import uuid
 from pathlib import Path
 app = Flask(__name__)
@@ -59,9 +58,11 @@ def get_all_files(owner, repo, path="", is_hf=False):
         print(f"Error fetching repository contents from {api_url}: {str(e)}")
         return None
-def get_hf_files(repo, name, path=""):
     api = HfApi()
     try:
         file_list = api.list_repo_files(repo_id=f'{repo}/{name}', repo_type="space")
         print(f"Files in {repo}/{name}: {file_list}")
         processed_files = []
@@ -70,30 +71,24 @@ def get_hf_files(repo, name, path=""):
             os.makedirs(name)
         for file_path in file_list:
-            # Handle nested directories
-            if "/" in file_path:
-                dir_part, file_part = file_path.split("/", 1)
-                dir_path = os.path.join(name, dir_part)
-                if not os.path.exists(dir_path):
-                    os.makedirs(dir_path)
-                if "/" in file_part:
-                    processed_files.extend(get_hf_files(repo, name, dir_part))
-                    continue
-            # Fetch raw file content with authentication if needed (optional token)
             raw_url = f"https://huggingface.co/spaces/{repo}/{name}/raw/main/{file_path}"
             try:
                 response = requests.get(raw_url, timeout=10)
                 response.raise_for_status()
-                # Ensure we get raw content, not HTML
-                if response.headers.get('Content-Type', '').startswith('text/html'):
                     print(f"Warning: Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
                     continue
-                # Check if the response is a valid file (non-HTML, non-JSON)
-                if not response.headers.get('Content-Type', '').startswith(('text/plain', 'application/octet-stream', 'text/')):
-                    print(f"Unexpected content type for {file_path}: {response.headers.get('Content-Type', '')}")
                     continue
             except requests.exceptions.RequestException as e:
@@ -121,16 +116,26 @@ def get_hf_files(repo, name, path=""):
         print(f"Processed files: {processed_files}")
         return processed_files
     except Exception as e:
         print(f"Error processing Hugging Face files for {repo}/{name}: {str(e)}")
         return []
 def get_repo_contents(url):
-    """Parse URL and fetch repository contents."""
     try:
-        if "huggingface.co" in url:
             parts = url.rstrip('/').split('/')
             owner, repo = parts[-2], parts[-1]
             files = get_hf_files(owner, repo)
             if not files:  # Empty list is valid, but check for errors
                 raise Exception("No files found in the Hugging Face Space")
@@ -158,9 +163,10 @@ def process_file_content(file_info, owner, repo, is_hf=False):
             response.raise_for_status()
             # Ensure we get raw content, not HTML or JSON
-            if response.headers.get('Content-Type', '').startswith('text/html'):
                 raise Exception(f"Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
-            if response.headers.get('Content-Type', '').startswith('application/json'):
                 raise Exception(f"Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
             content_raw = response.content
@@ -292,41 +298,46 @@ def index():
 @app.route('/process', methods=['POST'])
 def process():
-    # Ensure consistent response structure
     response_data = {'markdown': '', 'html': '', 'filename': '', 'error': None}
-    if 'files[]' in request.files:
-        files = request.files.getlist('files[]')
-        if not files:
-            response_data['error'] = 'No files uploaded'
-            return jsonify(response_data), 400
-        markdown_content = create_markdown_document(files=files)
-        response_data['markdown'] = "```markdown\n" + markdown_content + "\n```"
-        response_data['html'] = markdown.markdown(markdown_content)
-        response_data['filename'] = "uploaded_files_summary.md"
-    else:
-        repo_url = request.json.get('repo_url')
-        if not repo_url:
-            response_data['error'] = 'Please provide a repository URL or upload files'
-            return jsonify(response_data), 400
-        markdown_content = create_markdown_document(repo_url)
-        owner, repo, contents, is_hf = get_repo_contents(repo_url)
-        if not owner:
-            response_data['error'] = markdown_content  # Error message from get_repo_contents
-            return jsonify(response_data), 400
-        response_data['markdown'] = markdown_content
-        response_data['html'] = markdown.markdown(markdown_content)
-        response_data['filename'] = f"{owner}_{repo}_summary.md"
     return jsonify(response_data)
 @app.route('/download', methods=['POST'])
 def download():
-    markdown_content = request.json.get('markdown')
-    filename = request.json.get('filename')
     buffer = io.BytesIO()
     buffer.write(markdown_content.encode('utf-8'))

 from flask import Flask, render_template, request, jsonify, send_file
+from huggingface_hub import HfApi, HfApiError
 import requests
 import base64
 import markdown
 import mimetypes
 import os
 import io
 from pathlib import Path
 app = Flask(__name__)
         print(f"Error fetching repository contents from {api_url}: {str(e)}")
         return None
+def get_hf_files(repo, name):
+    """Fetch all files from a Hugging Face Space with robust error handling."""
     api = HfApi()
     try:
+        # Use HfApi to list files, which is more reliable for Spaces
         file_list = api.list_repo_files(repo_id=f'{repo}/{name}', repo_type="space")
         print(f"Files in {repo}/{name}: {file_list}")
         processed_files = []
             os.makedirs(name)
         for file_path in file_list:
+            # Fetch raw file content with strict validation
             raw_url = f"https://huggingface.co/spaces/{repo}/{name}/raw/main/{file_path}"
             try:
                 response = requests.get(raw_url, timeout=10)
                 response.raise_for_status()
+                # Ensure we get raw content, not HTML or JSON
+                content_type = response.headers.get('Content-Type', '').lower()
+                if content_type.startswith('text/html'):
                     print(f"Warning: Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
                     continue
+                if content_type.startswith('application/json'):
+                    print(f"Warning: Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
+                    continue
+                # Verify it's a valid file (e.g., text/plain or binary)
+                if not content_type.startswith(('text/plain', 'application/octet-stream', 'text/')) and 'text/' not in content_type:
+                    print(f"Unexpected content type for {file_path}: {content_type}")
                     continue
             except requests.exceptions.RequestException as e:
         print(f"Processed files: {processed_files}")
         return processed_files
+    except HfApiError as e:
+        print(f"Hugging Face API error for {repo}/{name}: {str(e)}")
+        return []
     except Exception as e:
         print(f"Error processing Hugging Face files for {repo}/{name}: {str(e)}")
         return []
 def get_repo_contents(url):
+    """Parse URL and fetch repository contents with robust error handling."""
     try:
+        if "huggingface.co" in url.lower():
             parts = url.rstrip('/').split('/')
             owner, repo = parts[-2], parts[-1]
+            # Ensure the Space exists and is accessible
+            try:
+                api = HfApi()
+                api.list_repo_files(repo_id=f'{owner}/{repo}', repo_type="space")  # Pre-check
+            except HfApiError as e:
+                raise Exception(f"Hugging Face Space not found or inaccessible: {str(e)}")
             files = get_hf_files(owner, repo)
             if not files:  # Empty list is valid, but check for errors
                 raise Exception("No files found in the Hugging Face Space")
             response.raise_for_status()
             # Ensure we get raw content, not HTML or JSON
+            content_type = response.headers.get('Content-Type', '').lower()
+            if content_type.startswith('text/html'):
                 raise Exception(f"Received HTML instead of raw content for {file_path}: {response.text[:100]}...")
+            if content_type.startswith('application/json'):
                 raise Exception(f"Received JSON instead of raw content for {file_path}: {response.text[:100]}...")
             content_raw = response.content
 @app.route('/process', methods=['POST'])
 def process():
+    # Ensure consistent response structure as JSON, even for errors
     response_data = {'markdown': '', 'html': '', 'filename': '', 'error': None}
+    try:
+        if 'files[]' in request.files:
+            files = request.files.getlist('files[]')
+            if not files:
+                response_data['error'] = 'No files uploaded'
+                return jsonify(response_data), 400
+            markdown_content = create_markdown_document(files=files)
+            response_data['markdown'] = "```markdown\n" + markdown_content + "\n```"
+            response_data['html'] = markdown.markdown(markdown_content)
+            response_data['filename'] = "uploaded_files_summary.md"
+        else:
+            repo_url = request.json.get('repo_url', '').strip()
+            if not repo_url:
+                response_data['error'] = 'Please provide a repository URL or upload files'
+                return jsonify(response_data), 400
+            markdown_content = create_markdown_document(repo_url)
+            owner, repo, contents, is_hf = get_repo_contents(repo_url)
+            if not owner:
+                response_data['error'] = markdown_content
+                return jsonify(response_data), 400
+            response_data['markdown'] = markdown_content
+            response_data['html'] = markdown.markdown(markdown_content)
+            response_data['filename'] = f"{owner}_{repo}_summary.md"
+    except Exception as e:
+        response_data['error'] = f"Server error processing request: {str(e)}"
+        return jsonify(response_data), 500
     return jsonify(response_data)
 @app.route('/download', methods=['POST'])
 def download():
+    markdown_content = request.json.get('markdown', '')
+    filename = request.json.get('filename', 'document.md')
     buffer = io.BytesIO()
     buffer.write(markdown_content.encode('utf-8'))