Spaces:

mike23415
/

Reserch

Runtime error

App Files Files Community

mike23415 committed 18 days ago

Commit

f67e43c

verified ·

1 Parent(s): f9dfaf0

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -23

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import tempfile
 import jinja2
 import pdfkit
 import torch
 from threading import Thread
 from flask import Flask, request, send_file, jsonify
 from flask_cors import CORS
@@ -13,7 +14,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 os.environ['HF_HOME'] = '/app/.cache'
 os.environ['XDG_CACHE_HOME'] = '/app/.cache'
 app = Flask(__name__)
 CORS(app)
@@ -22,11 +29,20 @@ model_loaded = False
 load_error = None
 generator = None
 def load_model():
     global model_loaded, load_error, generator
     try:
         # Detect device and dtype automatically
         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
         model = AutoModelForCausalLM.from_pretrained(
             "gpt2-medium",
@@ -47,17 +63,15 @@ def load_model():
         )
         model_loaded = True
-        print(f"Model loaded on {model.device}")
     except Exception as e:
         load_error = str(e)
-        print(f"Model loading failed: {load_error}")
 # Start model loading in background thread
 Thread(target=load_model).start()
 # --------------------------------------------------
 # IEEE Format Template
 # --------------------------------------------------
@@ -102,14 +116,12 @@ IEEE_TEMPLATE = """
         {{ abstract }}
         <div class="keywords">Keywords— {{ keywords }}</div>
     </div>
     <div class="two-column">
         {% for section in sections %}
         <h2>{{ section.title }}</h2>
         {{ section.content }}
         {% endfor %}
     </div>
     <div class="references">
         <h2>References</h2>
         {% for ref in references %}
@@ -125,42 +137,58 @@ IEEE_TEMPLATE = """
 # --------------------------------------------------
 @app.route('/health', methods=['GET'])
 def health_check():
     if load_error:
         return jsonify({
             "status": "error",
             "message": f"Model failed to load: {load_error}"
         }), 500
     return jsonify({
         "status": "ready" if model_loaded else "loading",
         "model_loaded": model_loaded,
-        "device": "cuda" if torch.cuda.is_available() else "cpu"
-    }), 200 if model_loaded else 503
 @app.route('/generate', methods=['POST'])
 def generate_pdf():
     # Check model status
     if not model_loaded:
         return jsonify({
             "error": "Model not loaded yet",
             "status": "loading"
         }), 503
     try:
         # Validate input
         data = request.json
         if not data:
             return jsonify({"error": "No data provided"}), 400
         required = ['title', 'authors', 'content']
         if missing := [field for field in required if field not in data]:
             return jsonify({
                 "error": f"Missing fields: {', '.join(missing)}"
             }), 400
-        # Format content
         formatted = format_content(data['content'])
         # Generate HTML
         html = jinja2.Template(IEEE_TEMPLATE).render(
             title=data['title'],
@@ -183,33 +211,178 @@ def generate_pdf():
         }
         # Create temporary PDF
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as f:
-            pdfkit.from_string(html, f.name, options=options)
-            return send_file(f.name, mimetype='application/pdf')
     except Exception as e:
         return jsonify({"error": str(e)}), 500
     finally:
-        if 'f' in locals():
-            try: os.remove(f.name)
-            except: pass
 # --------------------------------------------------
 # Content Formatting
 # --------------------------------------------------
 def format_content(content):
     try:
-        prompt = f"Format this research content to IEEE standards:\n{str(content)}"
-        return generator(
             prompt,
-            max_new_tokens=512,
-            temperature=0.7,
             do_sample=True,
-            truncation=True
-        )[0]['generated_text']
     except Exception as e:
-        print(f"Formatting error: {str(e)}")
-        return content
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)

 import jinja2
 import pdfkit
 import torch
+import logging
 from threading import Thread
 from flask import Flask, request, send_file, jsonify
 from flask_cors import CORS
 os.environ['HF_HOME'] = '/app/.cache'
 os.environ['XDG_CACHE_HOME'] = '/app/.cache'
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s'
+)
+# Initialize Flask app
 app = Flask(__name__)
 CORS(app)
 load_error = None
 generator = None
+# Configure wkhtmltopdf
+# Use xvfb-run for headless PDF generation
+WKHTMLTOPDF_CMD = 'xvfb-run -a wkhtmltopdf'
+pdf_config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_CMD)
 def load_model():
     global model_loaded, load_error, generator
     try:
+        app.logger.info("Starting model loading process")
         # Detect device and dtype automatically
         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        app.logger.info(f"Device set to use {device}")
         model = AutoModelForCausalLM.from_pretrained(
             "gpt2-medium",
         )
         model_loaded = True
+        app.logger.info(f"Model loaded successfully on {model.device}")
     except Exception as e:
         load_error = str(e)
+        app.logger.error(f"Model loading failed: {load_error}", exc_info=True)
 # Start model loading in background thread
 Thread(target=load_model).start()
 # --------------------------------------------------
 # IEEE Format Template
 # --------------------------------------------------
         {{ abstract }}
         <div class="keywords">Keywords— {{ keywords }}</div>
     </div>
     <div class="two-column">
         {% for section in sections %}
         <h2>{{ section.title }}</h2>
         {{ section.content }}
         {% endfor %}
     </div>
     <div class="references">
         <h2>References</h2>
         {% for ref in references %}
 # --------------------------------------------------
 @app.route('/health', methods=['GET'])
 def health_check():
+    app.logger.info("Health check requested")
     if load_error:
+        app.logger.error(f"Health check failed: {load_error}")
         return jsonify({
             "status": "error",
             "message": f"Model failed to load: {load_error}"
         }), 500
+    status_code = 200 if model_loaded else 503
+    device_info = "cuda" if torch.cuda.is_available() else "cpu"
+    app.logger.info(f"Health check returning status: {'ready' if model_loaded else 'loading'}, device: {device_info}")
     return jsonify({
         "status": "ready" if model_loaded else "loading",
         "model_loaded": model_loaded,
+        "device": device_info
+    }), status_code
 @app.route('/generate', methods=['POST'])
 def generate_pdf():
     # Check model status
     if not model_loaded:
+        app.logger.error("PDF generation requested but model not loaded")
         return jsonify({
             "error": "Model not loaded yet",
             "status": "loading"
         }), 503
     try:
+        app.logger.info("Processing PDF generation request")
         # Validate input
         data = request.json
         if not data:
+            app.logger.error("No data provided in request")
             return jsonify({"error": "No data provided"}), 400
         required = ['title', 'authors', 'content']
         if missing := [field for field in required if field not in data]:
+            app.logger.error(f"Missing required fields: {missing}")
             return jsonify({
                 "error": f"Missing fields: {', '.join(missing)}"
             }), 400
+        app.logger.info(f"Received request with title: {data['title']}")
+        # Format content with model
+        app.logger.info("Formatting content using the model")
         formatted = format_content(data['content'])
+        app.logger.info("Creating HTML from template")
         # Generate HTML
         html = jinja2.Template(IEEE_TEMPLATE).render(
             title=data['title'],
         }
         # Create temporary PDF
+        app.logger.info("Generating PDF file")
+        pdf_path = None
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as f:
+                pdf_path = f.name
+            # Generate PDF using wkhtmltopdf with xvfb
+            pdfkit.from_string(html, pdf_path, options=options, configuration=pdf_config)
+            app.logger.info(f"PDF generated successfully at {pdf_path}")
+            return send_file(pdf_path, mimetype='application/pdf', as_attachment=True,
+                             download_name=f"{data['title'].replace(' ', '_')}.pdf")
+        except Exception as e:
+            app.logger.error(f"PDF generation failed: {str(e)}", exc_info=True)
+            raise
     except Exception as e:
+        app.logger.error(f"Request processing failed: {str(e)}", exc_info=True)
         return jsonify({"error": str(e)}), 500
     finally:
+        # Clean up temporary file
+        if 'pdf_path' in locals() and pdf_path:
+            try:
+                app.logger.info(f"Cleaning up temporary file {pdf_path}")
+                os.remove(pdf_path)
+            except Exception as e:
+                app.logger.warning(f"Failed to remove temporary file: {str(e)}")
 # --------------------------------------------------
 # Content Formatting
 # --------------------------------------------------
+def parse_formatted_content(text):
+    """Parse the generated text into structured sections"""
+    app.logger.info("Parsing formatted content")
+    try:
+        lines = text.split('\n')
+        # Default structure
+        result = {
+            'abstract': '',
+            'keywords': ['IEEE', 'format', 'research', 'paper'],
+            'sections': [],
+            'references': []
+        }
+        # Extract abstract (simple approach - first paragraph after "Abstract")
+        abstract_start = None
+        for i, line in enumerate(lines):
+            if line.strip().lower() == 'abstract':
+                abstract_start = i + 1
+                break
+        if abstract_start:
+            abstract_text = []
+            i = abstract_start
+            while i < len(lines) and not lines[i].strip().lower().startswith('keyword'):
+                if lines[i].strip():
+                    abstract_text.append(lines[i].strip())
+                i += 1
+            result['abstract'] = ' '.join(abstract_text)
+        # Extract keywords
+        for line in lines:
+            if line.strip().lower().startswith('keyword'):
+                # Extract keywords from the line
+                keyword_parts = line.split('—')
+                if len(keyword_parts) > 1:
+                    keywords = keyword_parts[1].strip().split(',')
+                    result['keywords'] = [k.strip() for k in keywords if k.strip()]
+                break
+        # Extract sections
+        current_section = None
+        section_content = []
+        # Skip lines until we find a section heading
+        started = False
+        for line in lines:
+            # Very basic heuristic for Roman numerals section headings
+            if line.strip() and (line.strip()[0].isupper() or line.strip()[0].isdigit()):
+                started = True
+            if not started:
+                continue
+            if line.strip() and (line.strip()[0].isupper() or line.strip()[0].isdigit()) and len(line.strip().split()) <= 6:
+                # This is likely a section heading
+                if current_section:
+                    # Save the previous section
+                    result['sections'].append({
+                        'title': current_section,
+                        'content': '\n'.join(section_content)
+                    })
+                    section_content = []
+                current_section = line.strip()
+            elif current_section and line.strip().lower() == 'references':
+                # We've reached the references section
+                if current_section:
+                    # Save the last section
+                    result['sections'].append({
+                        'title': current_section,
+                        'content': '\n'.join(section_content)
+                    })
+                break
+            elif current_section:
+                # Add to current section content
+                section_content.append(line)
+        # Extract references
+        in_references = False
+        for line in lines:
+            if line.strip().lower() == 'references':
+                in_references = True
+                continue
+            if in_references and line.strip():
+                result['references'].append(line.strip())
+        app.logger.info(f"Content parsed into {len(result['sections'])} sections and {len(result['references'])} references")
+        return result
+    except Exception as e:
+        app.logger.error(f"Error parsing formatted content: {str(e)}", exc_info=True)
+        # Return a basic structure if parsing fails
+        return {
+            'abstract': 'Error parsing content.',
+            'keywords': ['IEEE', 'format'],
+            'sections': [{'title': 'Content', 'content': text}],
+            'references': []
+        }
 def format_content(content):
+    """Format the content using the ML model"""
     try:
+        app.logger.info("Formatting content with ML model")
+        prompt = f"Format this research content to IEEE standards with sections, abstract, and references:\n\n{str(content)}"
+        response = generator(
             prompt,
+            max_new_tokens=1024,  # Increased for more complete generation
+            temperature=0.5,      # More deterministic output
             do_sample=True,
+            truncation=True,
+            num_return_sequences=1
+        )
+        generated_text = response[0]['generated_text']
+        # Remove the prompt from the generated text
+        if prompt in generated_text:
+            formatted_text = generated_text[len(prompt):].strip()
+        else:
+            formatted_text = generated_text
+        app.logger.info("Content formatted successfully")
+        # Parse the formatted text into structured sections
+        return parse_formatted_content(formatted_text)
     except Exception as e:
+        app.logger.error(f"Error formatting content: {str(e)}", exc_info=True)
+        # Return the original content if formatting fails
+        return {
+            'abstract': 'Content processing error.',
+            'keywords': ['IEEE', 'format'],
+            'sections': [{'title': 'Content', 'content': str(content)}],
+            'references': []
+        }
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)