Muhammad Abdur Rahman Saad committed
Commit ab755b3 · 1 Parent(s): 236ef33

creation of flask api

Files changed (6)
  1. .gitignore +2 -1
  2. app.py +40 -0
  3. controllers/utils.py +2 -2
  4. requirements.txt +3 -1
  5. routes.py +14 -0
  6. source/eastmoney.py +1 -1
.gitignore CHANGED
@@ -5,4 +5,5 @@ venv
 __pycache__
 downloaded_file.pdf
 downloaded_file.docx
-downloaded_file.doc
+downloaded_file.doc
+.env
app.py ADDED
@@ -0,0 +1,40 @@
+from flask import Flask
+from flask_apscheduler import APScheduler
+from routes import bp
+from main import main as data_collection_main
+
+class Config:
+    SCHEDULER_API_ENABLED = True  # Enables the scheduler's API (optional, can be used for debugging)
+
+def create_app():
+    # Create the Flask application
+    app = Flask(__name__)
+
+    # Load config for APScheduler
+    app.config.from_object(Config())
+
+    # Register our Blueprint (imported from routes.py)
+    app.register_blueprint(bp)
+
+    # Initialize the APScheduler
+    scheduler = APScheduler()
+    scheduler.init_app(app)
+
+    # Schedule a cron job to run main() every day at 16:00
+    @scheduler.task('cron', id='daily_data_collection', hour=16, minute=0)
+    def scheduled_data_collection():
+        """
+        This function runs automatically every day at 16:00 server time.
+        It calls the Prefect flow defined in 'main.py' to collect data.
+        """
+        data_collection_main()
+
+    # Start the scheduler
+    scheduler.start()
+
+    return app
+
+if __name__ == '__main__':
+    # Create the Flask app and run it
+    app = create_app()
+    app.run(debug=True, port=5000)
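
Not part of this commit, but a short sketch of running the new app.py locally. With debug=True, Werkzeug's reloader re-executes the script in a child process, so create_app() and scheduler.start() can run twice and register the cron job twice; passing use_reloader=False (an assumption here, not something this diff changes) is one common way to avoid that while keeping the debugger.

# Sketch only: assumes app.py from this commit is on the import path.
from app import create_app

app = create_app()
# use_reloader=False keeps the APScheduler job from being started twice in debug mode.
app.run(debug=True, use_reloader=False, port=5000)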
controllers/utils.py CHANGED
@@ -719,8 +719,8 @@ def crawl_by_url(url, article):
                      article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    update_content(article)
-    vectorize(article)
+    # update_content(article)
+    # vectorize(article)
     # openai_vectorize(article)
 
     data = download_files_from_s3('data')
requirements.txt CHANGED
@@ -195,4 +195,6 @@ Werkzeug==3.0.3
 wrapt==1.16.0
 yarl==1.9.4
 prefect==2.20.2
-pycryptodome==3.21.0
+pycryptodome==3.21.0
+flask
+flask_apscheduler
routes.py ADDED
@@ -0,0 +1,14 @@
+from flask import Blueprint, jsonify
+from main import main as data_collection_main
+
+bp = Blueprint('data_routes', __name__)
+
+@bp.route('/trigger-data-collection', methods=['GET'])
+def trigger_data_collection():
+    """
+    This endpoint manually triggers the data collection process defined in main.py.
+
+    HTTP Method: GET (you can switch to POST if you prefer).
+    """
+    data_collection_main()  # This calls the Prefect flow that orchestrates data collection
+    return jsonify({"message": "Data collection triggered successfully"}), 200
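
For completeness, a minimal sketch (not in the commit) of exercising the new endpoint with Flask's built-in test client. It assumes create_app() from app.py above is importable, and it patches routes.data_collection_main so the real Prefect flow is not executed during the check.

from unittest.mock import patch
from app import create_app

# Patch the flow entry point so hitting the route does not trigger real data collection.
with patch('routes.data_collection_main') as fake_flow:
    client = create_app().test_client()
    resp = client.get('/trigger-data-collection')
    assert resp.status_code == 200
    assert resp.get_json() == {"message": "Data collection triggered successfully"}
    assert fake_flow.called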
source/eastmoney.py CHANGED
@@ -96,7 +96,7 @@ def _crawl(url, article, retries=3):
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    update_content(article)
+    # update_content(article)
     vectorize(article)
     # openai_vectorize(article)
 