Muhammad Abdur Rahman Saad
committed on
Commit
·
ab755b3
1
Parent(s):
236ef33
creation of flask api
Browse files- .gitignore +2 -1
- app.py +40 -0
- controllers/utils.py +2 -2
- requirements.txt +3 -1
- routes.py +14 -0
- source/eastmoney.py +1 -1
.gitignore
CHANGED
@@ -5,4 +5,5 @@ venv
|
|
5 |
__pycache__
|
6 |
downloaded_file.pdf
|
7 |
downloaded_file.docx
|
8 |
-
downloaded_file.doc
|
|
|
|
5 |
__pycache__
|
6 |
downloaded_file.pdf
|
7 |
downloaded_file.docx
|
8 |
+
downloaded_file.doc
|
9 |
+
.env
|
app.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask
|
2 |
+
from flask_apscheduler import APScheduler
|
3 |
+
from routes import bp
|
4 |
+
from main import main as data_collection_main
|
5 |
+
|
6 |
+
class Config:
|
7 |
+
SCHEDULER_API_ENABLED = True # Enables the scheduler's API (optional, can be used for debugging)
|
8 |
+
|
9 |
+
def create_app():
|
10 |
+
# Create the Flask application
|
11 |
+
app = Flask(__name__)
|
12 |
+
|
13 |
+
# Load config for APScheduler
|
14 |
+
app.config.from_object(Config())
|
15 |
+
|
16 |
+
# Register our Blueprint (imported from routes.py)
|
17 |
+
app.register_blueprint(bp)
|
18 |
+
|
19 |
+
# Initialize the APScheduler
|
20 |
+
scheduler = APScheduler()
|
21 |
+
scheduler.init_app(app)
|
22 |
+
|
23 |
+
# Schedule a cron job to run main() every day at 16:00
|
24 |
+
@scheduler.task('cron', id='daily_data_collection', hour=16, minute=0)
|
25 |
+
def scheduled_data_collection():
|
26 |
+
"""
|
27 |
+
This function runs automatically every day at 16:00 server time.
|
28 |
+
It calls the Prefect flow defined in 'main.py' to collect data.
|
29 |
+
"""
|
30 |
+
data_collection_main()
|
31 |
+
|
32 |
+
# Start the scheduler
|
33 |
+
scheduler.start()
|
34 |
+
|
35 |
+
return app
|
36 |
+
|
37 |
+
if __name__ == '__main__':
|
38 |
+
# Create the Flask app and run it
|
39 |
+
app = create_app()
|
40 |
+
app.run(debug=True, port=5000)
|
controllers/utils.py
CHANGED
@@ -719,8 +719,8 @@ def crawl_by_url(url, article):
|
|
719 |
article['titleCN'] + article['publishDate'])
|
720 |
logging.info("%s - %s", article['id'], article['site'])
|
721 |
article['referenceid'] = None
|
722 |
-
update_content(article)
|
723 |
-
vectorize(article)
|
724 |
# openai_vectorize(article)
|
725 |
|
726 |
data = download_files_from_s3('data')
|
|
|
719 |
article['titleCN'] + article['publishDate'])
|
720 |
logging.info("%s - %s", article['id'], article['site'])
|
721 |
article['referenceid'] = None
|
722 |
+
# update_content(article)
|
723 |
+
# vectorize(article)
|
724 |
# openai_vectorize(article)
|
725 |
|
726 |
data = download_files_from_s3('data')
|
requirements.txt
CHANGED
@@ -195,4 +195,6 @@ Werkzeug==3.0.3
|
|
195 |
wrapt==1.16.0
|
196 |
yarl==1.9.4
|
197 |
prefect==2.20.2
|
198 |
-
pycryptodome==3.21.0
|
|
|
|
|
|
195 |
wrapt==1.16.0
|
196 |
yarl==1.9.4
|
197 |
prefect==2.20.2
|
198 |
+
pycryptodome==3.21.0
|
199 |
+
flask
|
200 |
+
flask_apscheduler
|
routes.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from flask import Blueprint, jsonify

from main import main as data_collection_main

# Blueprint grouping the data-collection HTTP endpoints.
bp = Blueprint('data_routes', __name__)


@bp.route('/trigger-data-collection', methods=['GET', 'POST'])
def trigger_data_collection():
    """Manually trigger the data-collection process defined in main.py.

    Accepts GET (browser-friendly, as before) and additionally POST — the
    semantically correct method for an endpoint with side effects — in a
    backward-compatible way.

    Returns:
        tuple: a JSON acknowledgement body and HTTP 200, sent only after
        the flow has finished (the call below blocks the request).
    """
    # Runs the Prefect flow that orchestrates data collection, synchronously.
    data_collection_main()
    return jsonify({"message": "Data collection triggered successfully"}), 200
|
source/eastmoney.py
CHANGED
@@ -96,7 +96,7 @@ def _crawl(url, article, retries=3):
|
|
96 |
reference_id = extract_reference(article)
|
97 |
if reference_id:
|
98 |
article['referenceid'] = reference_id
|
99 |
-
update_content(article)
|
100 |
vectorize(article)
|
101 |
# openai_vectorize(article)
|
102 |
|
|
|
96 |
reference_id = extract_reference(article)
|
97 |
if reference_id:
|
98 |
article['referenceid'] = reference_id
|
99 |
+
# update_content(article)
|
100 |
vectorize(article)
|
101 |
# openai_vectorize(article)
|
102 |
|