Muhammad Abdur Rahman Saad committed
Commit ab755b3 · 1 Parent(s): 236ef33

creation of flask api

Files changed (6)
  1. .gitignore +2 -1
  2. app.py +40 -0
  3. controllers/utils.py +2 -2
  4. requirements.txt +3 -1
  5. routes.py +14 -0
  6. source/eastmoney.py +1 -1
.gitignore CHANGED
@@ -5,4 +5,5 @@ venv
 __pycache__
 downloaded_file.pdf
 downloaded_file.docx
-downloaded_file.doc
+downloaded_file.doc
+.env
app.py ADDED
@@ -0,0 +1,40 @@
+from flask import Flask
+from flask_apscheduler import APScheduler
+from routes import bp
+from main import main as data_collection_main
+
+class Config:
+    SCHEDULER_API_ENABLED = True  # Enables the scheduler's API (optional, can be used for debugging)
+
+def create_app():
+    # Create the Flask application
+    app = Flask(__name__)
+
+    # Load config for APScheduler
+    app.config.from_object(Config())
+
+    # Register our Blueprint (imported from routes.py)
+    app.register_blueprint(bp)
+
+    # Initialize the APScheduler
+    scheduler = APScheduler()
+    scheduler.init_app(app)
+
+    # Schedule a cron job to run main() every day at 16:00
+    @scheduler.task('cron', id='daily_data_collection', hour=16, minute=0)
+    def scheduled_data_collection():
+        """
+        This function runs automatically every day at 16:00 server time.
+        It calls the Prefect flow defined in 'main.py' to collect data.
+        """
+        data_collection_main()
+
+    # Start the scheduler
+    scheduler.start()
+
+    return app
+
+if __name__ == '__main__':
+    # Create the Flask app and run it
+    app = create_app()
+    app.run(debug=True, port=5000)
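
Not part of this commit, but a short sketch of running the new app.py locally. With debug=True, Werkzeug's reloader re-executes the script in a child process, so create_app() and scheduler.start() can run twice and register the cron job twice; passing use_reloader=False (an assumption here, not something this diff changes) is one common way to avoid that while keeping the debugger.

# Sketch only: assumes app.py from this commit is on the import path.
from app import create_app

app = create_app()
# use_reloader=False keeps the APScheduler job from being started twice in debug mode.
app.run(debug=True, use_reloader=False, port=5000)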
controllers/utils.py CHANGED
@@ -719,8 +719,8 @@ def crawl_by_url(url, article):
                      article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    update_content(article)
-    vectorize(article)
+    # update_content(article)
+    # vectorize(article)
     # openai_vectorize(article)
 
     data = download_files_from_s3('data')
requirements.txt CHANGED
@@ -195,4 +195,6 @@ Werkzeug==3.0.3
 wrapt==1.16.0
 yarl==1.9.4
 prefect==2.20.2
-pycryptodome==3.21.0
+pycryptodome==3.21.0
+flask
+flask_apscheduler
routes.py ADDED
@@ -0,0 +1,14 @@
+from flask import Blueprint, jsonify
+from main import main as data_collection_main
+
+bp = Blueprint('data_routes', __name__)
+
+@bp.route('/trigger-data-collection', methods=['GET'])
+def trigger_data_collection():
+    """
+    This endpoint manually triggers the data collection process defined in main.py.
+
+    HTTP Method: GET (you can switch to POST if you prefer).
+    """
+    data_collection_main()  # This calls the Prefect flow that orchestrates data collection
+    return jsonify({"message": "Data collection triggered successfully"}), 200
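
For completeness, a minimal sketch (not in the commit) of exercising the new endpoint with Flask's built-in test client. It assumes create_app() from app.py above is importable, and it patches routes.data_collection_main so the real Prefect flow is not executed during the check.

from unittest.mock import patch
from app import create_app

# Patch the flow entry point so hitting the route does not trigger real data collection.
with patch('routes.data_collection_main') as fake_flow:
    client = create_app().test_client()
    resp = client.get('/trigger-data-collection')
    assert resp.status_code == 200
    assert resp.get_json() == {"message": "Data collection triggered successfully"}
    assert fake_flow.called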
source/eastmoney.py CHANGED
@@ -96,7 +96,7 @@ def _crawl(url, article, retries=3):
     reference_id = extract_reference(article)
     if reference_id:
         article['referenceid'] = reference_id
-    update_content(article)
+    # update_content(article)
     vectorize(article)
     # openai_vectorize(article)
 