OxbridgeEconomics committed on
Commit
24d2e54
·
1 Parent(s): 9176677
Files changed (3) hide show
  1. mof.py +3 -4
  2. ndrc.py +10 -16
  3. requirements.txt +1 -1
mof.py CHANGED
@@ -76,11 +76,10 @@ def extract_from_pdf(url):
76
  from datetime import datetime, timedelta
77
  from decimal import Decimal
78
  import boto3
 
79
 
80
- AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
81
- AWS_SECRET_ACCESS_KEY = "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"
82
-
83
- print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
84
 
85
  def get_db_connection():
86
  """Get dynamoDB connection"""
 
76
  from datetime import datetime, timedelta
77
  from decimal import Decimal
78
  import boto3
79
+ import os
80
 
81
+ AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
82
+ AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
 
 
83
 
84
  def get_db_connection():
85
  """Get dynamoDB connection"""
ndrc.py CHANGED
@@ -2,12 +2,19 @@ import requests
2
  import uuid
3
  import time
4
  import urllib.request
 
 
 
 
5
  from lxml import etree
6
  from googletrans import Translator
7
  from transformers import pipeline
8
  from PyPDF2 import PdfReader
9
  analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
10
 
 
 
 
11
  translator = Translator()
12
 
13
  def datemodifier(date_string):
@@ -79,17 +86,6 @@ def extract_from_pdf(url):
79
  extracted_text += text
80
  return extracted_text, extracted_text_eng
81
 
82
- """Upload file to dynamoDB"""
83
- # import datetime
84
- from datetime import datetime, timedelta
85
- from decimal import Decimal
86
- import boto3
87
-
88
- AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
89
- AWS_SECRET_ACCESS_KEY = "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"
90
-
91
- print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
92
-
93
  def get_db_connection():
94
  """Get dynamoDB connection"""
95
  dynamodb = boto3.resource(
@@ -109,9 +105,9 @@ def upsert_content(report):
109
  'id': str(report['id']),
110
  'site': report['site'],
111
  'title': report['title'],
112
- 'originalSite': report['originalSite'],
113
- 'originalTitle': report['originalTitle'],
114
- 'originalContent': report['originalContent'],
115
  'category': report['category'],
116
  # 'author': report['author'],
117
  'content': report['content'],
@@ -126,7 +122,6 @@ def upsert_content(report):
126
  response = table.put_item(Item=item)
127
  print(response)
128
 
129
- reportList = []
130
  categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
131
  for categoryu_url in categoryu_urls:
132
  req = urllib.request.urlopen(categoryu_url)
@@ -196,7 +191,6 @@ for categoryu_url in categoryu_urls:
196
  except Exception as error:
197
  print(error)
198
 
199
- reportList = []
200
  categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
201
  for categoryu_url in categoryu_urls:
202
  req = urllib.request.urlopen(categoryu_url)
 
2
  import uuid
3
  import time
4
  import urllib.request
5
+ from datetime import datetime, timedelta
6
+ from decimal import Decimal
7
+ import boto3
8
+ import os
9
  from lxml import etree
10
  from googletrans import Translator
11
  from transformers import pipeline
12
  from PyPDF2 import PdfReader
13
  analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
14
 
15
+ AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
16
+ AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
17
+
18
  translator = Translator()
19
 
20
  def datemodifier(date_string):
 
86
  extracted_text += text
87
  return extracted_text, extracted_text_eng
88
 
 
 
 
 
 
 
 
 
 
 
 
89
  def get_db_connection():
90
  """Get dynamoDB connection"""
91
  dynamodb = boto3.resource(
 
105
  'id': str(report['id']),
106
  'site': report['site'],
107
  'title': report['title'],
108
+ # 'originalSite': report['originalSite'],
109
+ # 'originalTitle': report['originalTitle'],
110
+ # 'originalContent': report['originalContent'],
111
  'category': report['category'],
112
  # 'author': report['author'],
113
  'content': report['content'],
 
122
  response = table.put_item(Item=item)
123
  print(response)
124
 
 
125
  categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
126
  for categoryu_url in categoryu_urls:
127
  req = urllib.request.urlopen(categoryu_url)
 
191
  except Exception as error:
192
  print(error)
193
 
 
194
  categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
195
  for categoryu_url in categoryu_urls:
196
  req = urllib.request.urlopen(categoryu_url)
requirements.txt CHANGED
@@ -21,4 +21,4 @@ s3transfer==0.10.0
21
  six==1.16.0
22
  sniffio==1.3.1
23
  urllib3==2.0.7
24
- PyPDF2
 
21
  six==1.16.0
22
  sniffio==1.3.1
23
  urllib3==2.0.7
24
+ PyPDF2==3.0.1