OxbridgeEconomics
committed on
Commit
·
24d2e54
1
Parent(s):
9176677
commit
Browse files
- mof.py +3 -4
- ndrc.py +10 -16
- requirements.txt +1 -1
mof.py
CHANGED
@@ -76,11 +76,10 @@ def extract_from_pdf(url):
|
|
76 |
from datetime import datetime, timedelta
|
77 |
from decimal import Decimal
|
78 |
import boto3
|
|
|
79 |
|
80 |
-
AWS_ACCESS_KEY_ID =
|
81 |
-
AWS_SECRET_ACCESS_KEY =
|
82 |
-
|
83 |
-
print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
84 |
|
85 |
def get_db_connection():
|
86 |
"""Get dynamoDB connection"""
|
|
|
76 |
from datetime import datetime, timedelta
|
77 |
from decimal import Decimal
|
78 |
import boto3
|
79 |
+
import os
|
80 |
|
81 |
+
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
|
82 |
+
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
|
|
|
|
|
83 |
|
84 |
def get_db_connection():
|
85 |
"""Get dynamoDB connection"""
|
ndrc.py
CHANGED
@@ -2,12 +2,19 @@ import requests
|
|
2 |
import uuid
|
3 |
import time
|
4 |
import urllib.request
|
|
|
|
|
|
|
|
|
5 |
from lxml import etree
|
6 |
from googletrans import Translator
|
7 |
from transformers import pipeline
|
8 |
from PyPDF2 import PdfReader
|
9 |
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
|
10 |
|
|
|
|
|
|
|
11 |
translator = Translator()
|
12 |
|
13 |
def datemodifier(date_string):
|
@@ -79,17 +86,6 @@ def extract_from_pdf(url):
|
|
79 |
extracted_text += text
|
80 |
return extracted_text, extracted_text_eng
|
81 |
|
82 |
-
"""Upload file to dynamoDB"""
|
83 |
-
# import datetime
|
84 |
-
from datetime import datetime, timedelta
|
85 |
-
from decimal import Decimal
|
86 |
-
import boto3
|
87 |
-
|
88 |
-
AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
|
89 |
-
AWS_SECRET_ACCESS_KEY = "D2A0IEVl5g3Ljbu0Y5iq9WuFETpDeoEpl69C+6xo"
|
90 |
-
|
91 |
-
print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
92 |
-
|
93 |
def get_db_connection():
|
94 |
"""Get dynamoDB connection"""
|
95 |
dynamodb = boto3.resource(
|
@@ -109,9 +105,9 @@ def upsert_content(report):
|
|
109 |
'id': str(report['id']),
|
110 |
'site': report['site'],
|
111 |
'title': report['title'],
|
112 |
-
'originalSite': report['originalSite'],
|
113 |
-
'originalTitle': report['originalTitle'],
|
114 |
-
'originalContent': report['originalContent'],
|
115 |
'category': report['category'],
|
116 |
# 'author': report['author'],
|
117 |
'content': report['content'],
|
@@ -126,7 +122,6 @@ def upsert_content(report):
|
|
126 |
response = table.put_item(Item=item)
|
127 |
print(response)
|
128 |
|
129 |
-
reportList = []
|
130 |
categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
|
131 |
for categoryu_url in categoryu_urls:
|
132 |
req = urllib.request.urlopen(categoryu_url)
|
@@ -196,7 +191,6 @@ for categoryu_url in categoryu_urls:
|
|
196 |
except Exception as error:
|
197 |
print(error)
|
198 |
|
199 |
-
reportList = []
|
200 |
categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
|
201 |
for categoryu_url in categoryu_urls:
|
202 |
req = urllib.request.urlopen(categoryu_url)
|
|
|
2 |
import uuid
|
3 |
import time
|
4 |
import urllib.request
|
5 |
+
from datetime import datetime, timedelta
|
6 |
+
from decimal import Decimal
|
7 |
+
import boto3
|
8 |
+
import os
|
9 |
from lxml import etree
|
10 |
from googletrans import Translator
|
11 |
from transformers import pipeline
|
12 |
from PyPDF2 import PdfReader
|
13 |
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
|
14 |
|
15 |
+
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
|
16 |
+
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
|
17 |
+
|
18 |
translator = Translator()
|
19 |
|
20 |
def datemodifier(date_string):
|
|
|
86 |
extracted_text += text
|
87 |
return extracted_text, extracted_text_eng
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
def get_db_connection():
|
90 |
"""Get dynamoDB connection"""
|
91 |
dynamodb = boto3.resource(
|
|
|
105 |
'id': str(report['id']),
|
106 |
'site': report['site'],
|
107 |
'title': report['title'],
|
108 |
+
# 'originalSite': report['originalSite'],
|
109 |
+
# 'originalTitle': report['originalTitle'],
|
110 |
+
# 'originalContent': report['originalContent'],
|
111 |
'category': report['category'],
|
112 |
# 'author': report['author'],
|
113 |
'content': report['content'],
|
|
|
122 |
response = table.put_item(Item=item)
|
123 |
print(response)
|
124 |
|
|
|
125 |
categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
|
126 |
for categoryu_url in categoryu_urls:
|
127 |
req = urllib.request.urlopen(categoryu_url)
|
|
|
191 |
except Exception as error:
|
192 |
print(error)
|
193 |
|
|
|
194 |
categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
|
195 |
for categoryu_url in categoryu_urls:
|
196 |
req = urllib.request.urlopen(categoryu_url)
|
requirements.txt
CHANGED
@@ -21,4 +21,4 @@ s3transfer==0.10.0
|
|
21 |
six==1.16.0
|
22 |
sniffio==1.3.1
|
23 |
urllib3==2.0.7
|
24 |
-
PyPDF2
|
|
|
21 |
six==1.16.0
|
22 |
sniffio==1.3.1
|
23 |
urllib3==2.0.7
|
24 |
+
PyPDF2==3.0.1
|