OxbridgeEconomics committed
Commit b6dcee5 · Parent(s): b2a3d45
Commit message: commit

Files changed:
- cbirc.py +7 -7
- eastmoney.py +38 -32
- gov.py +36 -41
- utils.py +49 -17
- xpath.json +19 -0
cbirc.py
CHANGED
@@ -19,17 +19,17 @@ while i > -1:
         if parsed_datetime < (datetime.today() - timedelta(days=183)):
             i = -1
         else:
-            article['
-            if len(article['
+            article['originContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+            if len(article['originContent']) < 10:
                 continue
             CONTENT_ENG = ''
-            for element in article['
-                CONTENT_ENG += translate(element) + '
+            for element in article['originContent'].split("\n"):
+                CONTENT_ENG += translate(element) + '\n'
             article['content'] = CONTENT_ENG
             article['site'] = "National Financial Regulatory Administration of China"
-            article['
-            article['
-            article['title'] = translate(article['
+            article['originSite'] = "国家金融监督管理总局"
+            article['originTitle'] = article['docSubtitle']
+            article['title'] = translate(article['originTitle'])
             article['url'] = "https://www.cbirc.gov.cn" + article['pdfFileUrl']
             article['category']= "Policy Interpretation"
             article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
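
The new cbirc.py path feeds raw PDF text into the per-line translation loop, but the extract_from_pdf helper itself is not part of this diff. A minimal sketch of what such a helper could look like, assuming it downloads the PDF and joins the text of each page with newlines via PyPDF2 (which utils.py already imports as PdfReader); the body below is an assumption, not the committed implementation:

import io
import urllib.request
from PyPDF2 import PdfReader

def extract_from_pdf(url):
    # Hypothetical sketch: fetch the PDF and join the extracted text of all pages
    # with "\n", matching the split("\n") used by the caller in cbirc.py.
    pdf_bytes = urllib.request.urlopen(url).read()
    reader = PdfReader(io.BytesIO(pdf_bytes))
    pages = [page.extract_text() or '' for page in reader.pages]
    return "\n".join(pages)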
eastmoney.py
CHANGED
@@ -1,9 +1,45 @@
 import uuid
 import json
 import urllib.request
+from urllib.parse import urlparse
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
+
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    xpath_dict = json.load(f)
+
+def crawl(url, article):
+    domain = urlparse(url).netloc
+    req = urllib.request.urlopen(url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
+    article['link'] = url
+    if article['orgSName'] == "''":
+        article['site'] = translate(article['orgSName'])
+    else:
+        article['site'] = translate(article['orgName'])
+    article['originTitle'] = article['title']
+    article['title'] = translate(article['title'])
+    article['author'] = translate(article['researcher'])
+    article['originAuthor'] = article['researcher']
+    article['originContent'] = repr(originContent)
+    article['subtitle'] = translate(summary)
+    article['category'] = "Macroeconomic Research"
+    if len(article['originContent']) < 10:
+        return None
+    CONTENT_ENG = ''
+    for element in originContent.split("\n"):
+        CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)
+    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
+    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    upsert_content(article)
 
 today = datetime.today().strftime('%Y-%m-%d')
 beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
@@ -33,37 +69,7 @@ while i > -1:
     for article in reportinfo['data']:
         try:
             url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
-            reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
-            article['url'] = url
-            if article['orgSName'] == "''":
-                article['site'] = translate(article['orgSName'])
-                article['originalSite'] = article['orgSName']
-            else:
-                article['site'] = translate(article['orgName'])
-                article['originalSite'] = article['orgSName']
-            article['reporturl'] = reporturl
-            article['originalTitle'] = article['title']
-            article['title'] = translate(article['title'])
-            article['author'] = translate(article['researcher'])
-            article['originalAuthor'] = article['researcher']
-            article['originalContent'] = content
-            article['category'] = "Macroeconomic Research"
-            if len(article['originalContent']) < 10:
-                continue
-            CONTENT_ENG = ''
-            for element in article['originalContent'].split("。"):
-                CONTENT_ENG += translate(element) + ' '
-            article['content'] = CONTENT_ENG
-            article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
-            article['publishDate'] = datemodifier(article['publishDate'], "%Y-%m-%d %H:%M:%S.%f")
-            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-            upsert_content(article)
+            crawl(url,article)
         except Exception as error:
             print(error)
         else:
gov.py
CHANGED
@@ -1,9 +1,42 @@
 from datetime import datetime, timedelta
 import uuid
 import time
+from urllib.parse import urlparse
 import urllib.request
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, encode_content
+import json
+
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    xpath_dict = json.load(f)
+
+def crawl(url, article):
+    domain = urlparse(url).netloc
+    req = urllib.request.urlopen(url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['originContent'] = repr(originContent)
+    article['subtitle'] = translate(summary)
+    if len(article['originContent']) < 10:
+        return None
+    CONTENT_ENG = ''
+    for element in originContent.split("\n"):
+        print(element)
+        CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)
+    article['site'] = "State Council of China"
+    article['originSite'] = "国务院"
+    article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
+    article['title'] = translate(article['originTitle'])
+    article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
+    article['link'] = url
+    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    article['attachment'] = ""
+    upsert_content(article)
 
 i = 0
 while i > -1:
@@ -32,27 +65,8 @@ while i > -1:
                 article = {}
                 url = url.replace('../', 'https://www.gov.cn/zhengce/')
                 if "https://www.gov.cn" in url:
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "State Council of China"
-                    article['originalSite'] = "国务院"
-                    article['originalTitle'] = page.xpath("//title/text()")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
             except Exception as error:
                 print(error)
 
@@ -83,26 +97,7 @@ while i > -1:
                 article = {}
                 url = url.replace('../', 'https://www.gov.cn/zhengce/')
                 if "https://www.gov.cn" in url:
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(article['originalContent']) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "State Council of China"
-                    article['originalSite'] = "国务院"
-                    article['originalTitle'] = page.xpath("//title/text()")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Release"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
             except Exception as error:
                 print(error)
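
In both refactored crawlers the item id is still derived with uuid.uuid5 from the translated title plus the publish date, which keeps upsert_content idempotent: re-crawling the same article yields the same key. A short illustration (the title and date are made-up values):

import uuid

title = "Example Policy Interpretation"   # made-up values for illustration only
publish_date = "2024-01-15"
first = uuid.uuid5(uuid.NAMESPACE_OID, title + publish_date)
second = uuid.uuid5(uuid.NAMESPACE_OID, title + publish_date)
assert first == second   # uuid5 is deterministic, so the DynamoDB key is stable across runs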
utils.py
CHANGED
@@ -10,10 +10,10 @@ from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-
-
+# AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+# AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
+AWS_SECRET_ACCESS_KEY="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"
 
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
@@ -44,6 +44,38 @@ def translist(infolist):
     return out
 
 def encode(content):
+    """Encode Function"""
+    text = ''
+    for element in content:
+        if isinstance(element, etree._Element):
+            subelement = etree.tostring(element).decode()
+            subpage = etree.HTML(subelement)
+            tree = subpage.xpath('//text()')
+            line = ''.join(translist(tree)).\
+                replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+        else:
+            line = element
+        text += line
+    return text
+
+# def encode(content):
+#     """Encode Function"""
+#     text = ''
+#     for element in content:
+#         if isinstance(element, etree._Element):
+#             subelement = etree.tostring(element).decode()
+#             subpage = etree.HTML(subelement)
+#             tree = subpage.xpath('//text()')
+#             line = ''.join(translist(tree)).\
+#                 replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+#         else:
+#             line = element
+#         text += line
+#         index = text.find('打印本页')
+#         if index != -1:
+#             text = text[:index]
+
+def encode_content(content):
     """Encode Function"""
     text = ''
     for element in content:
@@ -109,9 +141,9 @@ def sentiment_computation(content):
     }
     sentiment_score = 0
     maximum_value = 0
-    raw_sentiment = analyzer(content[:512],
+    raw_sentiment = analyzer(content[:512], top_k=None)
     sentiment_label = None
-    for sentiment_dict in raw_sentiment
+    for sentiment_dict in raw_sentiment:
         value = sentiment_dict["score"]
         if value > maximum_value:
             sentiment_label = sentiment_dict["label"]
@@ -127,25 +159,25 @@ def sentiment_computation(content):
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
-    table = dynamodb.Table('
+    table = dynamodb.Table('article_test')
     # Define the item data
     item = {
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        # '
-
+        # 'originSite': report['originSite'],
+        'originTitle': report['originTitle'],
         'originContent': report['originContent'],
         'category': report['category'],
-
+        'author': report['author'],
         'content': report['content'],
-        'publishDate': report['
+        'publishDate': report['publishDate'],
         'link': report['link'],
-
+        'attachment': report['attachment'],
         # 'authorID': str(report['authorid']),
-        'entityList': report['entitylist'],
-        'sentimentScore': Decimal(str(report['
-        'sentimentLabel': report['
+        # 'entityList': report['entitylist'],
+        'sentimentScore': Decimal(str(report['sentimentScore'])).quantize(Decimal('0.01')),
+        'sentimentLabel': report['sentimentLabel'],
         'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
         'subtitle': report['subtitle']
     }
@@ -165,7 +197,7 @@ def get_client_connection():
 def delete_records(item):
     dynamodb_client = get_client_connection()
     dynamodb_client.delete_item(
-        TableName="
+        TableName="article_test",
         Key={
             'id': {'S': item['id']},
             'site': {'S': item['site']}
@@ -175,7 +207,7 @@ def delete_records(item):
 def update_content(report):
     dynamodb = get_client_connection()
     response = dynamodb.update_item(
-        TableName="
+        TableName="article_test",
        Key={
             'id': {'S': report['id']},
             'site': {'S': report['site']}
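
The sentiment_computation fix passes top_k=None so the FinBERT pipeline returns the score of every label for the truncated input rather than only the top one, which is what the loop over raw_sentiment expects. A hedged sketch of the call shape, with an illustrative input sentence:

from transformers import pipeline

analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# With top_k=None every class comes back for a single string input, e.g.
# [{'label': 'positive', 'score': ...}, {'label': 'negative', ...}, {'label': 'neutral', ...}]
raw_sentiment = analyzer("Industrial output grew faster than expected."[:512], top_k=None)
best = max(raw_sentiment, key=lambda d: d["score"])
print(best["label"], round(best["score"], 2))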
xpath.json
ADDED
@@ -0,0 +1,19 @@
+{
+    "data.eastmoney.com": {
+        "attachment": "//a[contains(@class, 'pdf-link')]/@href",
+        "content": "//div[contains(@class, 'ctx-content')]//p",
+        "datetime": {
+            "format_string": "%Y-%m-%d %H:%M:%S.%f"
+        }
+    },
+    "www.gov.cn": {
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "author": "//meta[@name = 'author']/@content",
+        "publishdate": "//meta[@name = 'firstpublishedtime']/@content",
+        "content": "//div[contains(@id, 'UCAP-CONTENT')]//p",
+        "datetime": {
+            "format_string": "%Y-%m-%d-%H:%M:%S"
+        }
+    }
+}
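
The www.gov.cn entry reads the publish date from the firstpublishedtime meta tag and hands the format string to datemodifier. Assuming that meta value looks like 2024-01-15-10:30:00 (an illustrative example; the dash between date and time is why the format string is "%Y-%m-%d-%H:%M:%S"), the parse step amounts to:

from datetime import datetime

raw = "2024-01-15-10:30:00"           # hypothetical firstpublishedtime value
parsed = datetime.strptime(raw, "%Y-%m-%d-%H:%M:%S")
print(parsed.strftime("%Y-%m-%d"))    # normalised date as stored in publishDate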