Edenhuang committed on
Commit
1705eb7
·
verified ·
1 Parent(s): ca93ad0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -66
app.py CHANGED
@@ -2,6 +2,9 @@ import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import pandas as pd
 
 
 
5
 
6
  # Set page configuration
7
  st.set_page_config(
@@ -69,80 +72,174 @@ def extract_data_from_html(html_content):
69
  return df
70
 
71
  # Function to extract data from the actual website
72
- @st.cache_data(ttl=3600) # Cache data for 1 hour
73
- def extract_data_from_website(url="https://mopsov.twse.com.tw/mops/web/t05sr01_1"):
 
 
 
 
 
 
 
 
 
74
  headers = {
75
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
76
- 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7'
 
 
 
 
 
 
 
 
 
77
  }
78
 
79
- try:
80
- with st.spinner('正在從網站擷取資料...'):
81
- # Make request to get the session cookies first
82
- session = requests.Session()
83
- session.get("https://mopsov.twse.com.tw/mops/web/index", headers=headers)
84
-
85
- # Now access the announcements page
86
- response = session.get(url, headers=headers)
87
-
88
- # Create form data for POST request to get the announcements
89
- form_data = {
90
- 'step': '1',
91
- 'firstin': '1',
92
- 'off': '1',
93
- 'keyword4': '',
94
- 'code1': '',
95
- 'TYPEK2': '',
96
- 'checkbtn': '',
97
- 'queryName': 'co_id',
98
- 'inpuType': 'co_id',
99
- 'TYPEK': 'all',
100
- 'co_id': '',
101
- 'year': '',
102
- 'month': '',
103
- 'day': '',
104
- 'b_date': '',
105
- 'e_date': '',
106
- }
107
-
108
- # Make POST request
109
- post_response = session.post(url, data=form_data, headers=headers)
110
-
111
- # Parse the HTML content
112
- df = extract_data_from_html(post_response.text)
113
-
114
- if not df.empty:
115
- st.success(f'成功擷取 {len(df)} 筆公告資料!')
116
- return df
117
- else:
118
- st.warning('無法從網站擷取資料,切換到範例資料')
119
- return None
120
 
121
- except Exception as e:
122
- st.error(f'訪問網站時發生錯誤: {e}')
123
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  # Example provided in the original code
126
  default_html_content = """
127
  <table class="hasBorder"><thead><tr class="tblHead_2"><th width="10%" nowrap="">公司代號</th><th width="10%" nowrap="">公司簡稱</th><th nowrap="">發言日期</th><th width="10%" nowrap="">發言時間</th><th>主旨</th></tr></thead><tbody id="tab2"><tr class="even_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='even_2';"><td>7724</td><td>諾亞克</td><td>114/04/01</td><td>00:06:30</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='1';document.fm_t05sr01_1.SPOKE_TIME.value='630';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='諾亞克';document.fm_t05sr01_1.COMPANY_ID.value='7724';document.fm_t05sr01_1.skey.value='7724202504011';document.fm_t05sr01_1.hhc_co_name.value='諾亞克';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不分配113年度董事及員工酬勞">公告本公司董事會決議不分配113年度董事......</button></td></tr><tr class="odd_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='odd_2';"><td>4117</td><td>普生</td><td>114/04/01</td><td>00:04:31</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='7';document.fm_t05sr01_1.SPOKE_TIME.value='431';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='普生';document.fm_t05sr01_1.COMPANY_ID.value='4117';document.fm_t05sr01_1.skey.value='4117202503317';document.fm_t05sr01_1.hhc_co_name.value='普生';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不發放股利">公告本公司董事會決議不發放股利</button></td></tr></tbody></table>
128
  """
129
 
130
- # Sidebar with data source options
131
  st.sidebar.header("資料來源選項")
132
  data_source = st.sidebar.radio(
133
  "選擇資料來源",
134
  ["從網站擷取資料", "使用範例資料", "貼上HTML代碼"]
135
  )
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  # Initialize data frame
138
  df = None
139
 
140
- # Process based on data source selection
141
  if data_source == "從網站擷取資料":
142
- df = extract_data_from_website()
143
- if df is None:
144
- st.sidebar.warning("從網站擷取資料失敗,切換到範例資料")
145
- df = extract_data_from_html(default_html_content)
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  elif data_source == "使用範例資料":
148
  df = extract_data_from_html(default_html_content)
@@ -158,11 +255,13 @@ if df is not None and not df.empty:
158
  st.subheader("台灣證券交易所公告資料")
159
 
160
  # Add search filters
161
- col1, col2 = st.columns(2)
162
  with col1:
163
  search_code = st.text_input("依公司代號篩選")
164
  with col2:
165
  search_name = st.text_input("依公司名稱篩選")
 
 
166
 
167
  # Apply filters if provided
168
  filtered_df = df.copy()
@@ -170,27 +269,51 @@ if df is not None and not df.empty:
170
  filtered_df = filtered_df[filtered_df['公司代號'].str.contains(search_code)]
171
  if search_name:
172
  filtered_df = filtered_df[filtered_df['公司簡稱'].str.contains(search_name)]
 
 
173
 
174
  # Display the data
175
  st.dataframe(filtered_df, use_container_width=True)
176
 
177
- # Download button
178
- csv = filtered_df.to_csv(index=False).encode('utf-8-sig')
179
- st.download_button(
180
- label="下載為CSV",
181
- data=csv,
182
- file_name="twse_announcements.csv",
183
- mime="text/csv",
184
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  # Display statistics
187
  st.subheader("資料統計")
188
- col1, col2 = st.columns(2)
189
  with col1:
190
  st.metric("公告總數", len(filtered_df))
191
  with col2:
192
  company_count = filtered_df['公司代號'].nunique()
193
  st.metric("公司數量", company_count)
 
 
 
 
 
 
194
 
195
  # Show announcement details on selection
196
  if not filtered_df.empty:
@@ -198,7 +321,7 @@ if df is not None and not df.empty:
198
  selected_indices = st.multiselect(
199
  "選擇公告",
200
  options=list(range(len(filtered_df))),
201
- format_func=lambda i: f"{filtered_df.iloc[i]['公司簡稱']} - {filtered_df.iloc[i]['主旨'][:20]}..."
202
  )
203
 
204
  if selected_indices:
@@ -214,4 +337,35 @@ else:
214
 
215
  # Footer
216
  st.markdown("---")
217
- st.markdown("台灣證券交易所公告擷取工具 | 資料來源: [台灣證券交易所](https://mopsov.twse.com.tw/mops/web/index)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import pandas as pd
5
+ import time
6
+ import random
7
+ from datetime import datetime
8
 
9
  # Set page configuration
10
  st.set_page_config(
 
72
  return df
73
 
74
  # Function to extract data from the actual website
75
class _FetchFailed(Exception):
    """Signal that no attempt produced a usable announcements table."""


@st.cache_data(ttl=1800)  # Cache successful fetches for 30 minutes
def _fetch_announcements(url, retries):
    """Fetch and parse the announcements table from the website.

    Returns the non-empty DataFrame built by ``extract_data_from_html``.
    Raises ``_FetchFailed`` when every attempt fails; raising instead of
    returning ``None`` keeps failures out of the ``st.cache_data`` cache, so
    the next call retries the site rather than getting a cached ``None``
    for the rest of the TTL.
    """
    # Local import: the module itself only imports ``datetime`` from ``datetime``.
    from datetime import timedelta, timezone

    # Rotating User-Agents to avoid detection
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    ]

    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        # 'br' is deliberately not advertised: requests can only decode Brotli
        # when the optional brotli package is installed, and an undecoded body
        # would never contain the "hasBorder" marker checked below.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0',
    }
    post_headers = {
        **headers,
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://mopsov.twse.com.tw',
        'Referer': url,
    }

    # Current date in ROC-calendar form. Taiwan time (UTC+8) is used explicitly
    # so the date does not depend on the time zone of the host running the app.
    now = datetime.now(timezone(timedelta(hours=8)))
    roc_year = now.year - 1911
    current_date = f"{roc_year}/{now.month:02d}/{now.day:02d}"

    # Form data for the POST request; identical for every attempt.
    form_data = {
        'step': '1',
        'firstin': '1',
        'off': '1',
        'keyword4': '',
        'code1': '',
        'TYPEK2': '',
        'checkbtn': '',
        'queryName': 'co_id',
        'inpuType': 'co_id',
        'TYPEK': 'all',
        'co_id': '',
        'year': str(roc_year),    # Current ROC year
        'month': str(now.month),  # Current month
        'day': str(now.day),      # Current day
        'b_date': '',
        'e_date': '',
        'skey': '',
        'date1': '',
        'date2': '',
    }

    for attempt in range(retries):
        try:
            # The session is a context manager so its connections are released
            # after every attempt, including failed ones.
            with st.spinner(f'正在從網站擷取資料... (嘗試 {attempt+1}/{retries})'), \
                    requests.Session() as session:
                # Initial visit to homepage to get cookies
                session.get("https://mopsov.twse.com.tw/mops/web/index", headers=headers, timeout=15)

                # Small delay to avoid triggering anti-scraping measures
                time.sleep(random.uniform(1, 3))

                # Visit the announcements page before posting the form
                session.get(url, headers=headers, timeout=15)

                # Another small delay
                time.sleep(random.uniform(1, 2))

                post_response = session.post(
                    url,
                    data=form_data,
                    headers=post_headers,
                    timeout=20,
                )

            if post_response.status_code != 200:
                st.warning(f'網站返回狀態碼: {post_response.status_code}。嘗試重新連接...')
            elif "hasBorder" not in post_response.text:
                # HTTP 200 but no announcements table: report that, not the status code.
                st.warning('網站回應中找不到公告表格。嘗試重新連接...')
            else:
                df = extract_data_from_html(post_response.text)
                if not df.empty:
                    st.success(f'成功擷取 {len(df)} 筆公告資料!')
                    return df

                st.info(f'網站回應成功,但找不到公告資料。可能是當日({current_date})尚無公告。')
                # Surface any message cell the page returned instead of data
                soup = BeautifulSoup(post_response.text, 'html.parser')
                messages = soup.find_all('td', {'class': 'compName'})
                if messages:
                    st.info(f"網站訊息: {messages[0].text.strip()}")
        except requests.exceptions.RequestException as e:
            st.warning(f'請求錯誤 (嘗試 {attempt+1}/{retries}): {str(e)}')
        except Exception as e:
            # Best-effort scraper: parsing problems are reported and retried.
            st.warning(f'處理錯誤 (嘗試 {attempt+1}/{retries}): {str(e)}')

        # Back off before the next attempt, whatever the failure was;
        # no pointless wait after the final attempt.
        if attempt + 1 < retries:
            time.sleep(2)

    raise _FetchFailed(url)


def extract_data_from_website(url="https://mopsov.twse.com.tw/mops/web/t05sr01_1", retries=3):
    """Fetch today's announcements from the website.

    Args:
        url: Announcements page that is requested and then posted to.
        retries: Maximum number of fetch attempts.

    Returns:
        The parsed, non-empty DataFrame, or ``None`` when every attempt
        failed. Only successful results are cached (30 minutes), so a
        failure is retried on the next call.
    """
    try:
        return _fetch_announcements(url, retries)
    except _FetchFailed:
        st.warning(f'嘗試 {retries} 次後仍無法從網站擷取資料,切換到範例資料')
        return None
189
 
190
  # Example provided in the original code
191
  default_html_content = """
192
  <table class="hasBorder"><thead><tr class="tblHead_2"><th width="10%" nowrap="">公司代號</th><th width="10%" nowrap="">公司簡稱</th><th nowrap="">發言日期</th><th width="10%" nowrap="">發言時間</th><th>主旨</th></tr></thead><tbody id="tab2"><tr class="even_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='even_2';"><td>7724</td><td>諾亞克</td><td>114/04/01</td><td>00:06:30</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='1';document.fm_t05sr01_1.SPOKE_TIME.value='630';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='諾亞克';document.fm_t05sr01_1.COMPANY_ID.value='7724';document.fm_t05sr01_1.skey.value='7724202504011';document.fm_t05sr01_1.hhc_co_name.value='諾亞克';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不分配113年度董事及員工酬勞">公告本公司董事會決議不分配113年度董事......</button></td></tr><tr class="odd_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='odd_2';"><td>4117</td><td>普生</td><td>114/04/01</td><td>00:04:31</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='7';document.fm_t05sr01_1.SPOKE_TIME.value='431';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='普生';document.fm_t05sr01_1.COMPANY_ID.value='4117';document.fm_t05sr01_1.skey.value='4117202503317';document.fm_t05sr01_1.hhc_co_name.value='普生';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不發放股利">公告本公司董事會決議不發放股利</button></td></tr></tbody></table>
193
  """
194
 
195
# Sidebar: choose where the announcement data comes from
st.sidebar.header("資料來源選項")
data_source = st.sidebar.radio(
    "選擇資料來源",
    ["從網站擷取資料", "使用範例資料", "貼上HTML代碼"],
)

# Date widgets are shown only for the "fetch from website" source.
# NOTE(review): no code visible in this diff reads year / month / start_date /
# end_date yet; the fetch call below is made without arguments.
if data_source == "從網站擷取資料":
    st.sidebar.subheader("日期選擇")

    # ROC (Taiwan calendar) year = Gregorian year - 1911
    current_year = datetime.now().year
    roc_year = current_year - 1911

    # Year and month side by side
    col1, col2 = st.sidebar.columns(2)
    year = col1.number_input("年度(民國)", min_value=100, max_value=roc_year, value=roc_year)
    month = col2.number_input("月份", min_value=1, max_value=12, value=datetime.now().month)

    # Optional explicit date range
    custom_date = st.sidebar.checkbox("指定日期範圍")
    if custom_date:
        start_date = st.sidebar.date_input("起始日期")
        end_date = st.sidebar.date_input("結束日期")
221
+
222
  # Initialize data frame
223
  df = None
224
 
225
# Website source: connection diagnostics plus an explicit fetch button
if data_source == "從網站擷取資料":
    with st.expander("網路連線診斷", expanded=False):
        st.write("檢查台灣證券交易所網站連線...")
        try:
            check_response = requests.get("https://mopsov.twse.com.tw/", timeout=5)
            st.write(f"網站狀態: {'可連線 ✅' if check_response.status_code == 200 else '無法連線 ❌'}")
            st.write(f"HTTP 狀態碼: {check_response.status_code}")
        except requests.exceptions.RequestException as e:
            # Only network failures are expected here; anything else is a real bug.
            st.write(f"網站連線檢查失敗: {e}")

    fetch_data = st.button("開始擷取資料", type="primary")
    if fetch_data:
        # The sidebar date inputs are not passed to the fetch yet.
        fetched_df = extract_data_from_website()
        if fetched_df is None:
            st.sidebar.warning("從網站擷取資料失敗,切換到範例資料")
            fetched_df = extract_data_from_html(default_html_content)
        st.session_state["website_df"] = fetched_df

    # st.button is True only on the rerun triggered by the click. Restoring the
    # last result from session state keeps the table visible when a later
    # widget interaction (e.g. typing in a filter box) reruns the script.
    df = st.session_state.get("website_df")
243
 
244
  elif data_source == "使用範例資料":
245
  df = extract_data_from_html(default_html_content)
 
255
  st.subheader("台灣證券交易所公告資料")
256
 
257
  # Add search filters
258
# One text box per searchable column, laid out side by side
col1, col2, col3 = st.columns(3)
search_code = col1.text_input("依公司代號篩選")
search_name = col2.text_input("依公司名稱篩選")
search_subject = col3.text_input("依主旨關鍵字篩選")
265
 
266
  # Apply filters if provided
267
  filtered_df = df.copy()
 
269
  filtered_df = filtered_df[filtered_df['公司代號'].str.contains(search_code)]
270
  if search_name:
271
  filtered_df = filtered_df[filtered_df['公司簡稱'].str.contains(search_name)]
272
+ if search_subject:
273
+ filtered_df = filtered_df[filtered_df['主旨'].str.contains(search_subject)]
274
 
275
  # Display the data
276
  st.dataframe(filtered_df, use_container_width=True)
277
 
278
# Download buttons: CSV and Excel exports of the filtered rows
from io import BytesIO  # local import: only the Excel export needs it

col1, col2 = st.columns(2)
with col1:
    # utf-8-sig adds a BOM to the CSV bytes
    csv = filtered_df.to_csv(index=False).encode('utf-8-sig')
    st.download_button(
        label="下載為CSV",
        data=csv,
        file_name="twse_announcements.csv",
        mime="text/csv",
    )
with col2:
    # Build the workbook in memory. Writing to a fixed file name on disk would
    # let concurrent sessions overwrite each other's export and leave a stray
    # file in the working directory.
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as excel_writer:
        filtered_df.to_excel(excel_writer, index=False, sheet_name='公告資料')
    excel_data = excel_buffer.getvalue()

    st.download_button(
        label="下載為Excel",
        data=excel_data,
        file_name="twse_announcements.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
302
 
303
  # Display statistics
304
  st.subheader("資料統計")
305
# Three summary metrics for the filtered rows
col1, col2, col3 = st.columns(3)
with col1:
    st.metric("公告總數", len(filtered_df))
with col2:
    company_count = filtered_df['公司代號'].nunique()
    st.metric("公司數量", company_count)
with col3:
    spoke_dates = filtered_df['發言日期']
    if not spoke_dates.empty:
        # value_counts() orders by frequency, so its first entry is the most
        # common date, not the latest. Take the maximum date instead.
        # Assumes dates are zero-padded "YYY/MM/DD" strings (as in the sample
        # data), for which string order equals date order.
        latest_date = spoke_dates.max()
        latest_count = int((spoke_dates == latest_date).sum())
        st.metric(f"最新日期 ({latest_date})", latest_count)
317
 
318
  # Show announcement details on selection
319
  if not filtered_df.empty:
 
321
  selected_indices = st.multiselect(
322
  "選擇公告",
323
  options=list(range(len(filtered_df))),
324
+ format_func=lambda i: f"{filtered_df.iloc[i]['公司簡稱']} ({filtered_df.iloc[i]['公司代號']}) - {filtered_df.iloc[i]['主旨'][:20]}..."
325
  )
326
 
327
  if selected_indices:
 
337
 
338
  # Footer
339
  st.markdown("---")
340
+ st.markdown("台灣證券交易所公告擷取工具 | 資料來源: [台灣證券交易所](https://mopsov.twse.com.tw/mops/web/index)")
341
+
342
+ # Add FAQ section at the bottom
343
+ with st.expander("常見問題", expanded=False):
344
+ st.subheader("常見問題")
345
+
346
+ st.markdown("""
347
+ **Q: 為什麼無法從網站擷取資料?**
348
+
349
+ A: 可能原因包括:
350
+ - 台灣證券交易所網站暫時無法連接
351
+ - 當日尚無公告資料
352
+ - 網站結構可能有所變更
353
+ - 網路連線問題
354
+
355
+ **Q: 資料顯示的日期格式是什麼?**
356
+
357
+ A: 發言日期採用中華民國紀年(民國紀年),例如「114/04/01」表示西元2025年4月1日。
358
+
359
+ **Q: 為什麼有些公告的主旨只顯示部分內容?**
360
+
361
+ A: 當主旨內容過長時,網站顯示會自動截斷。點選公告查看詳情可能會顯示完整主旨。
362
+
363
+ **Q: 如何取得更多歷史公告?**
364
+
365
+ A: 本工具目前僅擷取當前頁面資料。若需查詢歷史資料,建議直接前往[台灣證券交易所](https://mopsov.twse.com.tw/mops/web/index)官方網站搜尋。
366
+ """)
367
+
368
+ # Add version info
369
+ st.sidebar.markdown("---")
370
+ st.sidebar.caption("版本: 1.1.0")
371
+ st.sidebar.caption("最後更新: 2025-04-01")