Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Page setup: the same title is used for the browser tab and the on-page header.
APP_TITLE = "台灣證券交易所公告擷取工具"
APP_DESCRIPTION = "這個應用程式可以擷取台灣證券交易所的公司公告資訊"

st.set_page_config(page_title=APP_TITLE, page_icon="📊", layout="wide")

# Header and one-line description shown at the top of the page.
st.title(APP_TITLE)
st.markdown(APP_DESCRIPTION)
|
16 |
+
|
17 |
+
def extract_data_from_html(html_content):
    """Parse the announcements table in ``html_content`` into a DataFrame.

    The first ``<table class="hasBorder">`` is located; its data rows are
    taken from ``<tbody>`` when present, otherwise from every ``<tr>``
    after the first one (treated as the header). Rows with fewer than
    five ``<td>`` cells are ignored.

    Returns a DataFrame with the columns 公司代號, 公司簡稱, 發言日期,
    發言時間 and 主旨. It is empty when no table or no usable row is found.
    """
    # One list per output column; insertion order fixes the column order.
    columns = {
        '公司代號': [],
        '公司簡稱': [],
        '發言日期': [],
        '發言時間': [],
        '主旨': [],
    }

    parsed = BeautifulSoup(html_content, 'html.parser')
    announcements_table = parsed.find('table', {'class': 'hasBorder'})
    if not announcements_table:
        return pd.DataFrame(columns)

    body = announcements_table.find('tbody')
    if body:
        table_rows = body.find_all('tr')
    else:
        # No <tbody>: drop the first row, which holds the header.
        table_rows = announcements_table.find_all('tr')[1:]

    for table_row in table_rows:
        tds = table_row.find_all('td')
        if len(tds) < 5:
            continue

        values = [td.text.strip() for td in tds[:4]]

        # Prefer the button's title attribute for the subject, falling
        # back to the cell's visible text when there is no such attribute.
        subject_td = tds[4]
        button = subject_td.find('button')
        if button and 'title' in button.attrs:
            values.append(button['title'].strip())
        else:
            values.append(subject_td.text.strip())

        for column_values, value in zip(columns.values(), values):
            column_values.append(value)

    return pd.DataFrame(columns)
|
70 |
+
|
71 |
+
# Fetching announcements from the live website
class _NoAnnouncementsError(Exception):
    """Raised when the fetched page contains no parsable announcement rows."""


@st.cache_data(ttl=3600)  # Cache successful fetches for 1 hour
def _fetch_announcements(url, timeout):
    """Download the announcements page at ``url`` and parse it.

    Every failure is reported by raising instead of returning a sentinel:
    only return values are stored by the cache, so a transient network
    error or an empty page is retried on the next run rather than being
    remembered for the whole TTL.

    Args:
        url: Address of the announcements page (used for GET and POST).
        timeout: Seconds passed to each ``requests`` call.

    Returns:
        A non-empty DataFrame produced by ``extract_data_from_html``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status on the POST.
        _NoAnnouncementsError: When the response holds no announcement rows.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    # Form fields posted to the announcements page.
    form_data = {
        'step': '1',
        'firstin': '1',
        'off': '1',
        'keyword4': '',
        'code1': '',
        'TYPEK2': '',
        'checkbtn': '',
        'queryName': 'co_id',
        'inpuType': 'co_id',
        'TYPEK': 'all',
        'co_id': '',
        'year': '',
        'month': '',
        'day': '',
        'b_date': '',
        'e_date': '',
    }

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        # Visit the index and the announcements page first so the session
        # holds whatever cookies the site sets before the POST.
        session.get(
            "https://mopsov.twse.com.tw/mops/web/index",
            headers=headers,
            timeout=timeout,
        )
        session.get(url, headers=headers, timeout=timeout)

        post_response = session.post(
            url, data=form_data, headers=headers, timeout=timeout
        )
        post_response.raise_for_status()

    df = extract_data_from_html(post_response.text)
    if df.empty:
        raise _NoAnnouncementsError(url)
    return df


def extract_data_from_website(url="https://mopsov.twse.com.tw/mops/web/t05sr01_1", timeout=10):
    """Fetch announcements from the website and report the outcome in the UI.

    Args:
        url: Address of the announcements page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The announcements DataFrame on success, or ``None`` when the page
        could not be fetched or contained no rows (a warning or error
        message is shown in that case).
    """
    try:
        with st.spinner('正在從網站擷取資料...'):
            df = _fetch_announcements(url, timeout)
    except _NoAnnouncementsError:
        st.warning('無法從網站擷取資料,切換到範例資料')
        return None
    except Exception as e:
        # UI boundary: show any failure to the user instead of crashing.
        st.error(f'訪問網站時發生錯誤: {e}')
        return None

    st.success(f'成功擷取 {len(df)} 筆公告資料!')
    return df


# Keep the cache-clearing hook that the decorated function used to expose.
extract_data_from_website.clear = _fetch_announcements.clear
|
124 |
+
|
125 |
+
# Sample announcements markup: one <table class="hasBorder"> with a header
# row and two data rows (company codes 7724 and 4117). Each subject cell
# wraps its text in a <button> whose title attribute holds the full subject;
# the first row's visible button text is a shortened form ending in "......".
default_html_content = """
<table class="hasBorder"><thead><tr class="tblHead_2"><th width="10%" nowrap="">公司代號</th><th width="10%" nowrap="">公司簡稱</th><th nowrap="">發言日期</th><th width="10%" nowrap="">發言時間</th><th>主旨</th></tr></thead><tbody id="tab2"><tr class="even_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='even_2';"><td>7724</td><td>諾亞克</td><td>114/04/01</td><td>00:06:30</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='1';document.fm_t05sr01_1.SPOKE_TIME.value='630';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='諾亞克';document.fm_t05sr01_1.COMPANY_ID.value='7724';document.fm_t05sr01_1.skey.value='7724202504011';document.fm_t05sr01_1.hhc_co_name.value='諾亞克';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不分配113年度董事及員工酬勞">公告本公司董事會決議不分配113年度董事......</button></td></tr><tr class="odd_2" onmouseover="this.className='mouseOn_2';" onmouseout="this.className='odd_2';"><td>4117</td><td>普生</td><td>114/04/01</td><td>00:04:31</td><td class="table02"><button style="width:300px;height:28px;text-align:left;background-color:transparent;border:0;cursor:pointer;" onclick="document.fm_t05sr01_1.step.value='1';document.fm_t05sr01_1.SEQ_NO.value='7';document.fm_t05sr01_1.SPOKE_TIME.value='431';document.fm_t05sr01_1.SPOKE_DATE.value='20250401';document.fm_t05sr01_1.COMPANY_NAME.value='普生';document.fm_t05sr01_1.COMPANY_ID.value='4117';document.fm_t05sr01_1.skey.value='4117202503317';document.fm_t05sr01_1.hhc_co_name.value='普生';openWindow(document.fm_t05sr01_1 ,'');" title="公告本公司董事會決議不發放股利">公告本公司董事會決議不發放股利</button></td></tr></tbody></table>
"""
|
129 |
+
|
130 |
+
# Sidebar with data source options
st.sidebar.header("資料來源選項")
data_source = st.sidebar.radio(
    "選擇資料來源",
    ["從網站擷取資料", "使用範例資料", "貼上HTML代碼"]
)

# Frame shown in the main area; stays None when nothing has been loaded yet.
df = None

# Process based on data source selection
if data_source == "從網站擷取資料":
    df = extract_data_from_website()
    if df is None:
        # Live fetch failed: fall back to the bundled sample table.
        st.sidebar.warning("從網站擷取資料失敗,切換到範例資料")
        df = extract_data_from_html(default_html_content)

elif data_source == "使用範例資料":
    df = extract_data_from_html(default_html_content)

else:  # "貼上HTML代碼"
    html_input = st.sidebar.text_area("貼上HTML代碼", value=default_html_content, height=300)
    if st.sidebar.button("解析HTML"):
        # A button is True only on the rerun triggered by its click, so the
        # parsed result is kept in session state; otherwise the table would
        # disappear on the next interaction (e.g. typing in a filter box).
        st.session_state["parsed_html_df"] = extract_data_from_html(html_input)
        st.sidebar.success("HTML解析完成!")
    df = st.session_state.get("parsed_html_df")
|
155 |
+
|
156 |
+
# Display and filter data
if df is not None and not df.empty:
    st.subheader("台灣證券交易所公告資料")

    # Search boxes for company code and company name, side by side.
    col1, col2 = st.columns(2)
    with col1:
        search_code = st.text_input("依公司代號篩選")
    with col2:
        search_name = st.text_input("依公司名稱篩選")

    # Apply filters if provided. The text is matched as a literal substring
    # (regex=False): user input such as "(" or "*" is not a valid regular
    # expression and would otherwise raise and break the page.
    filtered_df = df.copy()
    if search_code:
        filtered_df = filtered_df[filtered_df['公司代號'].str.contains(search_code, regex=False)]
    if search_name:
        filtered_df = filtered_df[filtered_df['公司簡稱'].str.contains(search_name, regex=False)]

    # Display the data
    st.dataframe(filtered_df, use_container_width=True)

    # Download button; utf-8-sig writes a BOM at the start of the CSV bytes.
    csv = filtered_df.to_csv(index=False).encode('utf-8-sig')
    st.download_button(
        label="下載為CSV",
        data=csv,
        file_name="twse_announcements.csv",
        mime="text/csv",
    )

    # Display statistics for the rows currently shown.
    st.subheader("資料統計")
    col1, col2 = st.columns(2)
    with col1:
        st.metric("公告總數", len(filtered_df))
    with col2:
        company_count = filtered_df['公司代號'].nunique()
        st.metric("公司數量", company_count)

    # Show announcement details on selection. Options are row positions in
    # filtered_df, so every lookup below uses .iloc.
    if not filtered_df.empty:
        st.subheader("選擇公告以查看詳情")
        selected_indices = st.multiselect(
            "選擇公告",
            options=list(range(len(filtered_df))),
            format_func=lambda i: f"{filtered_df.iloc[i]['公司簡稱']} - {filtered_df.iloc[i]['主旨'][:20]}..."
        )

        for idx in selected_indices:
            row = filtered_df.iloc[idx]
            with st.expander(f"{row['公司簡稱']} ({row['公司代號']}) - {row['發言日期']}"):
                st.write(f"**公司代號:** {row['公司代號']}")
                st.write(f"**公司簡稱:** {row['公司簡稱']}")
                st.write(f"**發言日期:** {row['發言日期']}")
                st.write(f"**發言時間:** {row['發言時間']}")
                st.write(f"**主旨內容:** {row['主旨']}")
else:
    st.warning("沒有可顯示的資料")
|
214 |
+
|
215 |
+
# Footer: a horizontal rule followed by the attribution line.
for footer_markdown in (
    "---",
    "台灣證券交易所公告擷取工具 | 資料來源: [台灣證券交易所](https://mopsov.twse.com.tw/mops/web/index)",
):
    st.markdown(footer_markdown)
|