dwb2023 committed on
Commit
3108ebd
·
1 Parent(s): 7b1d8b6
app.py CHANGED
@@ -15,10 +15,8 @@ Welcome to the **GDELT Insight Explorer**, a multi-faceted platform that leverag
15
  - Explore datasets, visualize event relationships, and analyze network structures.
16
 
17
  **Available Pages:**
18
- - **🗺️ COVID Navigator:** Dive into curated COVID-related event data.
19
  - **🔍 COVID Event Graph Explorer:** Inspect detailed event records and their interconnections.
20
- - **🌐 Global Network Analysis:** Visualize and analyze the global network of events.
21
- - **🗺️ Feb 2025 Navigator:** Investigate recent event data with advanced filtering.
22
  - **🔍 Feb 2025 Event Graph Explorer:** Inspect detailed event records and their interconnections.
23
  - **🧪 Feb 2025 Dataset Experimentation:** An experiment using the HF dataset directly to investigate impact on query behavior and performance.
24
  """)
 
15
  - Explore datasets, visualize event relationships, and analyze network structures.
16
 
17
  **Available Pages:**
 
18
  - **🔍 COVID Event Graph Explorer:** Inspect detailed event records and their interconnections.
19
+ - **🌐 COVID Network Analysis:** Visualize and analyze the global network of events.
 
20
  - **🔍 Feb 2025 Event Graph Explorer:** Inspect detailed event records and their interconnections.
21
  - **🧪 Feb 2025 Dataset Experimentation:** An experiment using the HF dataset directly to investigate impact on query behavior and performance.
22
  """)
pages/{1_🗺️_COVID_Navigator.py → 1_🔍_COVID_Event_Graph.py} RENAMED
File without changes
pages/{3_🌐_COVID_Network_Analysis.py → 2_🌐_COVID_Network_Analysis.py} RENAMED
File without changes
pages/2_🔍_COVID_Event_Graph.py DELETED
@@ -1,189 +0,0 @@
1
- import streamlit as st
2
- import duckdb
3
- import pandas as pd
4
- from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
5
- from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
6
- from graph_builder import StLinkBuilder
7
-
8
- # Node styles configuration
9
- NODE_STYLES = [
10
- NodeStyle("EVENT", "#FF7F3E", "name", "description"),
11
- NodeStyle("PERSON", "#4CAF50", "name", "person"),
12
- NodeStyle("NAME", "#2A629A", "created_at", "badge"),
13
- NodeStyle("ORGANIZATION", "#9C27B0", "name", "business"),
14
- NodeStyle("LOCATION", "#2196F3", "name", "place"),
15
- NodeStyle("THEME", "#FFC107", "name", "sell"),
16
- NodeStyle("COUNT", "#795548", "name", "inventory"),
17
- NodeStyle("AMOUNT", "#607D8B", "name", "wallet"),
18
- ]
19
-
20
- # Edge styles configuration
21
- EDGE_STYLES = [
22
- EdgeStyle("MENTIONED_IN", caption="label", directed=True),
23
- EdgeStyle("LOCATED_IN", caption="label", directed=True),
24
- EdgeStyle("CATEGORIZED_AS", caption="label", directed=True)
25
- ]
26
-
27
- def initialize_db():
28
- """Initialize database connection and create dataset view"""
29
- con = duckdb.connect()
30
- con.execute("""
31
- CREATE VIEW negative_tone AS (
32
- SELECT *
33
- FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-march2020-v2@~parquet/default/negative_tone/*.parquet')
34
- );
35
- """)
36
- return con
37
-
38
- def fetch_data(con, source_filter=None,
39
- start_date=None, end_date=None, limit=50, include_all_columns=False):
40
- """Fetch filtered data from the database"""
41
- if include_all_columns:
42
- columns = "*"
43
- else:
44
- columns = "GKGRECORDID, DATE, SourceCommonName, tone, DocumentIdentifier, 'V2.1Quotations', SourceCollectionIdentifier"
45
-
46
- query = f"""
47
- SELECT {columns}
48
- FROM negative_tone
49
- WHERE TRUE
50
- """
51
- params = []
52
-
53
- if source_filter:
54
- query += " AND SourceCommonName ILIKE ?"
55
- params.append(f"%{source_filter}%")
56
- if start_date:
57
- query += " AND DATE >= ?"
58
- params.append(start_date)
59
- if end_date:
60
- query += " AND DATE <= ?"
61
- params.append(end_date)
62
- if limit:
63
- query += f" LIMIT {limit}"
64
-
65
- try:
66
- result = con.execute(query, params)
67
- return result.fetchdf()
68
- except Exception as e:
69
- st.error(f"Query execution failed: {str(e)}")
70
- return pd.DataFrame()
71
-
72
- def render_data_grid(df):
73
- """
74
- Render an interactive data grid (with built‑in filtering) and return the selected row.
75
- The grid is configured to show only the desired columns (ID, Date, Source, Tone)
76
- and allow filtering/search on each.
77
- """
78
- st.subheader("Search and Filter Records")
79
-
80
- # Build grid options with AgGrid
81
- gb = GridOptionsBuilder.from_dataframe(df)
82
- gb.configure_default_column(filter=True, sortable=True, resizable=True)
83
- # Enable single row selection
84
- gb.configure_selection('single', use_checkbox=False)
85
- grid_options = gb.build()
86
-
87
- # Render AgGrid (the grid will have a filter field for each column)
88
- grid_response = AgGrid(
89
- df,
90
- gridOptions=grid_options,
91
- update_mode=GridUpdateMode.SELECTION_CHANGED,
92
- height=400,
93
- fit_columns_on_grid_load=True
94
- )
95
-
96
- selected = grid_response.get('selected_rows')
97
- if selected is not None:
98
- # If selected is a DataFrame, use iloc to get the first row.
99
- if isinstance(selected, pd.DataFrame):
100
- if not selected.empty:
101
- return selected.iloc[0].to_dict()
102
- # Otherwise, if it's a list, get the first element.
103
- elif isinstance(selected, list) and len(selected) > 0:
104
- return selected[0]
105
- return None
106
-
107
- def render_graph(record):
108
- """
109
- Render a graph visualization for the selected record.
110
- Uses StLinkBuilder to convert the record into graph format and then
111
- displays the graph using st_link_analysis.
112
- """
113
- st.subheader(f"Event Graph: {record.get('GKGRECORDID', 'Unknown')}")
114
- stlink_builder = StLinkBuilder()
115
- # Convert the record (a Series) into a DataFrame with one row
116
- record_df = pd.DataFrame([record])
117
- graph_data = stlink_builder.build_graph(record_df)
118
- return st_link_analysis(
119
- elements=graph_data,
120
- layout="fcose", # Column configuration for data grid - cose, fcose, breadthfirst, cola
121
- node_styles=NODE_STYLES,
122
- edge_styles=EDGE_STYLES
123
- )
124
-
125
- def main():
126
- st.title("🔍 COVID Event Graph Explorer")
127
- st.markdown("""
128
- **Interactive Event Graph Viewer**
129
-
130
- Filter and select individual COVID-19 event records to display their detailed graph representations. Analyze relationships between events and associated entities using the interactive graph below.
131
- """)
132
-
133
- # Initialize database connection using context manager
134
- with initialize_db() as con:
135
- if con is not None:
136
- # Add UI components
137
-
138
- # Sidebar controls
139
- with st.sidebar:
140
- st.header("Search Filters")
141
- source = st.text_input("Filter by source name")
142
- start_date = st.text_input("Start date (YYYYMMDD)", "20200314")
143
- end_date = st.text_input("End date (YYYYMMDD)", "20200315")
144
- limit = st.slider("Number of results to display", 10, 500, 100)
145
-
146
- # Fetch initial data view
147
- df_initial = fetch_data(
148
- con=con,
149
- source_filter=source,
150
- start_date=start_date,
151
- end_date=end_date,
152
- limit=limit,
153
- include_all_columns=False
154
- )
155
-
156
- # Fetch full records for selection
157
- df_full = fetch_data(
158
- con=con,
159
- source_filter=source,
160
- start_date=start_date,
161
- end_date=end_date,
162
- limit=limit,
163
- include_all_columns=True
164
- )
165
-
166
- # Create a DataFrame for the grid with only the key columns
167
- grid_df = df_initial[['GKGRECORDID', 'DATE', 'SourceCommonName', 'tone', 'DocumentIdentifier', 'SourceCollectionIdentifier']].copy()
168
- grid_df.columns = ['ID', 'Date', 'Source', 'Tone', 'Doc ID', 'Source Collection ID']
169
-
170
- # Render the interactive data grid at the top
171
- selected_row = render_data_grid(grid_df)
172
-
173
- if selected_row:
174
- # Find the full record in the original DataFrame using the selected ID
175
- selected_id = selected_row['ID']
176
- full_record = df_full[df_full['GKGRECORDID'] == selected_id].iloc[0]
177
-
178
- # Display the graph and raw data below the grid
179
- render_graph(full_record)
180
- else:
181
- st.info("Use the grid filters above to search and select a record.")
182
-
183
- else:
184
- st.warning("No matching records found.")
185
-
186
- # Close database connection
187
- con.close()
188
-
189
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/{4_🗺️_Feb_2025_Navigator.py → 3_🔍_Feb_2025_Event_Graph.py} RENAMED
File without changes
pages/{6_🧪_Feb_2025_Dataset_Explorer.py → 4_🧪_Feb_2025_Dataset_Explorer.py} RENAMED
File without changes
pages/5_🔍_Feb_2025_Event_Graph.py DELETED
@@ -1,200 +0,0 @@
1
- import streamlit as st
2
- import duckdb
3
- import pandas as pd
4
- from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
5
- from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
6
- from graph_builder import StLinkBuilder
7
-
8
- # Node styles configuration
9
- NODE_STYLES = [
10
- NodeStyle("EVENT", "#FF7F3E", "name", "description"),
11
- NodeStyle("PERSON", "#4CAF50", "name", "person"),
12
- NodeStyle("NAME", "#2A629A", "created_at", "badge"),
13
- NodeStyle("ORGANIZATION", "#9C27B0", "name", "business"),
14
- NodeStyle("LOCATION", "#2196F3", "name", "place"),
15
- NodeStyle("THEME", "#FFC107", "name", "sell"),
16
- NodeStyle("COUNT", "#795548", "name", "inventory"),
17
- NodeStyle("AMOUNT", "#607D8B", "name", "wallet"),
18
- ]
19
-
20
- # Edge styles configuration
21
- EDGE_STYLES = [
22
- EdgeStyle("MENTIONED_IN", caption="label", directed=True),
23
- EdgeStyle("LOCATED_IN", caption="label", directed=True),
24
- EdgeStyle("CATEGORIZED_AS", caption="label", directed=True)
25
- ]
26
-
27
- def initialize_db():
28
- """Initialize database connection and create dataset view with optimized tone extraction"""
29
- con = duckdb.connect()
30
- con.execute("""
31
- CREATE VIEW tone_vw AS (
32
- SELECT
33
- * EXCLUDE ("V1.5Tone"),
34
- TRY_CAST(
35
- CASE
36
- WHEN POSITION(',' IN "V1.5Tone") > 0
37
- THEN SUBSTRING("V1.5Tone", 1, POSITION(',' IN "V1.5Tone") - 1)
38
- ELSE "V1.5Tone"
39
- END
40
- AS FLOAT
41
- ) AS tone
42
- FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-2025-v2/**/*.parquet')
43
- );
44
- """)
45
- return con
46
-
47
- def fetch_data(con, source_filter=None,
48
- start_date=None, end_date=None, limit=50, include_all_columns=False):
49
- """Fetch filtered data from the database"""
50
- if include_all_columns:
51
- columns = "*"
52
- else:
53
- # Changed column specification: use double quotes for column names with periods.
54
- columns = 'GKGRECORDID, DATE, SourceCommonName, tone, DocumentIdentifier, "V2.1SharingImage", "V2.1Quotations", SourceCollectionIdentifier'
55
-
56
- query = f"""
57
- SELECT {columns}
58
- FROM tone_vw
59
- WHERE TRUE
60
- """
61
- params = []
62
-
63
- if source_filter:
64
- query += " AND SourceCommonName ILIKE ?"
65
- params.append(f"%{source_filter}%")
66
- if start_date:
67
- query += " AND DATE >= ?"
68
- params.append(start_date)
69
- if end_date:
70
- query += " AND DATE <= ?"
71
- params.append(end_date)
72
- if limit:
73
- query += f" LIMIT {limit}"
74
-
75
- try:
76
- result = con.execute(query, params)
77
- return result.fetchdf()
78
- except Exception as e:
79
- st.error(f"Query execution failed: {str(e)}")
80
- return pd.DataFrame()
81
-
82
- def render_data_grid(df):
83
- """
84
- Render an interactive data grid (with built‑in filtering) and return the selected row.
85
- The grid is configured to show only the desired columns (ID, Date, Source, Tone)
86
- and allow filtering/search on each.
87
- """
88
- st.subheader("Search and Filter Records")
89
-
90
- # Build grid options with AgGrid
91
- gb = GridOptionsBuilder.from_dataframe(df)
92
- gb.configure_default_column(filter=True, sortable=True, resizable=True)
93
- # Enable single row selection
94
- gb.configure_selection('single', use_checkbox=False)
95
- grid_options = gb.build()
96
-
97
- # Render AgGrid (the grid will have a filter field for each column)
98
- grid_response = AgGrid(
99
- df,
100
- gridOptions=grid_options,
101
- update_mode=GridUpdateMode.SELECTION_CHANGED,
102
- height=400,
103
- fit_columns_on_grid_load=True
104
- )
105
-
106
- selected = grid_response.get('selected_rows')
107
- if selected is not None:
108
- # If selected is a DataFrame, use iloc to get the first row.
109
- if isinstance(selected, pd.DataFrame):
110
- if not selected.empty:
111
- return selected.iloc[0].to_dict()
112
- # Otherwise, if it's a list, get the first element.
113
- elif isinstance(selected, list) and len(selected) > 0:
114
- return selected[0]
115
- return None
116
-
117
- def render_graph(record):
118
- """
119
- Render a graph visualization for the selected record.
120
- Uses StLinkBuilder to convert the record into graph format and then
121
- displays the graph using st_link_analysis.
122
- """
123
- st.subheader(f"Event Graph: {record.get('GKGRECORDID', 'Unknown')}")
124
- stlink_builder = StLinkBuilder()
125
- # Convert the record (a Series) into a DataFrame with one row
126
- record_df = pd.DataFrame([record])
127
- graph_data = stlink_builder.build_graph(record_df)
128
- return st_link_analysis(
129
- elements=graph_data,
130
- layout="fcose", # Column configuration for data grid - cose, fcose, breadthfirst, cola
131
- node_styles=NODE_STYLES,
132
- edge_styles=EDGE_STYLES
133
- )
134
-
135
- def main():
136
- st.title("🔍 GDELT Feb 2025 Event Graph Explorer")
137
- st.markdown("""
138
- **Investigate Recent Global Events (Feb 2025) in an Interactive Event Graph Viewer**
139
-
140
- Filter and select individual event records to display their detailed graph representations. Analyze relationships between events and associated entities using the interactive graph below.
141
- """)
142
-
143
-
144
- # Initialize database connection using context manager
145
- with initialize_db() as con:
146
- if con is not None:
147
- # Add UI components
148
-
149
- # Sidebar controls
150
- with st.sidebar:
151
- st.header("Search Filters")
152
- source = st.text_input("Filter by source name")
153
- start_date = st.text_input("Start date (YYYYMMDD)", "20250210")
154
- end_date = st.text_input("End date (YYYYMMDD)", "20250211")
155
- limit = st.slider("Number of results to display", 10, 500, 100)
156
-
157
- # Fetch initial data view
158
- df_initial = fetch_data(
159
- con=con,
160
- source_filter=source,
161
- start_date=start_date,
162
- end_date=end_date,
163
- limit=limit,
164
- include_all_columns=False
165
- )
166
-
167
- # Fetch full records for selection
168
- df_full = fetch_data(
169
- con=con,
170
- source_filter=source,
171
- start_date=start_date,
172
- end_date=end_date,
173
- limit=limit,
174
- include_all_columns=True
175
- )
176
-
177
- # Create a DataFrame for the grid with only the key columns
178
- grid_df = df_initial[['GKGRECORDID', 'DATE', 'SourceCommonName', 'tone', 'DocumentIdentifier', "V2.1SharingImage", 'SourceCollectionIdentifier']].copy()
179
- grid_df.columns = ['ID', 'Date', 'Source', 'Tone', 'Doc ID', 'Image', 'Source Collection ID']
180
-
181
- # Render the interactive data grid at the top
182
- selected_row = render_data_grid(grid_df)
183
-
184
- if selected_row:
185
- # Find the full record in the original DataFrame using the selected ID
186
- selected_id = selected_row['ID']
187
- full_record = df_full[df_full['GKGRECORDID'] == selected_id].iloc[0]
188
-
189
- # Display the graph and raw data below the grid
190
- render_graph(full_record)
191
- else:
192
- st.info("Use the grid filters above to search and select a record.")
193
-
194
- else:
195
- st.warning("No matching records found.")
196
-
197
- # Close database connection
198
- con.close()
199
-
200
- main()