insight / pages /3_๐Ÿ”_Feb_2025_Event_Graph.py
dwb2023's picture
fix captions
260e16d
import streamlit as st
import duckdb
import pandas as pd
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
from graph_builder import StLinkBuilder
# Node styles configuration
NODE_STYLES = [
NodeStyle("EVENT", "#FF7F3E", "name", "description"),
NodeStyle("PERSON", "#4CAF50", "name", "person"),
NodeStyle("NAME", "#2A629A", "name", "badge"),
NodeStyle("ORGANIZATION", "#9C27B0", "name", "business"),
NodeStyle("LOCATION", "#2196F3", "name", "place"),
NodeStyle("THEME", "#FFC107", "name", "sell"),
NodeStyle("COUNT", "#795548", "name", "inventory"),
NodeStyle("AMOUNT", "#607D8B", "name", "wallet"),
]
# Edge styles configuration
EDGE_STYLES = [
EdgeStyle("MENTIONED_IN", caption="label", directed=True),
EdgeStyle("LOCATED_IN", caption="label", directed=True),
EdgeStyle("CATEGORIZED_AS", caption="label", directed=True)
]
# Constants for raw data categories
GDELT_CATEGORIES = {
"Metadata": ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier", "V2.1SharingImage", "V2.1Quotations", "tone"],
"Persons": ["V2EnhancedPersons", "V1Persons"],
"Organizations": ["V2EnhancedOrganizations", "V1Organizations"],
"Locations": ["V2EnhancedLocations", "V1Locations"],
"Themes": ["V2EnhancedThemes", "V1Themes"],
"Names": ["V2.1AllNames"],
"Counts": ["V2.1Counts", "V1Counts"],
"Amounts": ["V2.1Amounts"],
"V2GCAM": ["V2GCAM"],
"V2.1EnhancedDates": ["V2.1EnhancedDates"],
}
def initialize_db():
"""Initialize database connection and create dataset view with optimized tone extraction"""
con = duckdb.connect()
con.execute("""
CREATE VIEW tone_vw AS (
SELECT
* EXCLUDE ("V1.5Tone"),
TRY_CAST(
CASE
WHEN POSITION(',' IN "V1.5Tone") > 0
THEN SUBSTRING("V1.5Tone", 1, POSITION(',' IN "V1.5Tone") - 1)
ELSE "V1.5Tone"
END
AS FLOAT
) AS tone
FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-2025-v2/**/*.parquet')
);
""")
return con
def fetch_data(con, source_filter=None,
start_date=None, end_date=None, limit=100, include_all_columns=False):
"""Fetch filtered data from the database"""
if include_all_columns:
columns = "*"
else:
# Changed column specification: use double quotes for column names with periods.
columns = 'GKGRECORDID, DATE, SourceCommonName, tone, DocumentIdentifier, "V2.1SharingImage", "V2.1Quotations", SourceCollectionIdentifier'
query = f"""
SELECT {columns}
FROM tone_vw
WHERE TRUE
"""
params = []
if source_filter:
query += " AND SourceCommonName ILIKE ?"
params.append(f"%{source_filter}%")
if start_date:
query += " AND DATE >= ?"
params.append(start_date)
if end_date:
query += " AND DATE <= ?"
params.append(end_date)
if limit:
query += f" LIMIT {limit}"
try:
result = con.execute(query, params)
return result.fetchdf()
except Exception as e:
st.error(f"Query execution failed: {str(e)}")
return pd.DataFrame()
def render_data_grid(df):
"""
Render an interactive data grid (with builtโ€‘in filtering) and return the selected row.
The grid is configured to show only the desired columns (ID, Date, Source, Tone)
and allow filtering/search on each.
"""
st.subheader("Search and Filter Records")
# Build grid options with AgGrid
gb = GridOptionsBuilder.from_dataframe(df)
gb.configure_default_column(filter=True, sortable=True, resizable=True)
# Enable single row selection
gb.configure_selection('single', use_checkbox=False)
grid_options = gb.build()
# Render AgGrid (the grid will have a filter field for each column)
grid_response = AgGrid(
df,
gridOptions=grid_options,
update_mode=GridUpdateMode.SELECTION_CHANGED,
height=400,
fit_columns_on_grid_load=True
)
selected = grid_response.get('selected_rows')
if selected is not None:
# If selected is a DataFrame, use iloc to get the first row.
if isinstance(selected, pd.DataFrame):
if not selected.empty:
return selected.iloc[0].to_dict()
# Otherwise, if it's a list, get the first element.
elif isinstance(selected, list) and len(selected) > 0:
return selected[0]
return None
def render_graph(record):
"""
Render a graph visualization for the selected record.
Uses StLinkBuilder to convert the record into graph format and then
displays the graph using st_link_analysis.
"""
st.subheader(f"Event Graph: {record.get('GKGRECORDID', 'Unknown')}")
stlink_builder = StLinkBuilder()
# Convert the record (a Series) into a DataFrame with one row
record_df = pd.DataFrame([record])
graph_data = stlink_builder.build_graph(record_df)
return st_link_analysis(
elements=graph_data,
layout="fcose", # Column configuration for data grid - cose, fcose, breadthfirst, cola
node_styles=NODE_STYLES,
edge_styles=EDGE_STYLES
)
def render_raw_data(record):
"""Render raw GDELT data in expandable sections."""
st.header("Full Record Details")
for category, fields in GDELT_CATEGORIES.items():
with st.expander(f"{category}"):
for field in fields:
if field in record:
st.markdown(f"**{field}:**")
st.text(record[field])
st.divider()
def main():
st.title("๐Ÿ” GDELT Feb 2025 Event Graph Explorer")
st.markdown("""
**Investigate Recent Global Events (Feb 2025) in an Interactive Event Graph Viewer**
Filter and select individual event records to display their detailed graph representations. Analyze relationships between events and associated entities using the interactive graph below.
""")
# Initialize database connection using context manager
with initialize_db() as con:
if con is not None:
# Add UI components
# Sidebar controls
with st.sidebar:
st.header("Search Filters")
source = st.text_input("Filter by source name")
start_date = st.text_input("Start date (YYYYMMDD)", "20250210")
end_date = st.text_input("End date (YYYYMMDD)", "20250211")
limit = st.slider("Number of results to display", 10, 500, 100)
# Fetch initial data view
df_initial = fetch_data(
con=con,
source_filter=source,
start_date=start_date,
end_date=end_date,
limit=limit,
include_all_columns=False
)
# Fetch full records for selection
df_full = fetch_data(
con=con,
source_filter=source,
start_date=start_date,
end_date=end_date,
limit=limit,
include_all_columns=True
)
# Create a DataFrame for the grid with only the key columns
grid_df = df_initial[['GKGRECORDID', 'DATE', 'SourceCommonName', 'tone', 'DocumentIdentifier', "V2.1SharingImage", 'SourceCollectionIdentifier']].copy()
grid_df.columns = ['ID', 'Date', 'Source', 'Tone', 'Doc ID', 'Image', 'Source Collection ID']
# Render the interactive data grid at the top
selected_row = render_data_grid(grid_df)
if selected_row:
# Find the full record in the original DataFrame using the selected ID
selected_id = selected_row['ID']
full_record = df_full[df_full['GKGRECORDID'] == selected_id].iloc[0]
# Display the graph and raw data below the grid
render_graph(full_record)
render_raw_data(full_record)
else:
st.info("Use the grid filters above to search and select a record.")
else:
st.warning("No matching records found.")
# Close database connection
con.close()
main()