File size: 7,919 Bytes
3bb5fb5 4b38188 260e16d 4b38188 3bb5fb5 4b38188 3bb5fb5 4b38188 3bb5fb5 7b1d8b6 3bb5fb5 7b1d8b6 3bb5fb5 7b1d8b6 3bb5fb5 4b38188 3bb5fb5 7b1d8b6 4b38188 3bb5fb5 4b38188 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
import streamlit as st
import duckdb
import pandas as pd
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
from graph_builder import StLinkBuilder
# Visual style for each node label rendered by st_link_analysis.
# Each spec is (label, color, icon); every node type uses the "name" field as
# its third positional argument.
# NOTE(review): NodeStyle's positional order is presumably
# (label, color, caption, icon) -- confirm against the st_link_analysis docs.
NODE_STYLES = [
    NodeStyle(label, color, "name", icon)
    for label, color, icon in (
        ("EVENT", "#FF7F3E", "description"),
        ("PERSON", "#4CAF50", "person"),
        ("NAME", "#2A629A", "badge"),
        ("ORGANIZATION", "#9C27B0", "business"),
        ("LOCATION", "#2196F3", "place"),
        ("THEME", "#FFC107", "sell"),
        ("COUNT", "#795548", "inventory"),
        ("AMOUNT", "#607D8B", "wallet"),
    )
]
# Visual style for each relationship type: every edge is drawn directed and
# captioned from its "label" field.
EDGE_STYLES = [
    EdgeStyle(relation, caption="label", directed=True)
    for relation in ("MENTIONED_IN", "LOCATED_IN", "CATEGORIZED_AS")
]
# Groups of raw record fields, keyed by the section title under which
# render_raw_data() displays them. Insertion order is the display order.
GDELT_CATEGORIES = {
    "Metadata": [
        "GKGRECORDID",
        "DATE",
        "SourceCommonName",
        "DocumentIdentifier",
        "V2.1Quotations",
        "tone",
    ],
    "Persons": ["V2EnhancedPersons", "V1Persons"],
    "Organizations": ["V2EnhancedOrganizations", "V1Organizations"],
    "Locations": ["V2EnhancedLocations", "V1Locations"],
    "Themes": ["V2EnhancedThemes", "V1Themes"],
    "Names": ["V2.1AllNames"],
    "Counts": ["V2.1Counts", "V1Counts"],
    "Amounts": ["V2.1Amounts"],
    "V2GCAM": ["V2GCAM"],
    "V2.1EnhancedDates": ["V2.1EnhancedDates"],
}
def initialize_db():
    """Open a DuckDB connection and register the ``negative_tone`` view.

    The view selects every column from the parquet files referenced by the
    ``hf://`` dataset path below.

    Returns:
        The DuckDB connection on which the view was created.
    """
    # Statement text is kept identical to the original, including newlines.
    create_view_sql = (
        "\n"
        "CREATE VIEW negative_tone AS (\n"
        "SELECT *\n"
        "FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-march2020-v2@~parquet/default/negative_tone/*.parquet')\n"
        ");\n"
    )
    connection = duckdb.connect()
    connection.execute(create_view_sql)
    return connection
def fetch_data(con, source_filter=None,
               start_date=None, end_date=None, limit=50, include_all_columns=False):
    """Fetch filtered rows from the ``negative_tone`` view as a DataFrame.

    Args:
        con: Connection exposing ``execute(query, params)`` whose result
            provides ``fetchdf()``.
        source_filter: Optional substring matched case-insensitively
            (``ILIKE``) against ``SourceCommonName``.
        start_date: Optional inclusive lower bound compared against ``DATE``.
        end_date: Optional inclusive upper bound compared against ``DATE``.
        limit: Maximum number of rows to return; a falsy value disables the
            limit.
        include_all_columns: Select ``*`` when true, otherwise only the
            summary columns listed below.

    Returns:
        The query result as a DataFrame. On any failure the error is reported
        with ``st.error`` and an empty DataFrame is returned instead of
        raising.
    """
    if include_all_columns:
        columns = "*"
    else:
        # "V2.1Quotations" contains a dot, so it must be a double-quoted
        # identifier. Single quotes make it a string literal, which would
        # return the constant text on every row instead of the column value.
        columns = (
            'GKGRECORDID, DATE, SourceCommonName, tone, DocumentIdentifier, '
            '"V2.1Quotations", SourceCollectionIdentifier'
        )

    query = f"SELECT {columns} FROM negative_tone WHERE TRUE"
    params = []
    if source_filter:
        query += " AND SourceCommonName ILIKE ?"
        params.append(f"%{source_filter}%")
    if start_date:
        query += " AND DATE >= ?"
        params.append(start_date)
    if end_date:
        query += " AND DATE <= ?"
        params.append(end_date)

    try:
        if limit:
            # Coerce to int so only a number can ever reach the SQL text; a
            # non-numeric limit is reported like any other query failure.
            query += f" LIMIT {int(limit)}"
        result = con.execute(query, params)
        return result.fetchdf()
    except Exception as e:
        # UI boundary: show the problem and let the app continue with no rows.
        st.error(f"Query execution failed: {str(e)}")
        return pd.DataFrame()
def render_data_grid(df):
    """Show *df* in an interactive AgGrid table and return the selected row.

    Every column is filterable, sortable and resizable, and the grid allows a
    single row to be selected (without checkboxes).

    Args:
        df: DataFrame to display; all of its columns are shown.

    Returns:
        The first selected row as a dict when the grid reports the selection
        as a DataFrame, the first list element when it reports a list, or
        ``None`` when nothing is selected.
    """
    st.subheader("Search and Filter Records")

    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(filter=True, sortable=True, resizable=True)
    builder.configure_selection('single', use_checkbox=False)

    response = AgGrid(
        df,
        gridOptions=builder.build(),
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        height=400,
        fit_columns_on_grid_load=True,
    )

    # The selection may come back either as a DataFrame or as a list, so
    # both shapes are handled.
    rows = response.get('selected_rows')
    if isinstance(rows, pd.DataFrame):
        return None if rows.empty else rows.iloc[0].to_dict()
    if isinstance(rows, list) and rows:
        return rows[0]
    return None
def render_graph(record):
    """Draw the entity graph for one selected record.

    The record is wrapped in a one-row DataFrame, converted to graph elements
    by ``StLinkBuilder.build_graph`` and displayed with ``st_link_analysis``
    using the module's node and edge styles.

    Args:
        record: The selected record; it must support ``.get`` and be
            accepted by ``pd.DataFrame([record])``.

    Returns:
        Whatever ``st_link_analysis`` returns for the rendered component.
    """
    st.subheader(f"Event Graph: {record.get('GKGRECORDID', 'Unknown')}")

    # build_graph is given a DataFrame, so wrap the single record in one row.
    elements = StLinkBuilder().build_graph(pd.DataFrame([record]))

    # Other layout names noted by the original author: cose, breadthfirst, cola.
    return st_link_analysis(
        elements=elements,
        layout="fcose",
        node_styles=NODE_STYLES,
        edge_styles=EDGE_STYLES,
    )
def render_raw_data(record):
    """Show the record's raw field values, one expander per category.

    Categories and their fields come from ``GDELT_CATEGORIES``. Fields that
    are absent from *record* are skipped.

    Args:
        record: The selected record; it must support ``in`` and ``[]`` lookup
            by field name.
    """
    st.header("Full Record Details")

    for category, fields in GDELT_CATEGORIES.items():
        with st.expander(f"{category}"):
            for field in fields:
                if field not in record:
                    continue
                st.markdown(f"**{field}:**")
                st.text(record[field])
                st.divider()
def main():
    """Run the Streamlit page: filters, record grid, graph and raw details.

    Flow:
        1. Read the search filters from the sidebar.
        2. Fetch the matching records once, with all columns.
        3. Show a grid built from the key columns of that result.
        4. When a row is selected, render its graph and raw field values.

    The grid and the detail views share one query result. Fetching them
    separately with LIMIT and no ORDER BY could return different rows, so a
    selected ID might not exist in the detail frame.
    """
    # NOTE(review): the leading character of the title looks like a
    # mis-encoded emoji -- confirm the intended glyph.
    st.title("๐ COVID Event Graph Explorer")
    st.markdown(
        "\n"
        "**Interactive Event Graph Viewer**\n"
        "Filter and select individual COVID-19 event records to display their detailed graph representations. Analyze relationships between events and associated entities using the interactive graph below.\n"
    )

    # Source column -> grid header, in display order.
    grid_columns = {
        'GKGRECORDID': 'ID',
        'DATE': 'Date',
        'SourceCommonName': 'Source',
        'tone': 'Tone',
        'DocumentIdentifier': 'Doc ID',
        'SourceCollectionIdentifier': 'Source Collection ID',
    }

    # The context manager closes the connection on exit, so no explicit
    # close() is needed afterwards.
    with initialize_db() as con:
        with st.sidebar:
            st.header("Search Filters")
            source = st.text_input("Filter by source name")
            start_date = st.text_input("Start date (YYYYMMDD)", "20200314")
            end_date = st.text_input("End date (YYYYMMDD)", "20200315")
            limit = st.slider("Number of results to display", 10, 500, 100)

        df_full = fetch_data(
            con=con,
            source_filter=source,
            start_date=start_date,
            end_date=end_date,
            limit=limit,
            include_all_columns=True,
        )

    # fetch_data returns an empty, column-less frame when the query fails or
    # matches nothing; selecting the grid columns from it would raise KeyError.
    if df_full.empty:
        st.warning("No matching records found.")
        return

    grid_df = df_full[list(grid_columns)].copy()
    grid_df.columns = list(grid_columns.values())

    selected_row = render_data_grid(grid_df)
    if not selected_row:
        st.info("Use the grid filters above to search and select a record.")
        return

    # Look the full record up by ID, guarding against a selection that no
    # longer matches the current result (e.g. stale grid state after a rerun).
    matches = df_full[df_full['GKGRECORDID'] == selected_row['ID']]
    if matches.empty:
        st.warning("The selected record is not part of the current results.")
        return

    full_record = matches.iloc[0]
    render_graph(full_record)
    render_raw_data(full_record)
# Run the app only when executed as a script (e.g. via `streamlit run`), not
# when this module is imported.
if __name__ == "__main__":
    main()
|