File size: 7,919 Bytes
3bb5fb5
 
 
 
4b38188
 
 
 
 
 
 
260e16d
4b38188
 
 
 
 
 
 
 
 
 
 
 
 
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b38188
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b38188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
7b1d8b6
3bb5fb5
7b1d8b6
3bb5fb5
7b1d8b6
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b38188
 
 
 
3bb5fb5
 
7b1d8b6
4b38188
3bb5fb5
 
 
 
 
 
4b38188
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import streamlit as st
import duckdb
import pandas as pd
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle
from graph_builder import StLinkBuilder

# Node styles configuration
NODE_STYLES = [
    NodeStyle("EVENT", "#FF7F3E", "name", "description"),
    NodeStyle("PERSON", "#4CAF50", "name", "person"),
    NodeStyle("NAME", "#2A629A", "name", "badge"),
    NodeStyle("ORGANIZATION", "#9C27B0", "name", "business"),
    NodeStyle("LOCATION", "#2196F3", "name", "place"),
    NodeStyle("THEME", "#FFC107", "name", "sell"),
    NodeStyle("COUNT", "#795548", "name", "inventory"),
    NodeStyle("AMOUNT", "#607D8B", "name", "wallet"),
]

# Edge styles configuration
EDGE_STYLES = [
    EdgeStyle("MENTIONED_IN", caption="label", directed=True),
    EdgeStyle("LOCATED_IN", caption="label", directed=True),
    EdgeStyle("CATEGORIZED_AS", caption="label", directed=True)
]

# Constants for raw data categories
GDELT_CATEGORIES = {
    "Metadata": ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier", "V2.1Quotations", "tone"],
    "Persons": ["V2EnhancedPersons", "V1Persons"],
    "Organizations": ["V2EnhancedOrganizations", "V1Organizations"],
    "Locations": ["V2EnhancedLocations", "V1Locations"],
    "Themes": ["V2EnhancedThemes", "V1Themes"],
    "Names": ["V2.1AllNames"],
    "Counts": ["V2.1Counts", "V1Counts"],
    "Amounts": ["V2.1Amounts"],
    "V2GCAM": ["V2GCAM"],
    "V2.1EnhancedDates": ["V2.1EnhancedDates"],
}

def initialize_db():
    """Initialize database connection and create dataset view"""
    con = duckdb.connect()
    con.execute("""
        CREATE VIEW negative_tone AS (
            SELECT * 
            FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-march2020-v2@~parquet/default/negative_tone/*.parquet')
        );
    """)
    return con

def fetch_data(con, source_filter=None,
               start_date=None, end_date=None, limit=50, include_all_columns=False):
    """Fetch filtered data from the database"""
    if include_all_columns:
        columns = "*"
    else:
        columns = "GKGRECORDID, DATE, SourceCommonName, tone, DocumentIdentifier, 'V2.1Quotations', SourceCollectionIdentifier"
    
    query = f"""
        SELECT {columns}
        FROM negative_tone 
        WHERE TRUE
    """
    params = []
    
    if source_filter:
        query += " AND SourceCommonName ILIKE ?"
        params.append(f"%{source_filter}%")
    if start_date:
        query += " AND DATE >= ?"
        params.append(start_date)
    if end_date:
        query += " AND DATE <= ?"
        params.append(end_date)
    if limit:
        query += f" LIMIT {limit}"
    
    try:
        result = con.execute(query, params)
        return result.fetchdf()
    except Exception as e:
        st.error(f"Query execution failed: {str(e)}")
        return pd.DataFrame()

def render_data_grid(df):
    """
    Render an interactive data grid (with builtโ€‘in filtering) and return the selected row.
    The grid is configured to show only the desired columns (ID, Date, Source, Tone)
    and allow filtering/search on each.
    """
    st.subheader("Search and Filter Records")
    
    # Build grid options with AgGrid
    gb = GridOptionsBuilder.from_dataframe(df)
    gb.configure_default_column(filter=True, sortable=True, resizable=True)
    # Enable single row selection
    gb.configure_selection('single', use_checkbox=False)
    grid_options = gb.build()

    # Render AgGrid (the grid will have a filter field for each column)
    grid_response = AgGrid(
        df,
        gridOptions=grid_options,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        height=400,
        fit_columns_on_grid_load=True
    )
    
    selected = grid_response.get('selected_rows')
    if selected is not None:
        # If selected is a DataFrame, use iloc to get the first row.
        if isinstance(selected, pd.DataFrame):
            if not selected.empty:
                return selected.iloc[0].to_dict()
        # Otherwise, if it's a list, get the first element.
        elif isinstance(selected, list) and len(selected) > 0:
            return selected[0]
    return None

def render_graph(record):
    """
    Render a graph visualization for the selected record.
    Uses StLinkBuilder to convert the record into graph format and then
    displays the graph using st_link_analysis.
    """
    st.subheader(f"Event Graph: {record.get('GKGRECORDID', 'Unknown')}")
    stlink_builder = StLinkBuilder()
    # Convert the record (a Series) into a DataFrame with one row
    record_df = pd.DataFrame([record])
    graph_data = stlink_builder.build_graph(record_df)
    return st_link_analysis(
        elements=graph_data,
        layout="fcose", # Column configuration for data grid - cose, fcose, breadthfirst, cola
        node_styles=NODE_STYLES,
        edge_styles=EDGE_STYLES
    )

def render_raw_data(record):
    """Render raw GDELT data in expandable sections."""
    st.header("Full Record Details")
    for category, fields in GDELT_CATEGORIES.items():
        with st.expander(f"{category}"):
            for field in fields:
                if field in record:
                    st.markdown(f"**{field}:**")
                    st.text(record[field])
                    st.divider()

def main():
    st.title("๐Ÿ” COVID Event Graph Explorer")
    st.markdown("""
    **Interactive Event Graph Viewer**

    Filter and select individual COVID-19 event records to display their detailed graph representations. Analyze relationships between events and associated entities using the interactive graph below.
    """)

    # Initialize database connection using context manager
    with initialize_db() as con:
        if con is not None:
            # Add UI components

            # Sidebar controls
            with st.sidebar:
                st.header("Search Filters")
                source = st.text_input("Filter by source name")
                start_date = st.text_input("Start date (YYYYMMDD)", "20200314")
                end_date = st.text_input("End date (YYYYMMDD)", "20200315")
                limit = st.slider("Number of results to display", 10, 500, 100)
            
            # Fetch initial data view
            df_initial = fetch_data(
                con=con,
                source_filter=source,
                start_date=start_date,
                end_date=end_date,
                limit=limit,
                include_all_columns=False
            )
            
            # Fetch full records for selection
            df_full = fetch_data(
                con=con,
                source_filter=source,
                start_date=start_date,
                end_date=end_date,
                limit=limit,
                include_all_columns=True
            )

            # Create a DataFrame for the grid with only the key columns
            grid_df = df_initial[['GKGRECORDID', 'DATE', 'SourceCommonName', 'tone', 'DocumentIdentifier', 'SourceCollectionIdentifier']].copy()
            grid_df.columns = ['ID', 'Date', 'Source', 'Tone', 'Doc ID', 'Source Collection ID']
            
            # Render the interactive data grid at the top
            selected_row = render_data_grid(grid_df)

            if selected_row:
                # Find the full record in the original DataFrame using the selected ID
                selected_id = selected_row['ID']
                full_record = df_full[df_full['GKGRECORDID'] == selected_id].iloc[0]

                # Display the graph and raw data below the grid
                render_graph(full_record)

                render_raw_data(full_record)
            else:
                st.info("Use the grid filters above to search and select a record.")

        else:
            st.warning("No matching records found.")

    # Close database connection
    con.close()

main()