File size: 4,106 Bytes
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aafdc15
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aafdc15
3bb5fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aafdc15
3bb5fb5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Data access module for GDELT data retrieval and filtering
"""
import duckdb
import pandas as pd

def get_gdelt_data(
    limit=10,
    tone_threshold=-7.0,
    start_date=None,
    end_date=None,
    source_filter=None,
    themes_filter=None,
    persons_filter=None,
    organizations_filter=None,
    locations_filter=None
):
    """Get filtered GDELT data from DuckDB with dynamic query parameters."""
    con = duckdb.connect(database=':memory:')

    # Create view of the dataset
    con.execute("""
        CREATE VIEW negative_tone AS (
            SELECT * 
            FROM read_parquet('hf://datasets/dwb2023/gdelt-gkg-march2020-v2@~parquet/default/negative_tone/*.parquet')
        );
    """)

    # Base query components
    base_conditions = [
        "SourceCollectionIdentifier IS NOT NULL",
        "DATE IS NOT NULL",
        "SourceCommonName IS NOT NULL",
        "DocumentIdentifier IS NOT NULL",
        "V1Counts IS NOT NULL",
        "V1Themes IS NOT NULL",
        "V1Locations IS NOT NULL",
        "V1Persons IS NOT NULL",
        "V1Organizations IS NOT NULL",
        "V2GCAM IS NOT NULL",
        "\"V2.1Quotations\" IS NOT NULL",
        "tone <= ?"
    ]
    params = [tone_threshold]
    extra_conditions = []

    # Add optional filters
    if start_date:
        extra_conditions.append("DATE >= ?")
        params.append(start_date)
    if end_date:
        extra_conditions.append("DATE <= ?")
        params.append(end_date)
    if source_filter:
        extra_conditions.append("SourceCommonName ILIKE ?")
        params.append(f"%{source_filter}%")
    if themes_filter:
        extra_conditions.append("(V1Themes ILIKE ? OR V2EnhancedThemes ILIKE ?)")
        params.extend([f"%{themes_filter}%", f"%{themes_filter}%"])
    if persons_filter:
        extra_conditions.append("(V1Persons ILIKE ? OR V2EnhancedPersons ILIKE ?)")
        params.extend([f"%{persons_filter}%", f"%{persons_filter}%"])
    if organizations_filter:
        extra_conditions.append("(V1Organizations ILIKE ? OR V2EnhancedOrganizations ILIKE ?)")
        params.extend([f"%{organizations_filter}%", f"%{organizations_filter}%"])
    if locations_filter:
        extra_conditions.append("(V1Locations ILIKE ? OR V2EnhancedLocations ILIKE ?)")
        params.extend([f"%{locations_filter}%", f"%{locations_filter}%"])

    # Combine all conditions
    all_conditions = base_conditions + extra_conditions
    where_clause = " AND ".join(all_conditions) if all_conditions else "1=1"

    # Build final query
    query = f"""
        SELECT *
        FROM negative_tone
        WHERE {where_clause}
        LIMIT ?;
    """
    params.append(limit)

    # Execute query with parameters
    results_df = con.execute(query, params).fetchdf()
    con.close()
    
    return results_df

def filter_dataframe(df, source_filter=None, date_filter=None, tone_min=None, tone_max=None):
    """Filter dataframe based on provided criteria"""
    display_df = df[['GKGRECORDID', 'DATE', 'SourceCommonName', 'tone']].copy()
    display_df.columns = ['ID', 'Date', 'Source', 'Tone']

    if source_filter:
        display_df = display_df[display_df['Source'].str.contains(source_filter, case=False, na=False)]
    if date_filter:
        display_df = display_df[display_df['Date'].str.contains(date_filter, na=False)]
    if tone_min is not None and tone_max is not None:
        display_df = display_df[
            (display_df['Tone'] >= tone_min) & 
            (display_df['Tone'] <= tone_max)
        ]

    return display_df

# Constants for raw data categories
GDELT_CATEGORIES = {
    "Metadata": ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier", "V2.1Quotations", "tone"],
    "Persons": ["V2EnhancedPersons", "V1Persons"],
    "Organizations": ["V2EnhancedOrganizations", "V1Organizations"],
    "Locations": ["V2EnhancedLocations", "V1Locations"],
    "Themes": ["V2EnhancedThemes", "V1Themes"],
    "Names": ["V2.1AllNames"],
    "Counts": ["V2.1Counts", "V1Counts"],
    "Amounts": ["V2.1Amounts"],
    "V2GCAM": ["V2GCAM"],
    "V2.1EnhancedDates": ["V2.1EnhancedDates"],
}