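"""Data processing utilities for blockchain transaction analysis.

Defines DataProcessor, which aggregates transactions by time window,
identifies trading patterns via K-Means clustering, detects anomalous
transactions and potential wash trading, and analyzes the price impact
of transactions with Plotly visualizations.
"""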
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Union, Any, Tuple
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import plotly.express as px
import logging
import time

class DataProcessor:
    """
    Process and analyze transaction data from blockchain APIs
    """
    
    def __init__(self):
        pass
    
    def aggregate_transactions(self, 
                              transactions_df: pd.DataFrame, 
                              time_window: str = 'D') -> pd.DataFrame:
        """
        Aggregate transactions by time window
        
        Args:
            transactions_df: DataFrame of transactions
            time_window: Time window for aggregation (e.g., 'D' for day, 'H' for hour)
            
        Returns:
            Aggregated DataFrame with transaction counts and volumes
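        
        Example (illustrative sketch; assumes tx_df is a DataFrame with
        'Timestamp', 'Amount', 'From', and 'To' columns):
            processor = DataProcessor()
            daily_flows = processor.aggregate_transactions(tx_df, time_window='D')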
        """
        if transactions_df.empty:
            return pd.DataFrame()
        
        # Work on a copy so the caller's DataFrame is not mutated
        transactions_df = transactions_df.copy()
        
        # Ensure timestamp column is datetime
        if 'Timestamp' in transactions_df.columns:
            timestamp_col = 'Timestamp'
        elif 'timeStamp' in transactions_df.columns:
            timestamp_col = 'timeStamp'
        else:
            raise ValueError("Timestamp column not found in transactions DataFrame")
        
        # Ensure amount column exists
        if 'Amount' in transactions_df.columns:
            amount_col = 'Amount'
        elif 'tokenAmount' in transactions_df.columns:
            amount_col = 'tokenAmount'
        elif 'value' in transactions_df.columns:
            # Try to adjust for decimals if 'tokenDecimal' exists
            if 'tokenDecimal' in transactions_df.columns:
                transactions_df['adjustedValue'] = transactions_df['value'].astype(float) / (10 ** transactions_df['tokenDecimal'].astype(int))
                amount_col = 'adjustedValue'
            else:
                amount_col = 'value'
        else:
            raise ValueError("Amount column not found in transactions DataFrame")
        
        # Resample by time window
        try:
            transactions_df.set_index(pd.DatetimeIndex(transactions_df[timestamp_col]), inplace=True)
        except Exception as e:
            logging.warning(f"Error setting DatetimeIndex: {e}")
            # Create a safe index as a fallback
            transactions_df['safe_timestamp'] = pd.date_range(
                start='2025-01-01', 
                periods=len(transactions_df), 
                freq='H'
            )
            transactions_df.set_index('safe_timestamp', inplace=True)
        
        # Identify buy vs sell transactions based on 'from' and 'to' addresses
        if 'From' in transactions_df.columns and 'To' in transactions_df.columns:
            from_col, to_col = 'From', 'To'
        elif 'from' in transactions_df.columns and 'to' in transactions_df.columns:
            from_col, to_col = 'from', 'to'
        else:
            # If we can't determine direction, just aggregate total volume
            agg_df = transactions_df.resample(time_window).agg({
                amount_col: 'sum',
                timestamp_col: 'count'
            })
            agg_df.columns = ['Volume', 'Count']
            return agg_df.reset_index()
        
        # Calculate net flow for each wallet address (positive = inflow, negative = outflow)
        wallet_addresses = set(transactions_df[from_col].unique()) | set(transactions_df[to_col].unique())
        
        results = []
        for wallet in wallet_addresses:
            wallet_df = transactions_df.copy()
            
            # Mark transactions as inflow or outflow
            wallet_df['Direction'] = 'Unknown'
            wallet_df.loc[wallet_df[to_col] == wallet, 'Direction'] = 'In'
            wallet_df.loc[wallet_df[from_col] == wallet, 'Direction'] = 'Out'
            
            # Calculate net flow
            wallet_df['NetFlow'] = wallet_df[amount_col]
            wallet_df.loc[wallet_df['Direction'] == 'Out', 'NetFlow'] = -wallet_df.loc[wallet_df['Direction'] == 'Out', amount_col]
            
            # Aggregate by time window
            wallet_agg = wallet_df.resample(time_window).agg({
                'NetFlow': 'sum',
                timestamp_col: 'count'
            })
            wallet_agg.columns = ['NetFlow', 'Count']
            wallet_agg['Wallet'] = wallet
            
            results.append(wallet_agg.reset_index())
        
        if not results:
            return pd.DataFrame()
            
        combined_df = pd.concat(results, ignore_index=True)
        return combined_df
    
    # Cache for pattern identification to avoid repeating expensive calculations
    _pattern_cache = {}
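    # Note: defined at class level, so the cache is shared across all instances.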
    
    def identify_patterns(self, 
                          transactions_df: pd.DataFrame, 
                          n_clusters: int = 3) -> List[Dict[str, Any]]:
        """
        Identify trading patterns using clustering algorithms
        
        Args:
            transactions_df: DataFrame of transactions
            n_clusters: Number of clusters to identify
            
        Returns:
            List of pattern dictionaries containing name, description, and confidence
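        
        Example (illustrative; tx_df is a hypothetical transactions DataFrame):
            processor = DataProcessor()
            for p in processor.identify_patterns(tx_df, n_clusters=3):
                print(p['name'], f"{p['confidence']:.0%}")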
        """
        # Check for empty data early to avoid processing
        if transactions_df.empty:
            return []
        
        # Create a cache key based on DataFrame hash and number of clusters
        try:
            cache_key = f"{hash(tuple(transactions_df.columns))}_{len(transactions_df)}_{n_clusters}"
            
            # Check cache first
            if cache_key in self._pattern_cache:
                return self._pattern_cache[cache_key]
        except Exception:
            # If hashing fails, proceed without caching
            cache_key = None
            
        try:
            # Work on a copy so helper columns added below do not leak into
            # the caller's DataFrame
            df = transactions_df.copy()
            
            # Ensure timestamp column exists - optimize column presence checks
            timestamp_cols = ['Timestamp', 'timeStamp']
            timestamp_col = next((col for col in timestamp_cols if col in df.columns), None)
            
            if timestamp_col:
                # Convert timestamp only if needed
                if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
                    try:
                        # Use vectorized operations instead of astype where possible
                        if df[timestamp_col].dtype == 'object':
                            df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
                        else:
                            df[timestamp_col] = pd.to_datetime(df[timestamp_col], unit='s', errors='coerce')
                    except Exception as e:
                        # Create a date range index as fallback
                        df['dummy_timestamp'] = pd.date_range(start='2025-01-01', periods=len(df), freq='H')
                        timestamp_col = 'dummy_timestamp'
            else:
                # If no timestamp column, create a dummy index
                df['dummy_timestamp'] = pd.date_range(start='2025-01-01', periods=len(df), freq='H')
                timestamp_col = 'dummy_timestamp'
                
            # Efficiently calculate floor hour using vectorized operations
            df['hour'] = df[timestamp_col].dt.floor('H')
            
            # Check for address columns efficiently
            if 'From' in df.columns and 'To' in df.columns:
                from_col, to_col = 'From', 'To'
            elif 'from' in df.columns and 'to' in df.columns:
                from_col, to_col = 'from', 'to'
            else:
                # Create dummy addresses only if necessary
                df['from'] = [f'0x{i:040x}' for i in range(len(df))]
                df['to'] = [f'0x{(i+1):040x}' for i in range(len(df))]
                from_col, to_col = 'from', 'to'
            
            # Efficiently determine amount column. Prefer pre-computed columns,
            # but adjust raw 'value' by 'tokenDecimal' when both are present
            # (if 'value' were in the candidate list, the decimal-adjustment
            # branch below could never run).
            amount_cols = ['Amount', 'tokenAmount', 'adjustedValue']
            amount_col = next((col for col in amount_cols if col in df.columns), None)
            
            if not amount_col:
                # Handle token values with decimals
                if 'value' in df.columns and 'tokenDecimal' in df.columns:
                    try:
                        # Ensure values are numeric
                        df['value_numeric'] = pd.to_numeric(df['value'], errors='coerce')
                        df['tokenDecimal_numeric'] = pd.to_numeric(df['tokenDecimal'], errors='coerce').fillna(18)
                        df['adjustedValue'] = df['value_numeric'] / (10 ** df['tokenDecimal_numeric'])
                        amount_col = 'adjustedValue'
                    except Exception as e:
                        logging.warning(f"Error converting values: {e}")
                        df['dummy_amount'] = 1.0
                        amount_col = 'dummy_amount'
                elif 'value' in df.columns:
                    amount_col = 'value'
                else:
                    # Fallback to dummy values
                    df['dummy_amount'] = 1.0
                    amount_col = 'dummy_amount'
                    
            # Ensure the amount column is numeric
            try:
                if amount_col in df.columns:
                    df[f"{amount_col}_numeric"] = pd.to_numeric(df[amount_col], errors='coerce').fillna(0)
                    amount_col = f"{amount_col}_numeric"
            except Exception:
                # If conversion fails, create a dummy numeric column
                df['safe_amount'] = 1.0
                amount_col = 'safe_amount'
            
            # Calculate metrics using optimized groupby operations
            # Use a more efficient approach with built-in pandas aggregation
            agg_df = df.groupby('hour').agg(
                Count=pd.NamedAgg(column=from_col, aggfunc='count'),
            ).reset_index()
            
            # NetFlow needs an additional pass. As a heuristic, each hourly
            # group's net flow is measured relative to its first recipient and
            # first sender: inflow to the first 'to' address minus outflow
            # from the first 'from' address.
            def calc_netflow(group):
                first_to = group[to_col].iloc[0] if len(group) > 0 else None
                first_from = group[from_col].iloc[0] if len(group) > 0 else None
                
                if first_to is not None and first_from is not None:
                    # Ensure values are converted to numeric before summing
                    try:
                        # Convert to numeric with pd.to_numeric, coerce errors to NaN
                        total_in = pd.to_numeric(group.loc[group[to_col] == first_to, amount_col], errors='coerce').sum()
                        total_out = pd.to_numeric(group.loc[group[from_col] == first_from, amount_col], errors='coerce').sum()
                        # Replace NaN with 0 to avoid propagation
                        if pd.isna(total_in): total_in = 0.0
                        if pd.isna(total_out): total_out = 0.0
                        return float(total_in) - float(total_out)
                    except Exception as e:
                        logging.debug(f"Error converting values to numeric: {e}")
                        return 0.0
                return 0.0
            
            # Calculate NetFlow using apply instead of loop
            netflows = df.groupby('hour').apply(calc_netflow)
            agg_df['NetFlow'] = netflows.values
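            # (groupby sorts by the 'hour' key in both passes, so agg_df and
            # netflows share row order and positional assignment is safe)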
            
            # Early return if not enough data for clustering
            if agg_df.empty or len(agg_df) < n_clusters:
                return []
            
            # Ensure we don't have too many clusters for the dataset
            actual_n_clusters = min(n_clusters, max(2, len(agg_df) // 2))
            
            # Prepare features for clustering - with careful type handling
            try:
                if 'NetFlow' in agg_df.columns:
                    # Ensure NetFlow is numeric
                    agg_df['NetFlow'] = pd.to_numeric(agg_df['NetFlow'], errors='coerce').fillna(0)
                    features = agg_df[['NetFlow', 'Count']].copy()
                    primary_metric = 'NetFlow'
                else:
                    # Calculate Volume if needed
                    if 'Volume' not in agg_df.columns and amount_col in df.columns:
                        # Calculate volume with numeric conversion
                        volume_by_hour = pd.to_numeric(df[amount_col], errors='coerce').fillna(0).groupby(df['hour']).sum()
                        agg_df['Volume'] = agg_df['hour'].map(volume_by_hour)
                    
                    # Ensure Volume exists and is numeric
                    if 'Volume' not in agg_df.columns:
                        agg_df['Volume'] = 1.0  # Default value if calculation failed
                    else:
                        agg_df['Volume'] = pd.to_numeric(agg_df['Volume'], errors='coerce').fillna(1.0)
                        
                    # Ensure Count is numeric
                    agg_df['Count'] = pd.to_numeric(agg_df['Count'], errors='coerce').fillna(1.0)
                    
                    features = agg_df[['Volume', 'Count']].copy()
                    primary_metric = 'Volume'
                    
                # Final check to ensure features are numeric
                for col in features.columns:
                    features[col] = pd.to_numeric(features[col], errors='coerce').fillna(0)
            except Exception as e:
                logging.warning(f"Error preparing clustering features: {e}")
                # Create safe dummy features if everything else fails
                agg_df['SafeFeature'] = 1.0
                agg_df['Count'] = 1.0 
                features = agg_df[['SafeFeature', 'Count']].copy()
                primary_metric = 'SafeFeature'
            
            # Scale features before clustering (StandardScaler and KMeans are
            # already imported at module level)
            scaler = StandardScaler()
            scaled_features = scaler.fit_transform(features)
            
            # Use K-Means with reduced complexity
            kmeans = KMeans(n_clusters=actual_n_clusters, random_state=42, n_init=10, max_iter=100)
            agg_df['Cluster'] = kmeans.fit_predict(scaled_features)
            
            # Calculate time-based metrics from the hour column directly
            if 'hour' in agg_df.columns:
                try:
                    # Convert to datetime for hour and day extraction if needed
                    hour_series = pd.to_datetime(agg_df['hour'])
                    agg_df['Hour'] = hour_series.dt.hour
                    agg_df['Day'] = hour_series.dt.dayofweek
                except Exception:
                    # Fallback for non-convertible data
                    agg_df['Hour'] = 0
                    agg_df['Day'] = 0
            else:
                # Default values if no hour column
                agg_df['Hour'] = 0
                agg_df['Day'] = 0
            
            # Identify patterns efficiently
            patterns = []
            for i in range(actual_n_clusters):
                # Use boolean indexing for better performance
                cluster_mask = agg_df['Cluster'] == i
                cluster_df = agg_df[cluster_mask]
                
                if len(cluster_df) == 0:
                    continue
                    
                if primary_metric == 'NetFlow':
                    avg_flow = cluster_df['NetFlow'].mean()
                    flow_std = cluster_df['NetFlow'].std()
                    behavior = "Accumulation" if avg_flow > 0 else "Distribution"
                    volume_metric = f"Net Flow: {avg_flow:.2f} ± {flow_std:.2f}"
                else:
                    # Fall back to the volume-style metric when NetFlow is unavailable
                    avg_flow = cluster_df[primary_metric].mean()
                    flow_std = cluster_df[primary_metric].std()
                    behavior = "High Volume" if avg_flow > agg_df[primary_metric].mean() else "Low Volume"
                    volume_metric = f"Volume: {avg_flow:.2f} ± {flow_std:.2f}"
                
                # Pattern characteristics (computed on the active primary metric,
                # so the volume branch no longer references undefined NetFlow values)
                pattern_metrics = {
                    "avg_flow": avg_flow,
                    "flow_std": flow_std,
                    "avg_count": cluster_df['Count'].mean(),
                    "max_flow": cluster_df[primary_metric].max(),
                    "min_flow": cluster_df[primary_metric].min(),
                    "common_hour": cluster_df['Hour'].mode()[0] if not cluster_df['Hour'].empty else None,
                    "common_day": cluster_df['Day'].mode()[0] if not cluster_df['Day'].empty else None
                }
                
                # Enhanced confidence calculation: within-cluster variance as a
                # share of total variance, clamped to [0.4, 0.95]
                cluster_variance = cluster_df[primary_metric].var()
                total_variance = agg_df[primary_metric].var() or 1  # Avoid division by zero
                confidence = max(0.4, min(0.95, 1 - (cluster_variance / total_variance)))
                
                # Create enhanced pattern charts - Main Chart. Note that agg_df
                # has a plain RangeIndex after reset_index(), so the 'hour'
                # column (not the index) is the time axis.
                if primary_metric == 'NetFlow':
                    main_fig = px.scatter(cluster_df, x='hour', y='NetFlow', 
                                    size='Count', color='Cluster',
                                    title=f"Pattern {i+1}: {behavior}",
                                    labels={'NetFlow': 'Net Token Flow', 'hour': 'Time'},
                                    color_discrete_sequence=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
                    
                    # Add a trend line
                    main_fig.add_trace(go.Scatter(
                        x=cluster_df['hour'],
                        y=cluster_df['NetFlow'].rolling(window=3, min_periods=1).mean(),
                        mode='lines',
                        name='Trend',
                        line=dict(width=2, dash='dash', color='rgba(0,0,0,0.5)')
                    ))
                    
                    # Add a zero reference line
                    main_fig.add_shape(
                        type="line",
                        x0=cluster_df['hour'].min(),
                        y0=0,
                        x1=cluster_df['hour'].max(),
                        y1=0,
                        line=dict(color="red", width=1, dash="dot"),
                    )
                else:
                    main_fig = px.scatter(cluster_df, x='hour', y=primary_metric, 
                                    size='Count', color='Cluster',
                                    title=f"Pattern {i+1}: {behavior}",
                                    labels={primary_metric: 'Transaction Volume', 'hour': 'Time'},
                                    color_discrete_sequence=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
                    
                    # Add a trend line
                    main_fig.add_trace(go.Scatter(
                        x=cluster_df['hour'],
                        y=cluster_df[primary_metric].rolling(window=3, min_periods=1).mean(),
                        mode='lines',
                        name='Trend',
                        line=dict(width=2, dash='dash', color='rgba(0,0,0,0.5)')
                    ))
                
                main_fig.update_layout(
                    template="plotly_white",
                    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                    margin=dict(l=20, r=20, t=50, b=20),
                    height=400
                )
                
                # Create hourly distribution chart
                hour_counts = cluster_df.groupby('Hour')['Count'].sum().reindex(range(24), fill_value=0)
                hour_fig = px.bar(x=hour_counts.index, y=hour_counts.values,
                                title="Hourly Distribution",
                                labels={'x': 'Hour of Day', 'y': 'Transaction Count'},
                                color_discrete_sequence=['#1f77b4'])
                hour_fig.update_layout(template="plotly_white", height=300)
                
                # Create volume/flow distribution chart
                if primary_metric == 'NetFlow':
                    hist_data = cluster_df['NetFlow']
                    hist_title = "Net Flow Distribution"
                    hist_label = "Net Flow"
                else:
                    hist_data = cluster_df[primary_metric]
                    hist_title = "Volume Distribution"
                    hist_label = "Volume"
                    
                dist_fig = px.histogram(hist_data, 
                                      title=hist_title,
                                      labels={'value': hist_label, 'count': 'Frequency'},
                                      color_discrete_sequence=['#2ca02c'])
                dist_fig.update_layout(template="plotly_white", height=300)
                
                # Find related transactions
                if not transactions_df.empty:
                    # Get timestamps from this cluster (the 'hour' column, since
                    # the index is a RangeIndex, not a DatetimeIndex)
                    cluster_times = pd.to_datetime(cluster_df['hour'])
                    # Create time windows for matching
                    time_windows = [(t - pd.Timedelta(hours=1), t + pd.Timedelta(hours=1)) for t in cluster_times]
                    
                    # Find transactions within these time windows
                    pattern_txs = transactions_df[transactions_df[timestamp_col].apply(
                        lambda x: any((start <= x <= end) for start, end in time_windows)
                    )].copy()
                    
                    # If we have too many, sample them
                    if len(pattern_txs) > 10:
                        pattern_txs = pattern_txs.sample(10)
                        
                    # If we have too few, just sample from all transactions
                    if len(pattern_txs) < 5 and len(transactions_df) >= 5:
                        pattern_txs = transactions_df.sample(min(5, len(transactions_df)))
                else:
                    pattern_txs = pd.DataFrame()
                
                # Comprehensive pattern dictionary
                pattern = {
                    "name": behavior,
                    "description": f"This pattern shows {behavior.lower()} activity.",
                    "strategy": "Unknown",
                    "risk_profile": "Unknown",
                    "time_insight": "Unknown",
                    "cluster_id": i,
                    "metrics": pattern_metrics,
                    "occurrence_count": len(cluster_df),
                    "volume_metric": volume_metric,
                    "confidence": confidence,
                    "impact": 0.0,
                    "charts": {
                        "main": main_fig,
                        "hourly_distribution": hour_fig,
                        "value_distribution": dist_fig
                    },
                    "examples": pattern_txs
                }
                
                patterns.append(pattern)
            
            # Cache results for future reuse
            if cache_key:
                self._pattern_cache[cache_key] = patterns
                
            return patterns
        
        except Exception as e:
            logging.warning(f"Error during pattern identification: {e}")
            return []

    
    def detect_anomalous_transactions(self, 
                                     transactions_df: pd.DataFrame, 
                                     sensitivity: str = "Medium") -> pd.DataFrame:
        """
        Detect anomalous transactions using statistical methods
        
        Args:
            transactions_df: DataFrame of transactions
            sensitivity: Detection sensitivity ("Low", "Medium", "High")
            
        Returns:
            DataFrame of anomalous transactions
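        
        Example (illustrative; tx_df is a hypothetical transactions DataFrame):
            anomalies = DataProcessor().detect_anomalous_transactions(tx_df, sensitivity="High")
            high_risk = anomalies[anomalies['risk_level'] == 'High']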
        """
        if transactions_df.empty:
            return pd.DataFrame()
        
        # Work on a copy so the caller's DataFrame is not mutated
        transactions_df = transactions_df.copy()
        
        # Ensure amount column exists
        if 'Amount' in transactions_df.columns:
            amount_col = 'Amount'
        elif 'tokenAmount' in transactions_df.columns:
            amount_col = 'tokenAmount'
        elif 'value' in transactions_df.columns:
            # Try to adjust for decimals if 'tokenDecimal' exists
            if 'tokenDecimal' in transactions_df.columns:
                transactions_df['adjustedValue'] = transactions_df['value'].astype(float) / (10 ** transactions_df['tokenDecimal'].astype(int))
                amount_col = 'adjustedValue'
            else:
                amount_col = 'value'
        else:
            raise ValueError("Amount column not found in transactions DataFrame")
        
        # Define sensitivity thresholds
        if sensitivity == "Low":
            z_threshold = 3.0  # Outliers beyond 3 standard deviations
        elif sensitivity == "Medium":
            z_threshold = 2.5  # Outliers beyond 2.5 standard deviations
        else:  # High
            z_threshold = 2.0  # Outliers beyond 2 standard deviations
        
        # Calculate z-score for amount (coerce to numeric so string-typed
        # API values don't break the statistics)
        amounts = pd.to_numeric(transactions_df[amount_col], errors='coerce').fillna(0)
        mean_amount = amounts.mean()
        std_amount = amounts.std()
        
        if std_amount == 0 or pd.isna(std_amount):
            return pd.DataFrame()
            
        transactions_df['z_score'] = ((amounts - mean_amount) / std_amount).abs()
        
        # Flag anomalous transactions
        anomalies = transactions_df[transactions_df['z_score'] > z_threshold].copy()
        
        # Add risk level based on z-score
        anomalies['risk_level'] = 'Medium'
        anomalies.loc[anomalies['z_score'] > z_threshold * 1.5, 'risk_level'] = 'High'
        anomalies.loc[anomalies['z_score'] <= z_threshold * 1.2, 'risk_level'] = 'Low'
        
        return anomalies
    
    def analyze_price_impact(self, 
                             transactions_df: pd.DataFrame,
                             price_data: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze the price impact of transactions with enhanced visualizations
        
        Args:
            transactions_df: DataFrame of transactions
            price_data: Dictionary of price impact data for each transaction
            
        Returns:
            Dictionary with comprehensive price impact analysis and visualizations
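        
        Example (illustrative; price_data maps transaction hashes to dicts
        with 'pre_price', 'post_price', and 'impact_pct' keys):
            result = DataProcessor().analyze_price_impact(tx_df, price_data)
            print(result['impact_summary'])
            result['charts']['main_chart'].show()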
        """
        def _empty_impact_result(total_transactions: int) -> Dict[str, Any]:
            """Default result when no usable price impact data is available."""
            empty_fig = go.Figure()
            empty_fig.update_layout(
                title="No Price Impact Data Available",
                xaxis_title="Time",
                yaxis_title="Price Impact (%)",
                height=400,
                template="plotly_white"
            )
            empty_fig.add_annotation(
                text="No transactions found with price impact data",
                showarrow=False,
                font=dict(size=14)
            )
            return {
                'avg_impact_pct': 0,
                'max_impact_pct': 0,
                'min_impact_pct': 0,
                'significant_moves_count': 0,
                'total_transactions': total_transactions,
                'charts': {
                    'main_chart': empty_fig,
                    'impact_distribution': empty_fig,
                    'cumulative_impact': empty_fig,
                    'hourly_impact': empty_fig
                },
                'transactions_with_impact': pd.DataFrame(),
                'insights': [],
                'impact_summary': "No price impact data available"
            }
        
        if transactions_df.empty or not price_data:
            return _empty_impact_result(0)
        
        # Work on a copy and ensure the timestamp column is datetime
        transactions_df = transactions_df.copy()
        if 'Timestamp' in transactions_df.columns:
            timestamp_col = 'Timestamp'
        elif 'timeStamp' in transactions_df.columns:
            timestamp_col = 'timeStamp'
            # Convert timestamp to datetime if it's not already
            if not pd.api.types.is_datetime64_any_dtype(transactions_df[timestamp_col]):
                transactions_df[timestamp_col] = pd.to_datetime(transactions_df[timestamp_col], unit='s')
        else:
            raise ValueError("Timestamp column not found in transactions DataFrame")
        
        # Combine price impact data with transactions
        impact_data = []
        
        for idx, row in transactions_df.iterrows():
            tx_hash = row.get('Transaction Hash', row.get('hash', None))
            if not tx_hash or tx_hash not in price_data:
                continue
            
            tx_impact = price_data[tx_hash]
            
            if tx_impact['impact_pct'] is None:
                continue
                
            # Get token symbol if available
            token_symbol = row.get('tokenSymbol', 'Unknown')
            token_amount = row.get('value', 0)
            if 'tokenDecimal' in row:
                try:
                    token_amount = float(token_amount) / (10 ** int(row.get('tokenDecimal', 0)))
                except (ValueError, TypeError):
                    token_amount = 0
                
            impact_data.append({
                'transaction_hash': tx_hash,
                'timestamp': row[timestamp_col],
                'pre_price': tx_impact['pre_price'],
                'post_price': tx_impact['post_price'],
                'impact_pct': tx_impact['impact_pct'],
                'token_symbol': token_symbol,
                'token_amount': token_amount,
                'from': row.get('from', ''),
                'to': row.get('to', ''),
                'hour': row[timestamp_col].hour if isinstance(row[timestamp_col], pd.Timestamp) else 0
            })
        
        if not impact_data:
            return _empty_impact_result(len(transactions_df))
            
        impact_df = pd.DataFrame(impact_data)
        
        # Calculate aggregate metrics
        avg_impact = impact_df['impact_pct'].mean()
        max_impact = impact_df['impact_pct'].max()
        min_impact = impact_df['impact_pct'].min()
        median_impact = impact_df['impact_pct'].median()
        std_impact = impact_df['impact_pct'].std()
        
        # Count significant moves (>1% impact)
        significant_threshold = 1.0
        high_impact_threshold = 3.0
        significant_moves = len(impact_df[abs(impact_df['impact_pct']) > significant_threshold])
        high_impact_moves = len(impact_df[abs(impact_df['impact_pct']) > high_impact_threshold])
        positive_impacts = len(impact_df[impact_df['impact_pct'] > 0])
        negative_impacts = len(impact_df[impact_df['impact_pct'] < 0])
        
        # Calculate cumulative impact
        impact_df = impact_df.sort_values('timestamp')
        impact_df['cumulative_impact'] = impact_df['impact_pct'].cumsum()
        
        # Generate insights
        insights = []
        
        # Market direction bias
        if avg_impact > 0.5:
            insights.append({
                "title": "Positive Price Pressure",
                "description": f"Transactions show an overall positive price impact of {avg_impact:.2f}%, suggesting accumulation or market strength."
            })
        elif avg_impact < -0.5:
            insights.append({
                "title": "Negative Price Pressure",
                "description": f"Transactions show an overall negative price impact of {avg_impact:.2f}%, suggesting distribution or market weakness."
            })
            
        # Volatility analysis
        if std_impact > 2.0:
            insights.append({
                "title": "High Market Volatility",
                "description": f"Price impact shows high volatility (std: {std_impact:.2f}%), indicating potential market manipulation or whipsaw conditions."
            })
            
        # Significant impacts
        if high_impact_moves > 0:
            insights.append({
                "title": "High Impact Transactions",
                "description": f"Detected {high_impact_moves} high-impact transactions (>{high_impact_threshold}% price change), indicating potential market-moving activity."
            })
            
        # Temporal patterns
        hourly_impact = impact_df.groupby('hour')['impact_pct'].mean()
        if len(hourly_impact) > 0:
            max_hour = hourly_impact.abs().idxmax()
            max_hour_impact = hourly_impact[max_hour]
            insights.append({
                "title": "Time-Based Pattern",
                "description": f"Highest price impact occurs around {max_hour}:00 with an average of {max_hour_impact:.2f}%."
            })
        
        # Create impact summary text
        impact_summary = f"Analysis of {len(impact_df)} price-impacting transactions shows an average impact of {avg_impact:.2f}% "
        impact_summary += f"(range: {min_impact:.2f}% to {max_impact:.2f}%). "
        impact_summary += f"Found {significant_moves} significant price moves and {high_impact_moves} high-impact transactions. "
        if positive_impacts > negative_impacts:
            impact_summary += f"There is a bias towards positive price impact ({positive_impacts} positive vs {negative_impacts} negative)."
        elif negative_impacts > positive_impacts:
            impact_summary += f"There is a bias towards negative price impact ({negative_impacts} negative vs {positive_impacts} positive)."
        else:
            impact_summary += "The price impact is balanced between positive and negative moves."
        
        # Create enhanced main visualization
        main_fig = go.Figure()
        
        # Add scatter plot for impact
        main_fig.add_trace(go.Scatter(
            x=impact_df['timestamp'],
            y=impact_df['impact_pct'],
            mode='markers+lines',
            marker=dict(
                size=impact_df['impact_pct'].abs() * 1.5 + 5,
                color=impact_df['impact_pct'],
                colorscale='RdBu_r',
                line=dict(width=1),
                symbol=['circle' if val >= 0 else 'diamond' for val in impact_df['impact_pct']]
            ),
            text=[
                f"TX: {tx[:8]}...{tx[-6:]}<br>" +
                f"Impact: {impact:.2f}%<br>" +
                f"Token: {token} ({amount:.4f})<br>" +
                f"From: {src[:6]}...{src[-4:]}<br>" +
                f"To: {dst[:6]}...{dst[-4:]}"
                for tx, impact, token, amount, src, dst in zip(
                    impact_df['transaction_hash'], 
                    impact_df['impact_pct'],
                    impact_df['token_symbol'],
                    impact_df['token_amount'],
                    impact_df['from'],
                    impact_df['to']
                )
            ],
            hovertemplate='%{text}<br>Time: %{x}<extra></extra>',
            name='Price Impact'
        ))
        
        # Add a moving average trendline
        window_size = max(3, len(impact_df) // 10)  # Dynamic window size
        if len(impact_df) >= window_size:
            impact_df['ma'] = impact_df['impact_pct'].rolling(window=window_size, min_periods=1).mean()
            main_fig.add_trace(go.Scatter(
                x=impact_df['timestamp'],
                y=impact_df['ma'],
                mode='lines',
                line=dict(width=2, color='rgba(255,165,0,0.7)'),
                name=f'Moving Avg ({window_size} period)'
            ))
        
        # Add a zero line for reference
        main_fig.add_shape(
            type='line',
            x0=impact_df['timestamp'].min(),
            y0=0,
            x1=impact_df['timestamp'].max(),
            y1=0,
            line=dict(color='gray', width=1, dash='dash')
        )
        
        # Add colored regions for significant impact
        
        # Add green band for normal price movement
        main_fig.add_shape(
            type='rect',
            x0=impact_df['timestamp'].min(),
            y0=-significant_threshold,
            x1=impact_df['timestamp'].max(),
            y1=significant_threshold,
            fillcolor='rgba(0,255,0,0.1)',
            line=dict(width=0),
            layer='below'
        )
        
        # Add warning bands for higher impact movements
        main_fig.add_shape(
            type='rect',
            x0=impact_df['timestamp'].min(),
            y0=significant_threshold,
            x1=impact_df['timestamp'].max(),
            y1=high_impact_threshold,
            fillcolor='rgba(255,255,0,0.1)',
            line=dict(width=0),
            layer='below'
        )
        
        main_fig.add_shape(
            type='rect',
            x0=impact_df['timestamp'].min(),
            y0=-high_impact_threshold,
            x1=impact_df['timestamp'].max(),
            y1=-significant_threshold,
            fillcolor='rgba(255,255,0,0.1)',
            line=dict(width=0),
            layer='below'
        )
        
        # Add high impact regions
        main_fig.add_shape(
            type='rect',
            x0=impact_df['timestamp'].min(),
            y0=high_impact_threshold,
            x1=impact_df['timestamp'].max(),
            y1=max(high_impact_threshold * 2, max_impact * 1.1),
            fillcolor='rgba(255,0,0,0.1)',
            line=dict(width=0),
            layer='below'
        )
        
        main_fig.add_shape(
            type='rect',
            x0=impact_df['timestamp'].min(),
            y0=min(high_impact_threshold * -2, min_impact * 1.1),
            x1=impact_df['timestamp'].max(),
            y1=-high_impact_threshold,
            fillcolor='rgba(255,0,0,0.1)',
            line=dict(width=0),
            layer='below'
        )
        
        main_fig.update_layout(
            title='Price Impact of Whale Transactions',
            xaxis_title='Timestamp',
            yaxis_title='Price Impact (%)',
            hovermode='closest',
            template="plotly_white",
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
            margin=dict(l=20, r=20, t=50, b=20)
        )
        
        # Create impact distribution histogram
        dist_fig = px.histogram(
            impact_df['impact_pct'], 
            nbins=20,
            labels={'value': 'Price Impact (%)', 'count': 'Frequency'},
            title='Distribution of Price Impact',
            color_discrete_sequence=['#3366CC']
        )
        
        # Add a vertical line at the mean
        dist_fig.add_vline(x=avg_impact, line_dash="dash", line_color="red")
        dist_fig.add_annotation(x=avg_impact, y=0.85, yref="paper", text=f"Mean: {avg_impact:.2f}%",
                              showarrow=True, arrowhead=2, arrowcolor="red", ax=40)
        
        # Add a vertical line at zero
        dist_fig.add_vline(x=0, line_dash="solid", line_color="black")
        
        dist_fig.update_layout(
            template="plotly_white",
            bargap=0.1,
            height=350
        )
        
        # Create cumulative impact chart
        cumul_fig = go.Figure()
        cumul_fig.add_trace(go.Scatter(
            x=impact_df['timestamp'],
            y=impact_df['cumulative_impact'],
            mode='lines',
            fill='tozeroy',
            line=dict(width=2, color='#2ca02c'),
            name='Cumulative Impact'
        ))
        
        cumul_fig.update_layout(
            title='Cumulative Price Impact Over Time',
            xaxis_title='Timestamp',
            yaxis_title='Cumulative Price Impact (%)',
            template="plotly_white",
            height=350
        )
        
        # Create hourly impact analysis
        hourly_impact = impact_df.groupby('hour')['impact_pct'].agg(['mean', 'count', 'std']).reset_index()
        hourly_impact = hourly_impact.sort_values('hour')
        
        hour_fig = go.Figure()
        hour_fig.add_trace(go.Bar(
            x=hourly_impact['hour'],
            y=hourly_impact['mean'],
            error_y=dict(type='data', array=hourly_impact['std'], visible=True),
            marker_color=hourly_impact['mean'].apply(lambda x: 'green' if x > 0 else 'red'),
            name='Average Impact'
        ))
        
        hour_fig.update_layout(
            title='Price Impact by Hour of Day',
            xaxis_title='Hour of Day',
            yaxis_title='Average Price Impact (%)',
            template="plotly_white",
            height=350,
            xaxis=dict(tickmode='linear', tick0=0, dtick=2)
        )
        
        # Join impact metrics back onto the original transactions via a
        # shared timestamp key (transactions with identical timestamps will
        # fan out, one row per match)
        transactions_df = transactions_df.copy()
        transactions_df['Timestamp_key'] = transactions_df[timestamp_col]
        impact_df['Timestamp_key'] = impact_df['timestamp']
        
        merged_df = pd.merge(
            transactions_df, 
            impact_df[['Timestamp_key', 'impact_pct', 'pre_price', 'post_price', 'cumulative_impact']], 
            on='Timestamp_key', 
            how='left'
        )
        
        # Final result with enhanced output
        return {
            'avg_impact_pct': avg_impact,
            'max_impact_pct': max_impact,
            'min_impact_pct': min_impact,
            'median_impact_pct': median_impact,
            'std_impact_pct': std_impact,
            'significant_moves_count': significant_moves,
            'high_impact_moves_count': high_impact_moves,
            'positive_impacts_count': positive_impacts,
            'negative_impacts_count': negative_impacts,
            'total_transactions': len(transactions_df),
            'charts': {
                'main_chart': main_fig,
                'impact_distribution': dist_fig,
                'cumulative_impact': cumul_fig,
                'hourly_impact': hour_fig
            },
            'transactions_with_impact': merged_df,
            'insights': insights,
            'impact_summary': impact_summary
        }
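
    # Consumption sketch for the dict returned above (hypothetical variable
    # name `result`; the keys are exactly the ones assembled in the return):
    #
    #   result['charts']['main_chart'].show()           # scatter + impact bands
    #   result['charts']['impact_distribution'].show()  # histogram with mean line
    #   result['charts']['cumulative_impact'].show()    # running total
    #   result['charts']['hourly_impact'].show()        # per-hour bar chart
    #   print(f"avg impact: {result['avg_impact_pct']:.2f}%")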
    
    def detect_wash_trading(self, 
                           transactions_df: pd.DataFrame, 
                           addresses: List[str],
                           time_window_minutes: int = 60,
                           sensitivity: str = "Medium") -> List[Dict[str, Any]]:
        """
        Detect potential wash trading between addresses
        
        Args:
            transactions_df: DataFrame of transactions
            addresses: List of addresses to analyze
            time_window_minutes: Time window for detecting wash trades
            sensitivity: Detection sensitivity ("Low", "Medium", "High")
            
        Returns:
            List of potential wash trading incidents
        """
        if transactions_df.empty or not addresses:
            return []
        
        # Ensure from/to columns exist
        if 'From' in transactions_df.columns and 'To' in transactions_df.columns:
            from_col, to_col = 'From', 'To'
        elif 'from' in transactions_df.columns and 'to' in transactions_df.columns:
            from_col, to_col = 'from', 'to'
        else:
            raise ValueError("From/To columns not found in transactions DataFrame")
            
        # Ensure timestamp column exists
        if 'Timestamp' in transactions_df.columns:
            timestamp_col = 'Timestamp'
        elif 'timeStamp' in transactions_df.columns:
            timestamp_col = 'timeStamp'
        else:
            raise ValueError("Timestamp column not found in transactions DataFrame")
        
        # Ensure timestamps are datetimes; work on a copy so the caller's
        # DataFrame is not mutated
        transactions_df = transactions_df.copy()
        if not pd.api.types.is_datetime64_any_dtype(transactions_df[timestamp_col]):
            transactions_df[timestamp_col] = pd.to_datetime(transactions_df[timestamp_col])
        
        # Define sensitivity thresholds; the gap allowed between the two legs
        # of a cycle is capped at the caller-supplied time_window_minutes
        if sensitivity == "Low":
            min_cycles = 3  # Minimum number of back-and-forth transactions
            max_time_diff = min(120, time_window_minutes)  # Max minutes between legs
        elif sensitivity == "Medium":
            min_cycles = 2
            max_time_diff = min(60, time_window_minutes)
        else:  # High
            min_cycles = 1
            max_time_diff = min(30, time_window_minutes)
        
        # Filter transactions involving the addresses
        address_txs = transactions_df[
            (transactions_df[from_col].isin(addresses)) | 
            (transactions_df[to_col].isin(addresses))
        ].copy()
        
        if address_txs.empty:
            return []
        
        # Sort by timestamp
        address_txs = address_txs.sort_values(by=timestamp_col)
        
        # Detect cycles of transactions between the same addresses. Each
        # ordered direction is evaluated separately, so a single pair can
        # surface as two incidents (one per initiating side).
        wash_trades = []
        
        for addr1 in addresses:
            for addr2 in addresses:
                if addr1 == addr2:
                    continue
                    
                # Find transactions from addr1 to addr2
                a1_to_a2 = address_txs[
                    (address_txs[from_col] == addr1) & 
                    (address_txs[to_col] == addr2)
                ]
                
                # Find transactions from addr2 to addr1
                a2_to_a1 = address_txs[
                    (address_txs[from_col] == addr2) & 
                    (address_txs[to_col] == addr1)
                ]
                
                if a1_to_a2.empty or a2_to_a1.empty:
                    continue
                
                # Check for back-and-forth patterns
                cycles = 0
                evidence = []
                
                for _, tx1 in a1_to_a2.iterrows():
                    tx1_time = tx1[timestamp_col]
                    
                    # Find return transactions within the time window
                    return_txs = a2_to_a1[
                        (a2_to_a1[timestamp_col] > tx1_time) & 
                        (a2_to_a1[timestamp_col] <= tx1_time + pd.Timedelta(minutes=max_time_diff))
                    ]
                    
                    if not return_txs.empty:
                        cycles += 1
                        # Record the outgoing leg and the earliest return
                        # leg as evidence for this cycle
                        evidence.append(tx1)
                        evidence.append(return_txs.iloc[0])
                
                if cycles >= min_cycles:
                    # Create visualization; px.scatter needs an actual column
                    # name, so pick the first amount-like column that exists
                    # and skip the chart when none do
                    fig = None
                    if evidence:
                        evidence_df = pd.DataFrame(evidence)
                        amount_col = next(
                            (c for c in ('Amount', 'tokenAmount', 'value')
                             if c in evidence_df.columns),
                            None
                        )
                        if amount_col is not None:
                            fig = px.scatter(
                                evidence_df,
                                x=timestamp_col,
                                y=amount_col,
                                color=from_col,
                                title=f"Potential Wash Trading Between {addr1[:8]}... and {addr2[:8]}..."
                            )
                    
                    wash_trades.append({
                        "type": "Wash Trading",
                        "addresses": [addr1, addr2],
                        "risk_level": "High" if cycles >= min_cycles * 2 else "Medium",
                        "description": f"Detected {cycles} cycles of back-and-forth transactions between addresses",
                        "detection_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "title": f"Wash Trading Pattern ({cycles} cycles)",
                        "evidence": pd.DataFrame(evidence) if evidence else None,
                        "chart": fig
                    })
        
        return wash_trades
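
    # Usage sketch (hypothetical instance and wallet values; `detector` and
    # `txs` are illustrative names, not part of this module):
    #
    #   incidents = detector.detect_wash_trading(
    #       txs,
    #       addresses=['0xabc...', '0xdef...'],
    #       time_window_minutes=30,
    #       sensitivity='High',
    #   )
    #   for inc in incidents:
    #       print(inc['title'], inc['risk_level'])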
    
    def detect_pump_and_dump(self, 
                            transactions_df: pd.DataFrame,
                            price_data: Dict[str, Dict[str, Any]],
                            sensitivity: str = "Medium") -> List[Dict[str, Any]]:
        """
        Detect potential pump and dump schemes
        
        Args:
            transactions_df: DataFrame of transactions
            price_data: Dictionary of price impact data for each transaction
            sensitivity: Detection sensitivity ("Low", "Medium", "High")
            
        Returns:
            List of potential pump and dump incidents
        """
        if transactions_df.empty or not price_data:
            return []
        
        # Ensure timestamp column exists
        if 'Timestamp' in transactions_df.columns:
            timestamp_col = 'Timestamp'
        elif 'timeStamp' in transactions_df.columns:
            timestamp_col = 'timeStamp'
        else:
            raise ValueError("Timestamp column not found in transactions DataFrame")
        
        # Ensure from/to columns exist
        if 'From' in transactions_df.columns and 'To' in transactions_df.columns:
            from_col, to_col = 'From', 'To'
        elif 'from' in transactions_df.columns and 'to' in transactions_df.columns:
            from_col, to_col = 'from', 'to'
        else:
            raise ValueError("From/To columns not found in transactions DataFrame")
        
        # Ensure timestamps are datetimes; work on a copy so the caller's
        # DataFrame is not mutated
        transactions_df = transactions_df.copy()
        if not pd.api.types.is_datetime64_any_dtype(transactions_df[timestamp_col]):
            transactions_df[timestamp_col] = pd.to_datetime(transactions_df[timestamp_col])
        
        # Define sensitivity thresholds
        if sensitivity == "Low":
            accumulation_threshold = 5  # Number of buys to consider accumulation
            pump_threshold = 10.0  # % price increase to trigger pump
            dump_threshold = -8.0  # % price decrease to trigger dump
        elif sensitivity == "Medium":
            accumulation_threshold = 3
            pump_threshold = 7.0
            dump_threshold = -5.0
        else:  # High
            accumulation_threshold = 2
            pump_threshold = 5.0
            dump_threshold = -3.0
        
        # Combine price impact data with transactions
        txs_with_impact = []
        
        for _, row in transactions_df.iterrows():
            tx_hash = row.get('Transaction Hash', row.get('hash', None))
            if not tx_hash or tx_hash not in price_data:
                continue
            
            tx_impact = price_data[tx_hash]
            
            # Skip transactions whose impact could not be computed
            if tx_impact.get('impact_pct') is None:
                continue
                
            txs_with_impact.append({
                'transaction_hash': tx_hash,
                'timestamp': row[timestamp_col],
                'from': row[from_col],
                'to': row[to_col],
                'pre_price': tx_impact['pre_price'],
                'post_price': tx_impact['post_price'],
                'impact_pct': tx_impact['impact_pct']
            })
        
        if not txs_with_impact:
            return []
            
        impact_df = pd.DataFrame(txs_with_impact)
        impact_df = impact_df.sort_values(by='timestamp')
        
        # Look for accumulation phases followed by price pumps and then dumps
        pump_and_dumps = []
        
        # Group by address to analyze per wallet
        address_groups = {}
        
        for from_addr in impact_df['from'].unique():
            address_groups[from_addr] = impact_df[impact_df['from'] == from_addr]
        
        for to_addr in impact_df['to'].unique():
            if to_addr in address_groups:
                # Keep each per-address frame chronological and avoid
                # double-counting transactions where the address is both
                # sender and receiver
                address_groups[to_addr] = pd.concat([
                    address_groups[to_addr],
                    impact_df[impact_df['to'] == to_addr]
                ]).drop_duplicates(subset='transaction_hash').sort_values(by='timestamp')
            else:
                address_groups[to_addr] = impact_df[impact_df['to'] == to_addr]
        
        for address, addr_df in address_groups.items():
            # Skip wallets without enough activity to form an
            # accumulation -> pump -> dump sequence
            if len(addr_df) < accumulation_threshold + 2:
                continue
                
            # Slide a window over the wallet's transactions, looking for a
            # sustained price increase followed by a sharp drop (overlapping
            # windows can flag the same episode more than once)
            window_size = min(len(addr_df), 10)
            for i in range(len(addr_df) - window_size + 1):
                window = addr_df.iloc[i:i+window_size]
                
                # Get cumulative price change in window
                if len(window) >= 2:
                    first_price = window.iloc[0]['pre_price']
                    last_price = window.iloc[-1]['post_price']
                    
                    if first_price is None or last_price is None or first_price == 0:
                        continue
                        
                    cumulative_change = ((last_price - first_price) / first_price) * 100
                    
                    # Check for the pump phase. idxmax() on the raw slice
                    # returns a label from the parent DataFrame, so reset the
                    # index to get a position inside the window.
                    post_prices = pd.to_numeric(window['post_price'], errors='coerce').reset_index(drop=True)
                    max_price = post_prices.max()
                    max_pos = int(post_prices.idxmax())
                    
                    if max_pos < len(window) - 1 and max_price:
                        max_to_end = ((window.iloc[-1]['post_price'] - max_price) / max_price) * 100
                        
                        # If we have a pump followed by a dump
                        if ((cumulative_change > pump_threshold or
                             (window['impact_pct'] > pump_threshold).any())
                                and max_to_end < dump_threshold):
                            
                            # Create chart
                            fig = go.Figure()
                            
                            # Plot the price path: each trade contributes a
                            # (pre_price, post_price) pair, offset one minute
                            # either side of the trade so the step is visible.
                            # Keeping datetimes (rather than epoch seconds)
                            # keeps the x-axis readable.
                            prices = []
                            for _, w_row in window.iterrows():
                                prices.append(w_row['pre_price'])
                                prices.append(w_row['post_price'])
                            
                            times_expanded = []
                            for t in window['timestamp']:
                                times_expanded.append(t - pd.Timedelta(minutes=1))
                                times_expanded.append(t + pd.Timedelta(minutes=1))
                            
                            fig.add_trace(go.Scatter(
                                x=times_expanded,
                                y=prices,
                                mode='lines+markers',
                                name='Price',
                                line=dict(color='blue')
                            ))
                            
                            # Highlight pump and dump phases, split at the
                            # peak (each trade contributes two points, hence
                            # the *2 stride)
                            pump_x = times_expanded[:max_pos*2+2]
                            pump_y = prices[:max_pos*2+2]
                            
                            dump_x = times_expanded[max_pos*2:]
                            dump_y = prices[max_pos*2:]
                            
                            fig.add_trace(go.Scatter(
                                x=pump_x,
                                y=pump_y,
                                mode='lines',
                                line=dict(color='green', width=3),
                                name='Pump Phase'
                            ))
                            
                            fig.add_trace(go.Scatter(
                                x=dump_x,
                                y=dump_y,
                                mode='lines',
                                line=dict(color='red', width=3),
                                name='Dump Phase'
                            ))
                            
                            fig.update_layout(
                                title='Potential Pump and Dump Pattern',
                                xaxis_title='Time',
                                yaxis_title='Price',
                                hovermode='closest'
                            )
                            
                            pump_and_dumps.append({
                                "type": "Pump and Dump",
                                "addresses": [address],
                                "risk_level": "High" if max_to_end < dump_threshold * 1.5 else "Medium",
                                "description": f"Price pumped {cumulative_change:.2f}% before dropping {max_to_end:.2f}%",
                                "detection_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                "title": f"Pump ({cumulative_change:.1f}%) and Dump ({max_to_end:.1f}%)",
                                "evidence": window,
                                "chart": fig
                            })
        
        return pump_and_dumps
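
    # Usage sketch (hypothetical names; `price_data` shaped as described at
    # the top of the method, e.g. built from the price-impact analysis):
    #
    #   incidents = detector.detect_pump_and_dump(txs, price_data, sensitivity='Medium')
    #   for inc in incidents:
    #       print(inc['title'], inc['risk_level'])
    #       if inc['chart'] is not None:
    #           inc['chart'].show()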