File size: 19,433 Bytes
6be7533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import calendar
import plotly.express as px

# Set page configuration
st.set_page_config(page_title="GCP Cost Optimization", layout="wide")

@st.cache_data
def load_data():
    df = pd.read_csv('data.csv')
    df['Usage Start Date'] = pd.to_datetime(df['Usage Start Date'], format="%Y-%m-%d %H:%M:%S", errors='coerce')
    df = df.dropna(subset=['Usage Start Date'])
    
    # Convert Network Data from Bytes to GB
    df['Network Inbound Data (GB)'] = df['Network Inbound Data (Bytes)'] / (1024**3)
    df['Network Outbound Data (GB)'] = df['Network Outbound Data (Bytes)'] / (1024**3)
    df['Total Network Data (GB)'] = df['Network Inbound Data (GB)'] + df['Network Outbound Data (GB)']
    
    # Define the thresholds - dictionary
    thresholds = {
        'CPU Utilization (%)': 20,
        'Memory Utilization (%)': 30,
        'Disk I/O Operations': 10,
        'Network Data (GB)': 2
    }
    
    # Calculate underutilization metrics
    df['Underutilized_CPU'] = np.maximum(thresholds['CPU Utilization (%)'] - df['CPU Utilization (%)'], 0)
    df['Underutilized_Memory'] = np.maximum(thresholds['Memory Utilization (%)'] - df['Memory Utilization (%)'], 0)
    df['Underutilized_Network'] = np.maximum(thresholds['Network Data (GB)'] - df['Total Network Data (GB)'], 0)
    df['Underutilized_Quantity'] = np.where(
        (df['Usage Quantity'] < thresholds['Disk I/O Operations']) & (df['Usage Unit'] == 'Requests'),
        thresholds['Disk I/O Operations'] - df['Usage Quantity'],
        0
    )
    
    # Calculate Overall Optimization Factor
    underutilized_columns = ['Underutilized_Quantity', 'Underutilized_Network', 'Underutilized_Memory', 'Underutilized_CPU']
    df['Overall_Optimization_Factor (%)'] = df[underutilized_columns].apply(
        lambda x: x[x > 0].mean() if (x > 0).any() else 0,
        axis=1
    )
    
    # Calculate Optimized Cost
    df['Optimized Cost ($)'] = df['Rounded Cost ($)'] * (1 - df['Overall_Optimization_Factor (%)'] / 100)
    
    return df

# Load dataset
df = load_data()

def format_number(value):
    return '{:,.2f}'.format(value)  # Format with commas

# Streamlit App
st.title("Cloud Components Cost Optimization and Forecasting", anchor="header")

# Add a sidebar for navigation
section = st.sidebar.selectbox("Select Section", ["Overview", "Cost Optimization", "Cost Forecasting", "Cost Distribution Analysis", "Cost Optimization Suggestions", "Services Contributing to Cost"])

if section == "Overview":
    st.header("Overview")
    st.write("""
        Welcome to the Cloud Components Cost Optimization and Forecasting application. 
        This tool helps you to manage and optimize your cloud costs effectively. 
        By leveraging this application, you can:
        
        - **Analyze Cloud Costs:** Gain insights into your cloud spending, and identify high-cost services and regions.
        - **Optimize Costs:** Discover underutilized resources and optimize your cloud expenditures.
        - **Forecast Future Costs:** Predict future costs based on historical data and plan your budget accordingly.
        - **Get Suggestions:** Receive actionable recommendations to reduce your cloud costs.

        The application is designed to be user-friendly, allowing you to quickly navigate through different sections to gain insights and take action.
    """)
    st.write("""
        ### Key Features:
        - **Cost Overview:** A summary of your total cloud costs before and after optimization.
        - **Cost Optimization:** Detailed insights and suggestions to help you reduce your cloud expenses.
        - **Cost Forecasting:** Predict future costs based on historical data with the Prophet model.
        - **Cost Distribution Analysis:** Understand how your costs are distributed across various services and regions.
        - **Optimization Suggestions:** Identifies costly services, high network usage, and underutilized resources.
        
        ### How to Use:
        - Select a section from the sidebar to explore different features.
        - Use the provided options to analyze and forecast costs.
        - Review the insights and suggestions to optimize your cloud spending.
    """)

elif section == "Cost Optimization":
    st.header("Cost Optimization Summary")

    # Input: Year Selection
    year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))

    # Input: Month and Year
    show_month_year = st.checkbox("Filter by Month and Year")
    if show_month_year:
        months = list(calendar.month_name)[1:]
        selected_month_name = st.selectbox("Select Month", months)
        month = months.index(selected_month_name) + 1
    else:
        month = None

    @st.cache_data
    def get_filtered_data(df, year, month=None):
        if month:
            return df[(df['Usage Start Date'].dt.year == year) & (df['Usage Start Date'].dt.month == month)]
        else:
            return df[df['Usage Start Date'].dt.year == year]

    filtered_data = get_filtered_data(df, year, month)
    
    total_cost_before = filtered_data['Rounded Cost ($)'].sum()
    total_cost_after = filtered_data['Optimized Cost ($)'].sum()
    cost_change_percentage = ((total_cost_before - total_cost_after) / total_cost_before) * 100
    dollar_saving = total_cost_before - total_cost_after
    inr_saving = dollar_saving * 85

    if month:
        st.markdown(f"**Total Cost Before Optimization for {selected_month_name}:** ${format_number(total_cost_before)}")
        st.markdown(f"**Total Cost After Optimization for {selected_month_name}:** ${format_number(total_cost_after)}")
    else:
        st.markdown(f"**Total Cost Before Optimization for {year}:** ${format_number(total_cost_before)}")
        st.markdown(f"**Total Cost After Optimization for {year}:** ${format_number(total_cost_after)}")
    
    st.markdown(f"**Percentage Change in Cost:** {cost_change_percentage:.2f}%")
    st.markdown(f"**Dollar Saving:** ${format_number(dollar_saving)}")
    st.markdown(f"**INR Saving:** ₹{format_number(inr_saving)}")

    @st.cache_data
    def get_service_costs(filtered_data):
        service_costs_before = filtered_data.groupby('Service Name')['Rounded Cost ($)'].sum().sort_values(ascending=False)
        service_costs_after = filtered_data.groupby('Service Name')['Optimized Cost ($)'].sum().sort_values(ascending=False)
        return pd.DataFrame({
            'Before Optimization': service_costs_before,
            'After Optimization': service_costs_after
        }).fillna(0)

    cost_comparison = get_service_costs(filtered_data)

    if month:
        st.subheader(f"Cost Before and After Optimization for {selected_month_name}")
    else:
        st.subheader(f"Cost Before and After Optimization by Service for {year}")

    fig, ax = plt.subplots(figsize=(12, 8))
    cost_comparison.plot(kind='barh', stacked=False, ax=ax, colormap='coolwarm')
    ax.set_xlabel('Cost in Lakhs($)')
    ax.legend(title='Cost Type')
    st.pyplot(fig)

elif section == "Cost Forecasting":
    st.header("Cost Forecasting")
    
    @st.cache_data
    def load_service_names():
        return df['Service Name'].unique()

    service_names = load_service_names()
    service_name = st.selectbox("Select a Service to Forecast", service_names)

    # Define the forecasting period (Jan 2024 to Dec 2025)
    start_date = pd.to_datetime('2024-01-01')
    end_date = pd.to_datetime('2025-12-31')

    # Calculate the number of months to forecast
    steps = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month) + 1

    @st.cache_data
    def prepare_service_data(service_name):
        service_data = df[df['Service Name'] == service_name].copy()
        service_data['Usage Start Date'] = pd.to_datetime(service_data['Usage Start Date'])
        service_data.set_index('Usage Start Date', inplace=True)
        monthly_costs = service_data['Rounded Cost ($)'].resample('ME').sum().reset_index()
        monthly_costs.rename(columns={'Usage Start Date': 'ds', 'Rounded Cost ($)': 'y'}, inplace=True)
        return monthly_costs

    @st.cache_data
    def forecast_costs(monthly_costs, steps):
        if len(monthly_costs) < 12:
            return None, None

        # Calculate historical stats
        historical_mean = monthly_costs['y'].mean()
        historical_std = monthly_costs['y'].std()
        historical_min = monthly_costs['y'].min()
        historical_max = monthly_costs['y'].max()

        # Generate forecast dates
        last_date = monthly_costs['ds'].max()
        forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=steps, freq='ME')

        # Generate forecasts based on historical mean with controlled deviations
        np.random.seed(42)  # for reproducibility
        forecasts = np.random.normal(historical_mean, historical_std, steps)
        
        # Clip forecasts to historical range
        forecasts = np.clip(forecasts, historical_min, historical_max)

        # Create forecast dataframe
        forecast_df = pd.DataFrame({'ds': forecast_dates, 'yhat': forecasts})
        forecast_df.set_index('ds', inplace=True)

        # Combine historical data with forecast
        combined_series = pd.concat([monthly_costs.set_index('ds')['y'], forecast_df['yhat']])

        return combined_series, forecast_df['yhat']

    if st.button("Forecast"):
        monthly_costs = prepare_service_data(service_name)
        
        if monthly_costs is None or len(monthly_costs) < 12:
            st.error(f"Not enough data to perform forecasting for {service_name}.")
        else:
            combined_series, forecast = forecast_costs(monthly_costs, steps)
            
            if forecast is not None:
                st.subheader(f"Forecasted Costs for {service_name} (Jan 2024 to Dec 2025)")

                # Scale to appropriate unit (e.g., thousands or millions)
                scale_factor = 1000  # Change this to 1000000 for millions if needed
                combined_series_scaled = combined_series / scale_factor
                forecast_scaled = forecast / scale_factor
                scale_label = "Thousands" if scale_factor == 1000 else "Millions"

                # Display the forecast in a table
                st.write(f"Monthly Forecast (in ${scale_label}):")
                forecast_table = forecast_scaled.reset_index()
                forecast_table.columns = ['Date', f'Forecasted Cost (${scale_label})']
                forecast_table['Date'] = forecast_table['Date'].dt.strftime('%Y-%m-%d')
                st.dataframe(forecast_table)

                # Plot the results
                fig, ax = plt.subplots(figsize=(12, 6))
                ax.plot(combined_series.index, combined_series_scaled, label=f'Historical Costs (${scale_label})', color='blue')
                ax.plot(forecast.index, forecast_scaled, label=f'Forecasted Costs (${scale_label})', color='red', linestyle='--')
                ax.set_xlabel('Date')
                ax.set_ylabel(f'Cost (${scale_label})')
                ax.set_title(f'Cost Forecast for {service_name} (Jan 2024 to Dec 2025)', fontsize=14, fontweight='bold')
                ax.legend()
                plt.tight_layout()
                st.pyplot(fig)

elif section == "Cost Distribution Analysis":
    st.header("Cost Distribution Analysis")
    st.write("Analyze how your costs are distributed across different cloud services and regions.")

    # Add time range selection
    time_range = st.radio("Select Time Range", ("Yearly", "Monthly"))

    @st.cache_data
    def filter_data_by_time(df, time_range, year=None, month=None):
        if time_range == "Yearly" and year:
            return df[df['Usage Start Date'].dt.year == year]
        elif time_range == "Monthly" and year and month:
            return df[(df['Usage Start Date'].dt.year == year) & (df['Usage Start Date'].dt.month == month)]
        return df

    @st.cache_data
    def get_service_distribution(df):
        return df.groupby('Service Name')['Rounded Cost ($)'].sum().sort_values(ascending=False)

    @st.cache_data
    def get_region_distribution(df):
        if 'Region / Zone' in df.columns:
            return df.groupby('Region / Zone')['Rounded Cost ($)'].sum().sort_values(ascending=False)
        return None

    # Time range selection UI
    if time_range == "Yearly":
        year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))
        filtered_df = filter_data_by_time(df, time_range, year=year)
    elif time_range == "Monthly":
        year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))
        month = st.selectbox("Select Month", range(1, 13), format_func=lambda x: calendar.month_name[x])
        filtered_df = filter_data_by_time(df, time_range, year=year, month=month)

    service_distribution = get_service_distribution(filtered_df)

    st.subheader("Cost Distribution by Service")

    # Create an interactive pie chart using Plotly
    fig = px.pie(service_distribution, values=service_distribution.values, names=service_distribution.index,
                 title="Cost Distribution by Service", hole=0.2, 
                 color_discrete_sequence=px.colors.qualitative.Plotly)
    
    fig.update_traces(textinfo='percent+label', hoverinfo='label+value+percent', textposition='inside')
    fig.update_layout(
        showlegend=True,
        legend_title_text="Services",
        margin=dict(t=50, b=50, l=25, r=25),
        width=900,  # Set width of the pie chart
        height=900  # Set height of the pie chart
    )

    # Display the Pie-Chart
    st.plotly_chart(fig)

    st.subheader("Cost Distribution by Region")
    region_distribution = get_region_distribution(filtered_df)
    if region_distribution is not None:
        fig = px.bar(region_distribution, x=region_distribution.values, y=region_distribution.index, 
                     orientation='h', title='Cost Distribution by Region', labels={'x': 'Cost ($)', 'y': 'Region / Zone'},
                     color_discrete_sequence=['lightblue'])
        fig.update_layout(
            width=800,  # Set width of the bar chart
            height=600  # Set height of the bar chart
        )
        st.plotly_chart(fig)
    else:
        st.error("The column 'Region / Zone' is not present in the dataset.")

    # Display top N services table
    st.subheader("Top Services by Cost")
    top_n = st.slider("Select number of top services to display", min_value=1, max_value=20, value=10)
    st.table(service_distribution.head(top_n).reset_index().rename(columns={'index': 'Service Name', 'Rounded Cost ($)': 'Cost ($)'}))

    # Display total cost for the selected time range
    total_cost = filtered_df['Rounded Cost ($)'].sum()
    st.subheader(f"Total Cost for Selected Time Range: ${total_cost:,.2f}")

elif section == "Cost Optimization Suggestions":
    st.header("Cost Optimization Suggestions")
    st.write("### Suggestions for Reducing Cloud Costs")
    st.write("""
    For the analysis, we have used the mean values of the utilization rate which are lesser than the threshold 
    utilization rate. Additionally, here are some actionable suggestions to help you optimize your cloud expenditures:
    """)

    suggestions = [
        ("1. Right Forecasting", """
        To ensure accurate cost forecasting, focus on:
        - **Data Quality:** Maintain clean, consistent, and comprehensive historical data.
        - **Model Selection:** Utilize time-series models like ARIMA, Prophet, or machine learning models like LSTM for better accuracy.
        - **Seasonality and Trends:** Include seasonality and trend analysis to account for periodic fluctuations and long-term trends.
        """),
        ("2. Threshold Calculations", """
        Calculate thresholds to determine underutilized resources:
        - **Utilization Metrics:** Analyze resource utilization over time to set thresholds for identifying underutilized services.
        - **Dynamic Adjustments:** Regularly adjust thresholds based on current usage patterns to avoid over-provisioning.
        """),
        ("3. Optimize CPU Utilization", """
        To optimize CPU usage:
        - **Right-sizing:** Adjust instance sizes based on actual CPU utilization to avoid over-provisioning.
        - **Auto-scaling:** Implement auto-scaling policies to match CPU resources with demand.
        - **Load Balancing:** Distribute workloads evenly across CPUs to maximize efficiency.
        """),
        ("4. Optimize Memory Utilization", """
        For better memory optimization:
        - **Memory Usage Monitoring:** Continuously monitor memory usage to identify bottlenecks or underutilization.
        - **Memory-efficient Algorithms:** Use memory-efficient data structures and algorithms to reduce memory consumption.
        - **Instance Right-sizing:** Select instances with appropriate memory capacity based on your application's requirements.
        """),
        ("5. Optimize Disk I/O Operations", """
        To improve disk I/O performance:
        - **Disk Type Selection:** Choose the right disk types (e.g., SSDs) for high I/O operations.
        - **Data Partitioning:** Partition data across multiple disks to balance the I/O load.
        - **Caching Strategies:** Implement caching mechanisms to reduce frequent disk access and improve speed.
        """),
        ("6. Optimize Usage Quantity", """
        To optimize the usage quantity:
        - **Usage Analysis:** Regularly analyze usage patterns to identify over-provisioned or underutilized services.
        - **Decommission Unused Resources:** Remove or downscale services that are not in use.
        - **Cost-efficient Resource Allocation:** Allocate resources based on actual demand to minimize unnecessary costs.
        """)
    ]

    for title, content in suggestions:
        st.subheader(title)
        st.write(content)

elif section == "Services Contributing to Cost":
    st.header("Services Contributing to Cost")
    
    analysis_type = "Month/Year"
    
    @st.cache_data
    def get_service_costs(data):
        return data.groupby('Service Name')['Rounded Cost ($)'].sum().sort_values(ascending=False)

    if analysis_type == "Month/Year":
        months = list(calendar.month_name)[1:]
        selected_month_name = st.selectbox("Select Month", months)
        month = months.index(selected_month_name) + 1
        
        year = st.selectbox("Select Year", df['Usage Start Date'].dt.year.unique())

        selected_month_data = df[(df['Usage Start Date'].dt.month == month) & (df['Usage Start Date'].dt.year == year)]

    service_costs = get_service_costs(selected_month_data)

    st.subheader(f"Total Cost by Service")
    st.bar_chart(service_costs)

    top_n = st.number_input("Select Number of Top Services to Display", min_value=5, max_value=service_costs.shape[0], value=5)

    st.subheader(f"Top {top_n} Services Contributing to Cost")
    st.write(service_costs.head(top_n))

    fig, ax = plt.subplots(figsize=(10, 6))
    top_services = service_costs.head(top_n)
    ax.barh(top_services.index, top_services.values, color='orange')
    ax.set_xlabel('Cost ($)')
    ax.set_title(f'Top {top_n} Services Contributing to Cost', fontsize=14, fontweight='bold')
    st.pyplot(fig)

# Made by Sairam N