sairamn committed on
Commit
6be7533
·
1 Parent(s): e17c558
Files changed (2) hide show
  1. app.py +411 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import calendar
6
+ import plotly.express as px
7
+
8
+ # Set page configuration
9
+ st.set_page_config(page_title="GCP Cost Optimization", layout="wide")
10
+
11
+ @st.cache_data
12
def load_data(path='data.csv', thresholds=None):
    """Load the billing CSV and derive utilisation and optimised-cost columns.

    Args:
        path: CSV file to read. Defaults to ``'data.csv'`` so existing
            ``load_data()`` calls keep working.
        thresholds: Optional mapping that overrides any of the default
            under-utilisation thresholds (same keys as ``default_thresholds``).

    Returns:
        The DataFrame with rows whose 'Usage Start Date' could not be parsed
        removed, plus these added columns: network volume in GB, the four
        ``Underutilized_*`` shortfalls, 'Overall_Optimization_Factor (%)' and
        'Optimized Cost ($)'.
    """
    default_thresholds = {
        'CPU Utilization (%)': 20,
        'Memory Utilization (%)': 30,
        'Disk I/O Operations': 10,
        'Network Data (GB)': 2,
    }
    limits = {**default_thresholds, **(thresholds or {})}

    df = pd.read_csv(path)
    # Unparseable timestamps become NaT and the corresponding rows are dropped.
    df['Usage Start Date'] = pd.to_datetime(
        df['Usage Start Date'], format="%Y-%m-%d %H:%M:%S", errors='coerce'
    )
    df = df.dropna(subset=['Usage Start Date'])

    # Convert network data from bytes to GB.
    bytes_per_gb = 1024 ** 3
    df['Network Inbound Data (GB)'] = df['Network Inbound Data (Bytes)'] / bytes_per_gb
    df['Network Outbound Data (GB)'] = df['Network Outbound Data (Bytes)'] / bytes_per_gb
    df['Total Network Data (GB)'] = df['Network Inbound Data (GB)'] + df['Network Outbound Data (GB)']

    # Shortfall of each metric below its threshold, floored at zero.
    df['Underutilized_CPU'] = np.maximum(limits['CPU Utilization (%)'] - df['CPU Utilization (%)'], 0)
    df['Underutilized_Memory'] = np.maximum(limits['Memory Utilization (%)'] - df['Memory Utilization (%)'], 0)
    df['Underutilized_Network'] = np.maximum(limits['Network Data (GB)'] - df['Total Network Data (GB)'], 0)
    # The quantity shortfall only applies to rows measured in 'Requests'.
    df['Underutilized_Quantity'] = np.where(
        (df['Usage Quantity'] < limits['Disk I/O Operations']) & (df['Usage Unit'] == 'Requests'),
        limits['Disk I/O Operations'] - df['Usage Quantity'],
        0,
    )

    # Overall factor = mean of the strictly positive shortfalls in each row,
    # or 0 when the row has none. Masking non-positive values to NaN lets the
    # NaN-skipping row mean do this without a Python-level loop.
    underutilized_columns = ['Underutilized_Quantity', 'Underutilized_Network', 'Underutilized_Memory', 'Underutilized_CPU']
    shortfalls = df[underutilized_columns]
    df['Overall_Optimization_Factor (%)'] = shortfalls.where(shortfalls > 0).mean(axis=1).fillna(0)

    # The factor is applied to the billed cost as a percentage reduction.
    df['Optimized Cost ($)'] = df['Rounded Cost ($)'] * (1 - df['Overall_Optimization_Factor (%)'] / 100)

    return df
51
+
52
+ # Load dataset
53
+ df = load_data()
54
+
55
def format_number(value):
    """Render ``value`` with thousands separators and exactly two decimals."""
    return format(value, ',.2f')
57
+
58
+ # Streamlit App
59
+ st.image("https://cognizant.scene7.com/is/content/cognizant/COG-Logo-2022-1?fmt=png-alpha", width=150)
60
+ st.title("Cloud Components Cost Optimization and Forecasting", anchor="header")
61
+
62
+ # Add a sidebar for navigation
63
+ section = st.sidebar.selectbox("Select Section", ["Overview", "Cost Optimization", "Cost Forecasting", "Cost Distribution Analysis", "Cost Optimization Suggestions", "Services Contributing to Cost"])
64
+
65
+ if section == "Overview":
66
+ st.header("Overview")
67
+ st.write("""
68
+ Welcome to the Cloud Components Cost Optimization and Forecasting application.
69
+ This tool helps you to manage and optimize your cloud costs effectively.
70
+ By leveraging this application, you can:
71
+
72
+ - **Analyze Cloud Costs:** Gain insights into your cloud spending, and identify high-cost services and regions.
73
+ - **Optimize Costs:** Discover underutilized resources and optimize your cloud expenditures.
74
+ - **Forecast Future Costs:** Predict future costs based on historical data and plan your budget accordingly.
75
+ - **Get Suggestions:** Receive actionable recommendations to reduce your cloud costs.
76
+
77
+ The application is designed to be user-friendly, allowing you to quickly navigate through different sections to gain insights and take action.
78
+ """)
79
# Key-feature summary and usage notes shown on the Overview page.
# The forecasting bullet describes what forecast_costs in this file does
# (seeded random draws around each service's historical mean, clipped to the
# historical range); it does not use the Prophet model.
st.write("""
### Key Features:
- **Cost Overview:** A summary of your total cloud costs before and after optimization.
- **Cost Optimization:** Detailed insights and suggestions to help you reduce your cloud expenses.
- **Cost Forecasting:** Project future monthly costs from each service's historical average and variability.
- **Cost Distribution Analysis:** Understand how your costs are distributed across various services and regions.
- **Optimization Suggestions:** Identifies costly services, high network usage, and underutilized resources.

### How to Use:
- Select a section from the sidebar to explore different features.
- Use the provided options to analyze and forecast costs.
- Review the insights and suggestions to optimize your cloud spending.
""")
92
+
93
+ elif section == "Cost Optimization":
94
+ st.header("Cost Optimization Summary")
95
+
96
+ # Input: Year Selection
97
+ year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))
98
+
99
+ # Input: Month and Year
100
+ show_month_year = st.checkbox("Filter by Month and Year")
101
+ if show_month_year:
102
+ months = list(calendar.month_name)[1:]
103
+ selected_month_name = st.selectbox("Select Month", months)
104
+ month = months.index(selected_month_name) + 1
105
+ else:
106
+ month = None
107
+
108
+ @st.cache_data
109
def get_filtered_data(df, year, month=None):
    """Return the rows of ``df`` whose 'Usage Start Date' falls in ``year``.

    When ``month`` is truthy the rows are additionally restricted to that
    calendar month number.
    """
    usage_dates = df['Usage Start Date'].dt
    in_year = usage_dates.year == year
    if not month:
        return df[in_year]
    return df[in_year & (usage_dates.month == month)]
114
+
115
# Total the selected period's spend before and after optimisation and report
# the resulting savings.
filtered_data = get_filtered_data(df, year, month)

total_cost_before = filtered_data['Rounded Cost ($)'].sum()
total_cost_after = filtered_data['Optimized Cost ($)'].sum()
dollar_saving = total_cost_before - total_cost_after
# A period with no usage rows sums to 0; report 0% rather than dividing
# 0 by 0, which would render as "nan%".
if total_cost_before:
    cost_change_percentage = (dollar_saving / total_cost_before) * 100
else:
    cost_change_percentage = 0.0
# 85 is the fixed USD -> INR multiplier hard-coded by this app.
inr_saving = dollar_saving * 85

# The headline names the month when a month filter is active, else the year.
period_label = selected_month_name if month else year
st.markdown(f"**Total Cost Before Optimization for {period_label}:** ${format_number(total_cost_before)}")
st.markdown(f"**Total Cost After Optimization for {period_label}:** ${format_number(total_cost_after)}")

st.markdown(f"**Percentage Change in Cost:** {cost_change_percentage:.2f}%")
st.markdown(f"**Dollar Saving:** ${format_number(dollar_saving)}")
st.markdown(f"**INR Saving:** ₹{format_number(inr_saving)}")
133
+
134
+ @st.cache_data
135
def get_service_costs(filtered_data):
    """Total billed and optimised cost per service, most expensive first.

    Args:
        filtered_data: DataFrame with 'Service Name', 'Rounded Cost ($)' and
            'Optimized Cost ($)' columns.

    Returns:
        DataFrame indexed by 'Service Name' with the columns
        'Before Optimization' and 'After Optimization', sorted by
        'Before Optimization' in descending order.
    """
    # Both totals come from one groupby, so they share a single index. Sorting
    # two separate series and re-aligning them in a DataFrame loses the sort
    # order whenever the two orders differ.
    per_service = (
        filtered_data
        .groupby('Service Name')[['Rounded Cost ($)', 'Optimized Cost ($)']]
        .sum()
        .rename(columns={
            'Rounded Cost ($)': 'Before Optimization',
            'Optimized Cost ($)': 'After Optimization',
        })
    )
    return per_service.sort_values('Before Optimization', ascending=False).fillna(0)
142
+
143
# Horizontal bar chart comparing each service's cost before and after
# optimisation for the selected period.
cost_comparison = get_service_costs(filtered_data)

if month:
    st.subheader(f"Cost Before and After Optimization for {selected_month_name}")
else:
    st.subheader(f"Cost Before and After Optimization by Service for {year}")

if cost_comparison.empty:
    # There is nothing to draw for a period with no usage rows.
    st.info("No cost data available for the selected period.")
else:
    fig, ax = plt.subplots(figsize=(12, 8))
    cost_comparison.plot(kind='barh', stacked=False, ax=ax, colormap='coolwarm')
    # Values are plotted as raw dollar totals; nothing rescales them to lakhs.
    ax.set_xlabel('Cost ($)')
    ax.legend(title='Cost Type')
    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate open figures.
    plt.close(fig)
155
+
156
+ elif section == "Cost Forecasting":
157
+ st.header("Cost Forecasting")
158
+
159
+ @st.cache_data
160
def load_service_names():
    """Return the distinct 'Service Name' values of the module-level ``df``."""
    service_column = df['Service Name']
    return service_column.unique()
162
+
163
+ service_names = load_service_names()
164
+ service_name = st.selectbox("Select a Service to Forecast", service_names)
165
+
166
+ # Define the forecasting period (Jan 2024 to Dec 2025)
167
+ start_date = pd.to_datetime('2024-01-01')
168
+ end_date = pd.to_datetime('2025-12-31')
169
+
170
+ # Calculate the number of months to forecast
171
+ steps = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month) + 1
172
+
173
+ @st.cache_data
174
def prepare_service_data(service_name):
    """Build the month-end cost series for one service from the module-level ``df``.

    Returns a DataFrame with columns ``ds`` (resampled 'Usage Start Date')
    and ``y`` (summed 'Rounded Cost ($)').
    """
    rows = df[df['Service Name'] == service_name].copy()
    rows['Usage Start Date'] = pd.to_datetime(rows['Usage Start Date'])
    month_end_totals = (
        rows
        .set_index('Usage Start Date')['Rounded Cost ($)']
        .resample('ME')
        .sum()
        .reset_index()
    )
    return month_end_totals.rename(columns={'Usage Start Date': 'ds', 'Rounded Cost ($)': 'y'})
181
+
182
+ @st.cache_data
183
+ def forecast_costs(monthly_costs, steps):
184
+ if len(monthly_costs) < 12:
185
+ return None, None
186
+
187
+ # Calculate historical stats
188
+ historical_mean = monthly_costs['y'].mean()
189
+ historical_std = monthly_costs['y'].std()
190
+ historical_min = monthly_costs['y'].min()
191
+ historical_max = monthly_costs['y'].max()
192
+
193
+ # Generate forecast dates
194
+ last_date = monthly_costs['ds'].max()
195
+ forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=steps, freq='ME')
196
+
197
+ # Generate forecasts based on historical mean with controlled deviations
198
+ np.random.seed(42) # for reproducibility
199
+ forecasts = np.random.normal(historical_mean, historical_std, steps)
200
+
201
+ # Clip forecasts to historical range
202
+ forecasts = np.clip(forecasts, historical_min, historical_max)
203
+
204
+ # Create forecast dataframe
205
+ forecast_df = pd.DataFrame({'ds': forecast_dates, 'yhat': forecasts})
206
+ forecast_df.set_index('ds', inplace=True)
207
+
208
+ # Combine historical data with forecast
209
+ combined_series = pd.concat([monthly_costs.set_index('ds')['y'], forecast_df['yhat']])
210
+
211
+ return combined_series, forecast_df['yhat']
212
+
213
# On "Forecast": build the selected service's monthly history, project it
# forward, then show the projection as a table and a chart.
if st.button("Forecast"):
    monthly_costs = prepare_service_data(service_name)

    if monthly_costs is None or len(monthly_costs) < 12:
        st.error(f"Not enough data to perform forecasting for {service_name}.")
    else:
        combined_series, forecast = forecast_costs(monthly_costs, steps)

        if forecast is not None:
            # Label the period from the dates actually generated: they start
            # right after the last historical month, whatever that month is.
            period_label = f"{forecast.index.min():%b %Y} to {forecast.index.max():%b %Y}"
            st.subheader(f"Forecasted Costs for {service_name} ({period_label})")

            # Scale to appropriate unit (e.g., thousands or millions)
            scale_factor = 1000  # Change this to 1000000 for millions if needed
            combined_series_scaled = combined_series / scale_factor
            forecast_scaled = forecast / scale_factor
            scale_label = "Thousands" if scale_factor == 1000 else "Millions"

            # Display the forecast in a table
            st.write(f"Monthly Forecast (in ${scale_label}):")
            forecast_table = forecast_scaled.reset_index()
            forecast_table.columns = ['Date', f'Forecasted Cost (${scale_label})']
            forecast_table['Date'] = forecast_table['Date'].dt.strftime('%Y-%m-%d')
            st.dataframe(forecast_table)

            # combined_series is history followed by the projection, so split
            # it to keep the "Historical" line from covering projected months.
            n_history = len(combined_series_scaled) - len(forecast_scaled)
            history_scaled = combined_series_scaled.iloc[:n_history]
            # Start the dashed line at the last observed month so the two
            # lines join without a gap.
            forecast_line = combined_series_scaled.iloc[max(n_history - 1, 0):]

            fig, ax = plt.subplots(figsize=(12, 6))
            ax.plot(history_scaled.index, history_scaled, label=f'Historical Costs (${scale_label})', color='blue')
            ax.plot(forecast_line.index, forecast_line, label=f'Forecasted Costs (${scale_label})', color='red', linestyle='--')
            ax.set_xlabel('Date')
            ax.set_ylabel(f'Cost (${scale_label})')
            ax.set_title(f'Cost Forecast for {service_name} ({period_label})', fontsize=14, fontweight='bold')
            ax.legend()
            fig.tight_layout()
            st.pyplot(fig)
            # Release the figure so repeated clicks do not accumulate open figures.
            plt.close(fig)
247
+
248
+ elif section == "Cost Distribution Analysis":
249
+ st.header("Cost Distribution Analysis")
250
+ st.write("Analyze how your costs are distributed across different cloud services and regions.")
251
+
252
+ # Add time range selection
253
+ time_range = st.radio("Select Time Range", ("Yearly", "Monthly"))
254
+
255
+ @st.cache_data
256
def filter_data_by_time(df, time_range, year=None, month=None):
    """Restrict ``df`` to a year ("Yearly") or to a year and month ("Monthly").

    ``df`` is returned unfiltered when ``time_range`` is neither value, or
    when the year (and, for "Monthly", the month) needed for it is missing.
    """
    by_year = time_range == "Yearly" and bool(year)
    by_month = time_range == "Monthly" and bool(year) and bool(month)
    if not (by_year or by_month):
        return df

    usage_dates = df['Usage Start Date'].dt
    keep = usage_dates.year == year
    if by_month:
        keep = keep & (usage_dates.month == month)
    return df[keep]
262
+
263
+ @st.cache_data
264
def get_service_distribution(df):
    """Sum 'Rounded Cost ($)' per 'Service Name', largest total first."""
    totals = df.groupby('Service Name')['Rounded Cost ($)'].sum()
    return totals.sort_values(ascending=False)
266
+
267
+ @st.cache_data
268
def get_region_distribution(df):
    """Sum 'Rounded Cost ($)' per 'Region / Zone', largest total first.

    Returns ``None`` when ``df`` has no 'Region / Zone' column.
    """
    if 'Region / Zone' not in df.columns:
        return None
    totals = df.groupby('Region / Zone')['Rounded Cost ($)'].sum()
    return totals.sort_values(ascending=False)
272
+
273
+ # Time range selection UI
274
+ if time_range == "Yearly":
275
+ year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))
276
+ filtered_df = filter_data_by_time(df, time_range, year=year)
277
+ elif time_range == "Monthly":
278
+ year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))
279
+ month = st.selectbox("Select Month", range(1, 13), format_func=lambda x: calendar.month_name[x])
280
+ filtered_df = filter_data_by_time(df, time_range, year=year, month=month)
281
+
282
+ service_distribution = get_service_distribution(filtered_df)
283
+
284
+ st.subheader("Cost Distribution by Service")
285
+
286
+ # Create an interactive pie chart using Plotly
287
+ fig = px.pie(service_distribution, values=service_distribution.values, names=service_distribution.index,
288
+ title="Cost Distribution by Service", hole=0.2,
289
+ color_discrete_sequence=px.colors.qualitative.Plotly)
290
+
291
+ fig.update_traces(textinfo='percent+label', hoverinfo='label+value+percent', textposition='inside')
292
+ fig.update_layout(
293
+ showlegend=True,
294
+ legend_title_text="Services",
295
+ margin=dict(t=50, b=50, l=25, r=25),
296
+ width=900, # Set width of the pie chart
297
+ height=900 # Set height of the pie chart
298
+ )
299
+
300
+ # Display the Pie-Chart
301
+ st.plotly_chart(fig)
302
+
303
+ st.subheader("Cost Distribution by Region")
304
+ region_distribution = get_region_distribution(filtered_df)
305
+ if region_distribution is not None:
306
+ fig = px.bar(region_distribution, x=region_distribution.values, y=region_distribution.index,
307
+ orientation='h', title='Cost Distribution by Region', labels={'x': 'Cost ($)', 'y': 'Region / Zone'},
308
+ color_discrete_sequence=['lightblue'])
309
+ fig.update_layout(
310
+ width=800, # Set width of the bar chart
311
+ height=600 # Set height of the bar chart
312
+ )
313
+ st.plotly_chart(fig)
314
+ else:
315
+ st.error("The column 'Region / Zone' is not present in the dataset.")
316
+
317
+ # Display top N services table
318
+ st.subheader("Top Services by Cost")
319
+ top_n = st.slider("Select number of top services to display", min_value=1, max_value=20, value=10)
320
+ st.table(service_distribution.head(top_n).reset_index().rename(columns={'index': 'Service Name', 'Rounded Cost ($)': 'Cost ($)'}))
321
+
322
+ # Display total cost for the selected time range
323
+ total_cost = filtered_df['Rounded Cost ($)'].sum()
324
+ st.subheader(f"Total Cost for Selected Time Range: ${total_cost:,.2f}")
325
+
326
+ elif section == "Cost Optimization Suggestions":
327
+ st.header("Cost Optimization Suggestions")
328
+ st.write("### Suggestions for Reducing Cloud Costs")
329
+ st.write("""
330
+ For the analysis, we have used the mean values of the utilization rate which are lesser than the threshold
331
+ utilization rate. Additionally, here are some actionable suggestions to help you optimize your cloud expenditures:
332
+ """)
333
+
334
+ suggestions = [
335
+ ("1. Right Forecasting", """
336
+ To ensure accurate cost forecasting, focus on:
337
+ - **Data Quality:** Maintain clean, consistent, and comprehensive historical data.
338
+ - **Model Selection:** Utilize time-series models like ARIMA, Prophet, or machine learning models like LSTM for better accuracy.
339
+ - **Seasonality and Trends:** Include seasonality and trend analysis to account for periodic fluctuations and long-term trends.
340
+ """),
341
+ ("2. Threshold Calculations", """
342
+ Calculate thresholds to determine underutilized resources:
343
+ - **Utilization Metrics:** Analyze resource utilization over time to set thresholds for identifying underutilized services.
344
+ - **Dynamic Adjustments:** Regularly adjust thresholds based on current usage patterns to avoid over-provisioning.
345
+ """),
346
+ ("3. Optimize CPU Utilization", """
347
+ To optimize CPU usage:
348
+ - **Right-sizing:** Adjust instance sizes based on actual CPU utilization to avoid over-provisioning.
349
+ - **Auto-scaling:** Implement auto-scaling policies to match CPU resources with demand.
350
+ - **Load Balancing:** Distribute workloads evenly across CPUs to maximize efficiency.
351
+ """),
352
+ ("4. Optimize Memory Utilization", """
353
+ For better memory optimization:
354
+ - **Memory Usage Monitoring:** Continuously monitor memory usage to identify bottlenecks or underutilization.
355
+ - **Memory-efficient Algorithms:** Use memory-efficient data structures and algorithms to reduce memory consumption.
356
+ - **Instance Right-sizing:** Select instances with appropriate memory capacity based on your application's requirements.
357
+ """),
358
+ ("5. Optimize Disk I/O Operations", """
359
+ To improve disk I/O performance:
360
+ - **Disk Type Selection:** Choose the right disk types (e.g., SSDs) for high I/O operations.
361
+ - **Data Partitioning:** Partition data across multiple disks to balance the I/O load.
362
+ - **Caching Strategies:** Implement caching mechanisms to reduce frequent disk access and improve speed.
363
+ """),
364
+ ("6. Optimize Usage Quantity", """
365
+ To optimize the usage quantity:
366
+ - **Usage Analysis:** Regularly analyze usage patterns to identify over-provisioned or underutilized services.
367
+ - **Decommission Unused Resources:** Remove or downscale services that are not in use.
368
+ - **Cost-efficient Resource Allocation:** Allocate resources based on actual demand to minimize unnecessary costs.
369
+ """)
370
+ ]
371
+
372
+ for title, content in suggestions:
373
+ st.subheader(title)
374
+ st.write(content)
375
+
376
+ elif section == "Services Contributing to Cost":
377
+ st.header("Services Contributing to Cost")
378
+
379
+ analysis_type = "Month/Year"
380
+
381
+ @st.cache_data
382
def get_service_costs(data):
    """Sum 'Rounded Cost ($)' per 'Service Name', largest total first."""
    cost_by_service = data.groupby('Service Name')['Rounded Cost ($)'].sum()
    cost_by_service = cost_by_service.sort_values(ascending=False)
    return cost_by_service
384
+
385
# Month/year picker, then the per-service cost chart and a top-N breakdown.
if analysis_type == "Month/Year":
    months = list(calendar.month_name)[1:]
    selected_month_name = st.selectbox("Select Month", months)
    month = months.index(selected_month_name) + 1

# Sorted so the years are listed in the same order as in the other sections.
year = st.selectbox("Select Year", sorted(df['Usage Start Date'].dt.year.unique()))

selected_month_data = df[(df['Usage Start Date'].dt.month == month) & (df['Usage Start Date'].dt.year == year)]

service_costs = get_service_costs(selected_month_data)

if service_costs.empty:
    # No rows for this month/year: there is nothing to chart or rank.
    st.info(f"No cost data available for {selected_month_name} {year}.")
else:
    st.subheader("Total Cost by Service")
    st.bar_chart(service_costs)

    # With fewer than five services a fixed minimum/default of 5 would exceed
    # max_value, which number_input rejects, so cap both at the service count.
    available = int(service_costs.shape[0])
    default_top = min(5, available)
    top_n = st.number_input(
        "Select Number of Top Services to Display",
        min_value=default_top,
        max_value=available,
        value=default_top,
    )

    st.subheader(f"Top {top_n} Services Contributing to Cost")
    top_services = service_costs.head(top_n)
    st.write(top_services)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_services.index, top_services.values, color='orange')
    ax.set_xlabel('Cost ($)')
    ax.set_title(f'Top {top_n} Services Contributing to Cost', fontsize=14, fontweight='bold')
    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate open figures.
    plt.close(fig)
410
+
411
+ # Made by Sairam N
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ statsmodels
5
+ matplotlib
6
+ openpyxl
7
+ plotly