Forecasting Sales Revenue¶
Machine Learning: Random Forest Regressor, Gradient Boosting Regressor¶
+ RandomizedSearchCV + HuggingFace + joblib + Docker + Flask + Streamlit¶
Keys: #EDA #HuggingFace #RandomForestRegressor #GradientBoostingRegressor #Flask #Streamlit #JobLib #jsonify #Docker
Josue Quiros Batista¶
- Overview of the Dataset
- Exploratory Data Analysis (EDA)
- Univariate Analysis
- Bivariate Analysis
- Data Pre-processing
- Feature Engineering
- Context proxies
- Data Preparation for modeling
- Model Evaluation Criterion
- Models: Random Forest Regressor and Gradient Boosting Regressor
- Model Performance Improvement - Hyperparameter Tuning
- Model Performance Comparison, Final Model Selection, and Serialization
- Actionable Insights and Business Recommendations derived from this model
Business Context¶
A sales forecast is a prediction of future sales revenue based on historical data, industry trends, and the status of the current sales pipeline. Businesses use the sales forecast to estimate weekly, monthly, quarterly, and annual sales totals. A company needs to make an accurate sales forecast as it adds value across an organization and helps the different verticals to chalk out their future course of action.
Forecasting helps an organization plan its sales operations by region and provides valuable insights to the supply chain team regarding the procurement of goods and materials. An accurate sales forecast process has many benefits which include improved decision-making about the future and reduction of sales pipeline and forecast risks. Moreover, it helps to reduce the time spent in planning territory coverage and establish benchmarks that can be used to assess trends in the future.
Objective¶
SuperKart is a retail chain operating supermarkets and food marts across various tier cities, offering a wide range of products. To optimize its inventory management and make informed decisions around regional sales strategies, SuperKart wants to accurately forecast the sales revenue of its outlets for the upcoming quarter.
To operationalize these insights at scale, the company has partnered with a data science firm—not just to build a predictive model based on historical sales data, but to develop and deploy a robust forecasting solution that can be integrated into SuperKart’s decision-making systems and used across its network of stores.
Actionable Insights and Business Recommendations derived from this model¶
Insights from the analysis conducted¶
Interpretation of metrics
- RMSE (Root Mean Squared Error): on average, predictions deviate by 621 USD from the actual sales.
- MAE (Mean Absolute Error): on average, the absolute difference between predicted and actual sales is 489.71 USD.
- R-squared: about 66.5% of the variance in sales is explained by the model.
- MAPE (Mean Absolute Percentage Error): on average, the predictions are off by 16.7% of the actual sales value.
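The four metrics quoted above can be computed with scikit-learn; a minimal sketch, where `y_true` and `y_pred` are placeholders for the test-set actuals and the model's predictions (the notebook's own variable names may differ):

```python
import numpy as np
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score)

def report_metrics(y_true, y_pred):
    """Compute and print RMSE, MAE, R-squared, and MAPE for a regression model."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    print(f"RMSE: {rmse:.2f} USD | MAE: {mae:.2f} USD | R2: {r2:.3f} | MAPE: {mape:.1f}%")
    return rmse, mae, r2, mape
```

It would be called as, for example, `report_metrics(y_test, model.predict(X_test))` at evaluation time.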
Evaluation
- Product_Id: FD1961
- Weight: 9.45
- Sugar content: Low Sugar
- Allocated area: 0.047
- Type: Snack foods
- MRP: 95.95
- Sold in a Food Mart
- Product_Store_Sales_Total = 1684.82 USD
Based on the trained model:
The actual sales of product FD1961 in OUT002 were 1,684.82 USD
The forecasted sales from the selected model were 1,736 USD.
Absolute Error: |1736 - 1684.82| = 51.18 USD
Percentage Error: (51.18 / 1684.82) × 100 ≈ 3.04%
In this case, the error was much lower than average, indicating an excellent performance.
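The error figures for this example can be verified directly:

```python
actual, forecast = 1684.82, 1736.0

# Absolute and percentage error for product FD1961 in OUT002.
abs_error = abs(forecast - actual)    # 51.18 USD
pct_error = abs_error / actual * 100  # ~3.04 %
print(f"Absolute error: {abs_error:.2f} USD | Percentage error: {pct_error:.2f}%")
```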
- With a MAPE of 16.7%, our model generally predicts sales within ±17% of actual values. The R-squared of 66.5% shows it successfully captures many of the trends in the data, though improvements are possible through more detailed analysis, evaluating alternative models, or further fine-tuning hyperparameters. Nevertheless, this level of accuracy is generally sufficient for inventory, purchasing, and marketing decisions for most products.
Business recommendations¶
Fruits and vegetables represent the highest-selling product category, accounting for 14.3% of all products sold and generating the largest revenue at 4,300,833.27 USD. However, due to their seasonal nature, some produce items may not always be consistently available.
It would be beneficial to strengthen the acquisition and sales efforts for the second and third most significant product groups: Snack Foods, which generated 3,988,996.95 USD, and Dairy, with 2,811,918.04 USD. This strategy would reduce reliance on perishable products with unpredictable availability. Additionally, "Snack Foods" should not exclusively imply unhealthy options; rather, healthy alternatives can be offered to health-conscious customers, who represent the identified target demographic for the stores.
The least sold product category is "Seafood," with only 76 units. It is advisable to assess whether to continue offering these products. Due to their nature, they require special in-store placement and specific maintenance, such as refrigeration, which may incur costs that do not justify the return on investment. Furthermore, these products are subject to a shorter expiration period.
Over 70% of stores are located in cities where the standard of living is middle class (Tier 2). Both areas with a higher standard of living (Tier 1) and those with lower purchasing power (Tier 3) are represented in smaller, yet very similar, proportions at 15.4% and 13.1%, respectively.
Marketing efforts and product selections should match the local purchasing power in each zone. This strategy aims to boost sales of products genuinely appealing to the local demographic. It is recommended that Key Performance Indicators (KPIs) specific to each Store Type be developed to effectively measure these efforts.
This approach could also benefit from utilizing "Context Proxies": derived variables that indirectly represent a quantity that is not directly observed. For instance, calculating the average sales per store could indirectly reflect each location's capacity to generate profit, encompassing both the store itself and its surroundings (e.g., the socio-economic status of the surrounding area).
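A context proxy of this kind can be built with a groupby-transform; a minimal sketch on synthetic rows (the column names follow the dataset, the values are illustrative):

```python
import pandas as pd

df = pd.DataFrame({
    "Store_Id": ["OUT001", "OUT001", "OUT002"],
    "Product_Store_Sales_Total": [4000.0, 3000.0, 2000.0],
})

# Average sales per store, broadcast back to every row: a proxy for each
# location's capacity to generate revenue.
df["Store_Avg_Sales"] = df.groupby("Store_Id")["Product_Store_Sales_Total"].transform("mean")
print(df)
```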
An interesting finding is that despite the majority of products sold being low in sugar, all sugar content categories generate very similar revenues. In other words, the fact that over 50% of products are low in sugar does not translate to higher sales compared to products with a regular sugar level or sugar-free products, at least when interpreting the median values.
This could mean that products with a regular sugar level might sell well, perhaps even better than low-sugar options. This is because customers actively seek these products, even if they are not the predominant items offered in stores.
Finally, it would be beneficial to begin recording data on the seasonal sales peaks for each product category. The aim is to forecast sales more accurately based on this information. Additionally, a marketing strategy that includes discounts and promotions for these products at specific times of the year could be developed.
Libraries used¶
- Numpy
- Pandas
- Scikit-learn
- Matplotlib
- Seaborn
- joblib
- Hugging Face
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import squarify

_data = r"/content/SuperKart.csv"
#_data = r"G:\Mi unidad\Documentos\Data Science\"
_data_original = pd.read_csv(_data)
_data_copy = _data_original.copy()
Data Overview¶
_data_copy.head()
| Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FD6114 | 12.66 | Low Sugar | 0.027 | Frozen Foods | 117.08 | OUT004 | 2009 | Medium | Tier 2 | Supermarket Type2 | 2842.40 |
| 1 | FD7839 | 16.54 | Low Sugar | 0.144 | Dairy | 171.43 | OUT003 | 1999 | Medium | Tier 1 | Departmental Store | 4830.02 |
| 2 | FD5075 | 14.28 | Regular | 0.031 | Canned | 162.08 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4130.16 |
| 3 | FD8233 | 12.10 | Low Sugar | 0.112 | Baking Goods | 186.31 | OUT001 | 1987 | High | Tier 2 | Supermarket Type1 | 4132.18 |
| 4 | NC1180 | 9.57 | No Sugar | 0.010 | Health and Hygiene | 123.67 | OUT002 | 1998 | Small | Tier 3 | Food Mart | 2279.36 |
Shape¶
print(_data_copy.shape)
print("There are", _data_copy.shape[0], 'rows and', _data_copy.shape[1], "columns.")
(8763, 12)
There are 8763 rows and 12 columns.
Data types¶
_data_copy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Product_Id                 8763 non-null   object
 1   Product_Weight             8763 non-null   float64
 2   Product_Sugar_Content      8763 non-null   object
 3   Product_Allocated_Area     8763 non-null   float64
 4   Product_Type               8763 non-null   object
 5   Product_MRP                8763 non-null   float64
 6   Store_Id                   8763 non-null   object
 7   Store_Establishment_Year   8763 non-null   int64
 8   Store_Size                 8763 non-null   object
 9   Store_Location_City_Type   8763 non-null   object
 10  Store_Type                 8763 non-null   object
 11  Product_Store_Sales_Total  8763 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 821.7+ KB
Statistical summary¶
_data_copy.describe(include='all')
| Product_Id | Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Id | Store_Establishment_Year | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763.000000 | 8763 | 8763 | 8763 | 8763.000000 |
| unique | 8763 | NaN | 4 | NaN | 16 | NaN | 4 | NaN | 3 | 3 | 4 | NaN |
| top | FD306 | NaN | Low Sugar | NaN | Fruits and Vegetables | NaN | OUT004 | NaN | Medium | Tier 2 | Supermarket Type2 | NaN |
| freq | 1 | NaN | 4885 | NaN | 1249 | NaN | 4676 | NaN | 6025 | 6262 | 4676 | NaN |
| mean | NaN | 12.653792 | NaN | 0.068786 | NaN | 147.032539 | NaN | 2002.032751 | NaN | NaN | NaN | 3464.003640 |
| std | NaN | 2.217320 | NaN | 0.048204 | NaN | 30.694110 | NaN | 8.388381 | NaN | NaN | NaN | 1065.630494 |
| min | NaN | 4.000000 | NaN | 0.004000 | NaN | 31.000000 | NaN | 1987.000000 | NaN | NaN | NaN | 33.000000 |
| 25% | NaN | 11.150000 | NaN | 0.031000 | NaN | 126.160000 | NaN | 1998.000000 | NaN | NaN | NaN | 2761.715000 |
| 50% | NaN | 12.660000 | NaN | 0.056000 | NaN | 146.740000 | NaN | 2009.000000 | NaN | NaN | NaN | 3452.340000 |
| 75% | NaN | 14.180000 | NaN | 0.096000 | NaN | 167.585000 | NaN | 2009.000000 | NaN | NaN | NaN | 4145.165000 |
| max | NaN | 22.000000 | NaN | 0.298000 | NaN | 266.000000 | NaN | 2009.000000 | NaN | NaN | NaN | 8000.000000 |
Observations¶
- The heaviest item weighs 22 lb, while the lightest item weighs 4 lb. The median weight is 12.66 lb.
- The majority of products exhibit a low sugar content, specifically 4,885 out of 8,763 items. There are four distinct categories used to classify sugar content.
- The highest ratio of the allocated display area of each product to the total display area of all the products in a store is 0.29. The lowest is 0.004.
- There are 16 product categories in total. "Fruits and vegetables" is the most frequent category, with a count of 1,249.
- The most expensive product, based on its maximum retail price, is 266 USD. Additionally, 75% of all products do not exceed 167.5 USD. The least expensive product is priced at 31 USD.
- The store with the highest number of registered products is identified by ID OUT004, with 4,676 products.
- The oldest store was established in 1987, while the most recently established store dates back to 2009.
- The majority of stores are categorized as medium-sized. There are three size categories in total.
- Most stores are situated in Tier 2, accounting for 6,262 out of 8,763 stores.
- "Supermarket Type2" is the most frequent store type, representing 4,676 stores.
- The highest total revenue generated by the sale of a particular product in a particular store is 8,000 USD. The median revenue is 3,452.34 USD. The product with the lowest sales at a specific location generated only 33 USD in revenue.
Let's check the count of each unique category in each of the categorical variables
Convert object variables to categorical
_data_copy["Product_Sugar_Content"] = _data_copy["Product_Sugar_Content"].astype("category")
_data_copy["Product_Type"] = _data_copy["Product_Type"].astype("category")
_data_copy["Store_Size"] = _data_copy["Store_Size"].astype("category")
_data_copy["Store_Location_City_Type"] = _data_copy["Store_Location_City_Type"].astype("category")
_data_copy["Store_Type"] = _data_copy["Store_Type"].astype("category")
Observations:¶
Converting the object columns to the category dtype reduces memory usage, speeds up operations, and improves compatibility with modeling libraries.
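The memory saving from the category conversion can be checked with `memory_usage`; a small illustration on synthetic, repetitive labels like those in `Product_Sugar_Content`:

```python
import pandas as pd

# Repetitive string labels stored as plain object dtype.
s = pd.Series(["Low Sugar", "Regular", "No Sugar"] * 2000)
before = s.memory_usage(deep=True)

# The category dtype stores each label once plus small integer codes per row.
after = s.astype("category").memory_usage(deep=True)
print(f"object: {before} bytes -> category: {after} bytes")
```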
Check the count of instances on each category
_cat = _data_copy.select_dtypes(include=['category']).columns.tolist()
for column in _cat:
print(_data_copy[column].value_counts())
print("-" * 25)
Product_Sugar_Content
Low Sugar    4885
Regular      2251
No Sugar     1519
reg           108
Name: count, dtype: int64
-------------------------
Product_Type
Fruits and Vegetables    1249
Snack Foods              1149
Frozen Foods              811
Dairy                     796
Household                 740
Baking Goods              716
Canned                    677
Health and Hygiene        628
Meat                      618
Soft Drinks               519
Breads                    200
Hard Drinks               186
Others                    151
Starchy Foods             141
Breakfast                 106
Seafood                    76
Name: count, dtype: int64
-------------------------
Store_Size
Medium    6025
High      1586
Small     1152
Name: count, dtype: int64
-------------------------
Store_Location_City_Type
Tier 2    6262
Tier 1    1349
Tier 3    1152
Name: count, dtype: int64
-------------------------
Store_Type
Supermarket Type2     4676
Supermarket Type1     1586
Departmental Store    1349
Food Mart             1152
Name: count, dtype: int64
-------------------------
Observations¶
- Among the sugar content categories, 4,885 products are observed to have a low sugar level. Followed by products with a regular sugar level, totaling 2,251 items. 1,519 products are entirely sugar free. It is important to note that the "reg" category should be incorporated into the "regular" sugar content classification.
- The "Fruits and Vegetables" category holds the majority with 1,249 products. The next most frequent categories are "Snack Foods" with 1,149 products and "Frozen Foods" with 811. The least common category is "Seafood," containing only 76 products.
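The reclassification of "reg" noted above can be done by replacing the label before converting to the category dtype; a minimal sketch on a small sample:

```python
import pandas as pd

s = pd.Series(["Low Sugar", "reg", "Regular", "No Sugar"])

# Fold the stray "reg" label into "Regular", then convert to category dtype.
s = s.replace("reg", "Regular").astype("category")
print(s.value_counts())
```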
for _variable in _cat:
print(_data_copy[_variable].unique())
['Low Sugar', 'Regular', 'No Sugar', 'reg']
Categories (4, object): ['Low Sugar', 'No Sugar', 'Regular', 'reg']
['Frozen Foods', 'Dairy', 'Canned', 'Baking Goods', 'Health and Hygiene', ..., 'Soft Drinks', 'Breakfast', 'Others', 'Starchy Foods', 'Seafood']
Length: 16
Categories (16, object): ['Baking Goods', 'Breads', 'Breakfast', 'Canned', ..., 'Seafood', 'Snack Foods',
'Soft Drinks', 'Starchy Foods']
['Medium', 'High', 'Small']
Categories (3, object): ['High', 'Medium', 'Small']
['Tier 2', 'Tier 1', 'Tier 3']
Categories (3, object): ['Tier 1', 'Tier 2', 'Tier 3']
['Supermarket Type2', 'Departmental Store', 'Supermarket Type1', 'Food Mart']
Categories (4, object): ['Departmental Store', 'Food Mart', 'Supermarket Type1', 'Supermarket Type2']
Numerical features
_num_col = _data_copy.select_dtypes(include=np.number).columns.tolist()
for _variable in _num_col:
print(_variable, ": ", len(_data_copy[_variable].unique()))
Product_Weight :  1113
Product_Allocated_Area :  228
Product_MRP :  6100
Store_Establishment_Year :  4
Product_Store_Sales_Total :  8668
Observations¶
- 1,113 distinct weights have been observed.
- There are 6,100 different prices.
Duplicate value check
print("There are",_data_copy.duplicated().sum(), "duplicated rows")
There are 0 duplicated rows
Missing value treatment (if needed)
_null_values = _data_copy.isnull().sum()
print(_null_values[_null_values > 0])
Series([], dtype: int64)
Observations¶
There are no null or missing values. Imputation is not necessary.
Exploratory Data Analysis (EDA)¶
Univariate Analysis¶
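The univariate plots below rely on two helper functions, `labeled_barplot` and `histogram_boxplot`, which are presumably defined earlier in the full notebook. A minimal sketch of what `labeled_barplot` could look like (an assumption, not the notebook's actual implementation):

```python
import matplotlib.pyplot as plt
import seaborn as sns

def labeled_barplot(data, feature):
    """Bar plot of category counts with the count annotated above each bar."""
    counts = data[feature].value_counts()
    ax = sns.barplot(x=counts.index.astype(str), y=counts.values)
    for i, v in enumerate(counts.values):
        ax.text(i, v, str(v), ha="center", va="bottom")
    plt.xticks(rotation=30)
    plt.show()
```

`histogram_boxplot` would similarly stack a histogram and a box plot of the same numeric column in one figure.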
plt.figure(figsize=(22, 12))
_columns = ['Product_Weight','Product_Sugar_Content','Product_MRP',
'Store_Id','Store_Size','Store_Location_City_Type','Store_Type','Product_Store_Sales_Total']
for _cont, _variable in enumerate(_columns):
plt.subplot(4, 4, _cont+1)
sns.histplot(data=_data_copy, x=_variable)
plt.xticks(rotation=30)
plt.tight_layout();
Observations¶
- The weight of each product, the maximum retail price of each product and the total revenue generated by the sale of a particular product in a particular store exhibit a normal distribution.
- This dataset represents products sold in only four stores, which are identified by their respective "ID" feature.
- Additional findings follow.
1. Product_Weight - weight of each product
sns.histplot(data=_data_copy, x='Product_Weight');
Observations¶
- This variable exhibits a normal distribution, readily identifiable by its symmetrical bell shape. Data points are concentrated around a central value, with their frequency decreasing as they deviate from this center. The highest concentration of product weights is observed between 12 lb and 13 lb, representing the most frequent weight range.
2. Product_Sugar_Content - sugar content of each product
labeled_barplot(_data_copy, "Product_Sugar_Content")
_percentage_types = _data_copy['Product_Sugar_Content'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Product_Sugar_Content | |
| Low Sugar | 55.7 |
| No Sugar | 17.3 |
| Regular | 25.7 |
| reg | 1.2 |
Observations¶
- 4,885 products exhibit a low sugar level, constituting 55.7% of all products. Following this, products with a regular sugar level account for 25.7%, totaling 2,251 items. Additionally, 1,519 products are entirely sugar-free, representing 17.3%.
- As previously indicated, the "reg" category within the sugar level feature should be reclassified under the "regular" category.
- This trend suggests a strategic focus among stores to offer products that prioritize customer well-being.
3. Product_Allocated_Area
Ratio of the allocated display area of each product to the total display area of all the products in a store.
histogram_boxplot(_data_copy, "Product_Allocated_Area")
Observations¶
- This feature exhibits a right-skewed distribution, indicating that the highest concentration of products occupies a small allocated display area, ranging between 0 and 0.06. As the allocated area increases, the number of products decreases.
- A considerable number of products thus occupy a very small proportion of the total display area.
- Outlier values do not appear to be concerning; they are interpreted as naturally occurring due to the aforementioned skewness.
4. Product_Type
labeled_barplot(_data_copy, "Product_Type")
_percentage_types = _data_copy['Product_Type'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Product_Type | |
| Baking Goods | 8.2 |
| Breads | 2.3 |
| Breakfast | 1.2 |
| Canned | 7.7 |
| Dairy | 9.1 |
| Frozen Foods | 9.3 |
| Fruits and Vegetables | 14.3 |
| Hard Drinks | 2.1 |
| Health and Hygiene | 7.2 |
| Household | 8.4 |
| Meat | 7.1 |
| Others | 1.7 |
| Seafood | 0.9 |
| Snack Foods | 13.1 |
| Soft Drinks | 5.9 |
| Starchy Foods | 1.6 |
Observations¶
- As previously observed, fruits and vegetables represent the dominant product category, accounting for 14.3% of all products sold (1,249 items). This reinforces the store chain's commitment to offering products that promote customer well-being. However, snacks constitute the second-highest selling category, representing 13.1% of products, and it is currently unknown whether these are healthy options.
- Other categories such as "Breads," "Breakfast," "Hard Drinks," "Seafood," and "Starchy Foods" each represent less than 5% of the total products sold.
5. Product_MRP
Maximum retail price of each product
histogram_boxplot(_data_copy, "Product_MRP")
Observations¶
- The variable representing the maximum retail price of each product also exhibits a normal distribution. This is identifiable by its symmetrical bell shape, with data points clustered around a central value and decreasing in frequency as they move away from the center. The strongest concentration of prices is observed around 145 USD - 150 USD, which represents the most frequent price range.
6. Store_Id
labeled_barplot(_data_copy, "Store_Id")
_percentage_types = _data_copy['Store_Id'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Store_Id | |
| OUT004 | 53.4 |
| OUT003 | 15.4 |
| OUT001 | 18.1 |
| OUT002 | 13.1 |
Observations¶
- As previously mentioned, this dataset records products sold in only four retail outlets, identified by IDs OUT001, OUT002, OUT003, and OUT004.
- OUT004 has the highest number of registered products, totaling 4,676 items, which represents 53.4% of the dataset. Conversely, OUT002 has the fewest registered products, accounting for 13.1% of the products (1,152 items).
7. Store_Establishment_Year
Year in which the store was established
labeled_barplot(_data_copy, "Store_Establishment_Year")
_percentage_types = _data_copy['Store_Establishment_Year'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Store_Establishment_Year | |
| 2009 | 53.4 |
| 1999 | 15.4 |
| 1987 | 18.1 |
| 1998 | 13.1 |
Observations¶
- More than 50% of the products in the dataset are sold in the store established in 2009, accounting for 4,676 products to be precise.
8. Store_Size
Size of the store depending on sq. feet
labeled_barplot(_data_copy, "Store_Size")
_percentage_types = _data_copy['Store_Size'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Store_Size | |
| High | 18.1 |
| Medium | 68.8 |
| Small | 13.1 |
Observations¶
- Nearly 70% of the stores are medium-sized, while 18.1% are classified as large stores.
9. Store_Location_City_Type
Type of city in which the store is located
labeled_barplot(_data_copy, "Store_Location_City_Type")
_percentage_types = _data_copy['Store_Location_City_Type'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Store_Location_City_Type | |
| Tier 1 | 15.4 |
| Tier 2 | 71.5 |
| Tier 3 | 13.1 |
Observations¶
- Over 70% of the stores are located in cities where the standard of living is middle class (Tier 2). Both areas with a higher standard of living (Tier 1) and those with lower purchasing power (Tier 3) are represented in smaller, yet very similar, proportions at 15.4% and 13.1%, respectively.
10. Store_Type
labeled_barplot(_data_copy, "Store_Type")
_percentage_types = _data_copy['Store_Type'].value_counts(normalize=True, sort=False)*100
_percentage_types.round(1)
| proportion | |
|---|---|
| Store_Type | |
| Departmental Store | 15.4 |
| Food Mart | 13.1 |
| Supermarket Type1 | 18.1 |
| Supermarket Type2 | 53.4 |
Observations¶
- More than half of the stores are classified as "Supermarket Type2." The other categories, "Supermarket Type1," "Departmental Store," and "Food Mart," are represented in smaller proportions: 18.1%, 15.4%, and 13.1%, respectively.
Bivariate Analysis¶
Correlation
import warnings
warnings.filterwarnings('ignore')
plt.figure(figsize=(6,4))
sns.heatmap(_data_copy[_num_col].corr(),annot=True,cmap='Spectral',vmin=-1,vmax=1)
plt.show()
Observations¶
- A positive correlation is observed between the maximum retail price and the weight of the product. This suggests that a heavier product tends to have a higher price.
- Product weight also correlates with total sales, which is logical given the established relationship between product price and weight.
- Similarly, and as anticipated, the product price exhibits a correlational effect on total sales.
1. Product weight versus total product sales
plt.figure(figsize=(8, 6))
sns.kdeplot(x='Product_Weight', y='Product_Store_Sales_Total', data=_data_copy, fill=True, cmap='viridis', cbar=True)
plt.title('Density plot - Product weight & Total product sales')
plt.xlabel('Product_Weight')
plt.ylabel('Product_Store_Sales_Total')
plt.grid(True)
plt.show()
Observations¶
- As noted previously, the graph also displays a positive correlation. This density plot not only indicates a relationship between the variables but also highlights where this relationship is strongest. Lighter colors at the center of the plot signify a higher density of data points, illustrating where the relationship between product weight and sales volume is most concentrated. For instance, a considerable concentration of products is observed within an approximate weight range of 12 lb to 14 lb, generating sales between 3,500 USD and 4,000 USD. These represent the most frequent characteristics within the dataset. This observation reinforces the statistical summary, which indicated that 50% of products weigh 12.66 lb and 75% do not exceed 14.18 lb. Similarly, the summary showed that the median product generates up to 3,452.34 USD, and 75% do not exceed 4,145.16 USD.
- Conversely, as the colors darken, the data density decreases, meaning there are fewer products with the weight and sales characteristics represented by each quadrant of the graph's grid.
2. Sugar content versus total product sales
sns.boxplot(x="Product_Sugar_Content", y="Product_Store_Sales_Total", data=_data_copy);
Observations¶
- An interesting finding is that despite the majority of products sold being low in sugar, all sugar content categories generate very similar revenues. In other words, the fact that over 50% of products are low in sugar does not translate to higher sales compared to products with a regular sugar level or sugar-free products, at least when interpreting the median values.
3. Allocated area versus total product sales
plt.figure(figsize=(8, 6))
sns.kdeplot(x='Product_Allocated_Area', y='Product_Store_Sales_Total', data=_data_copy, fill=True, cbar=True)
plt.title('Density plot - Allocated area & Total product sales')
plt.xlabel('Product_Allocated_Area')
plt.ylabel('Product_Store_Sales_Total')
plt.grid(True)
plt.show()
Observations¶
- There is no clear linear correlation observed between the product's allocated display area and its sales. The data points are dispersed without a distinct trend, indicating that a larger allocated area does not guarantee higher sales for a specific product, nor does a smaller area imply lower sales.
- Additionally, the majority of points are concentrated on the left side of the graph, specifically between 0.00 and approximately 0.18. This observation is consistent with the results from the histogram in the preceding section's univariable analysis.
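The visual impression of "no linear relationship" can be quantified with the Pearson coefficient. A minimal sketch on synthetic stand-in data (independent draws whose ranges mimic the dataset's summary statistics), since in the notebook one would simply call `.corr()` on the two real columns:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Synthetic stand-ins: allocated area drawn independently of sales.
demo = pd.DataFrame({
    "Product_Allocated_Area": rng.uniform(0.004, 0.298, 500),
    "Product_Store_Sales_Total": rng.normal(3464, 1065, 500),
})
r = demo["Product_Allocated_Area"].corr(demo["Product_Store_Sales_Total"])
print(f"Pearson r = {r:.3f}")  # a value near 0 supports the "no linear trend" reading
```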
4. Product type versus total product sales
# Group by product type and sum up the total sales
_revenue_x_type = _data_copy.groupby(["Product_Type"], as_index=False)["Product_Store_Sales_Total"].sum()
colors = [plt.cm.Spectral(i / float(len(_revenue_x_type['Product_Type']))) for i in range(len(_revenue_x_type['Product_Type']))]
plt.figure(figsize=[12, 8])
squarify.plot(sizes=_revenue_x_type['Product_Store_Sales_Total'],
label=_revenue_x_type['Product_Type'],
color=colors,
alpha=.8,
value=_revenue_x_type['Product_Store_Sales_Total'])
plt.title('Total revenue per product type')
plt.axis('off')
plt.show()
_revenue_x_type.sort_values(by='Product_Store_Sales_Total', ascending=False)
| Product_Type | Product_Store_Sales_Total | |
|---|---|---|
| 6 | Fruits and Vegetables | 4300833.27 |
| 13 | Snack Foods | 3988996.95 |
| 4 | Dairy | 2811918.04 |
| 5 | Frozen Foods | 2809980.83 |
| 9 | Household | 2564740.17 |
| 0 | Baking Goods | 2452986.00 |
| 3 | Canned | 2300082.71 |
| 8 | Health and Hygiene | 2163707.21 |
| 10 | Meat | 2129211.94 |
| 14 | Soft Drinks | 1797044.72 |
| 1 | Breads | 714942.24 |
| 7 | Hard Drinks | 625814.62 |
| 11 | Others | 541496.30 |
| 15 | Starchy Foods | 518774.45 |
| 2 | Breakfast | 362130.41 |
| 12 | Seafood | 272404.04 |
Observations¶
- The product type generating the highest sales is Fruits and Vegetables, with a total of 4,300,833.27 USD. This is followed by Snack Foods, which generated 3,988,996.95 USD, and Dairy, with 2,811,918.04 USD.
5. Maximum retail price (MRP) of each product versus total product sales
plt.figure(figsize=(8, 6))
sns.kdeplot(x='Product_MRP', y='Product_Store_Sales_Total', data=_data_copy, fill=True, cmap='viridis', cbar=True)
plt.title('Density plot - Product MRP & Total product sales')
plt.xlabel('Product_MRP')
plt.ylabel('Product_Store_Sales_Total')
plt.grid(True)
plt.show()
sns.scatterplot(data=_data_copy, x='Product_MRP', y='Product_Store_Sales_Total')
plt.show()
Observations¶
- The relationship between these variables exhibits a positive correlation. A significant concentration of products is observed within an approximate price range of 140 USD - 160 USD, generating sales between 3,000 USD and 4,000 USD. These figures represent the most frequent characteristics within the dataset.
- This observation reinforces the statistical summary, which indicated that 50% of products are priced at 146.74 USD, and 75% do not exceed 167.58 USD. Similarly, the summary showed that the median product generates up to 3,452.34 USD, with 75% not exceeding 4,145.16 USD.
- Conversely, darker colors indicate a decrease in data density, signifying fewer products with the maximum price and sales characteristics represented in those areas of the graph.
6. Store ID versus total product sales
# Group by Store_Id and sum up the total sales
_revenue_x_id = _data_copy.groupby(["Store_Id"], as_index=False)["Product_Store_Sales_Total"].sum()
colors = [plt.cm.Spectral(i / float(len(_revenue_x_id['Store_Id']))) for i in range(len(_revenue_x_id['Store_Id']))]
plt.figure(figsize=[8, 4])
squarify.plot(sizes=_revenue_x_id['Product_Store_Sales_Total'],
label=_revenue_x_id['Store_Id'],
color=colors,
alpha=.8,
value=_revenue_x_id['Product_Store_Sales_Total'])
plt.title('Total revenue per store ID')
plt.axis('off')
plt.show()
_revenue_x_id.sort_values(by='Product_Store_Sales_Total', ascending=False)
| Store_Id | Product_Store_Sales_Total | |
|---|---|---|
| 3 | OUT004 | 15427583.43 |
| 2 | OUT003 | 6673457.57 |
| 0 | OUT001 | 6223113.18 |
| 1 | OUT002 | 2030909.72 |
Observations¶
- The store with the highest total sales is OUT004, achieving 15,427,583.43 USD.
- It is confirmed that the store with the lowest total sales is OUT002, with only 2,030,909.72 USD.
7. Store Establishment Year versus total product sales
# Group by Store_Establishment_Year and sum up the total sales
_revenue_x_year = _data_copy.groupby(["Store_Establishment_Year"], as_index=False)["Product_Store_Sales_Total"].sum()
colors = [plt.cm.Spectral(i / float(len(_revenue_x_year['Store_Establishment_Year']))) for i in range(len(_revenue_x_year['Store_Establishment_Year']))]
plt.figure(figsize=[8, 4])
squarify.plot(sizes=_revenue_x_year['Product_Store_Sales_Total'],
label=_revenue_x_year['Store_Establishment_Year'],
color=colors,
alpha=.8,
value=_revenue_x_year['Product_Store_Sales_Total'])
plt.title('Total revenue per store year of establishment')
plt.axis('off')
plt.show()
_revenue_x_year.sort_values(by='Product_Store_Sales_Total', ascending=False)
| Store_Establishment_Year | Product_Store_Sales_Total | |
|---|---|---|
| 3 | 2009 | 15427583.43 |
| 2 | 1999 | 6673457.57 |
| 0 | 1987 | 6223113.18 |
| 1 | 1998 | 2030909.72 |
Observations¶
- The store established in 2009 generates the highest total sales, amounting to 15,427,583.43 USD.
- The store inaugurated in 1998, while not the oldest, has the lowest total sales, with 2,030,909.72 USD.
8. Store size versus total product sales
sns.boxplot(x="Store_Size", y="Product_Store_Sales_Total", data=_data_copy);
summary_stats = _data_copy.groupby('Store_Size')['Product_Store_Sales_Total'].agg(
mean='mean',
Q1=lambda x: x.quantile(0.25),
median='median',
Q3=lambda x: x.quantile(0.75),
min='min',
max='max'
).reset_index()
print(summary_stats)
  Store_Size         mean        Q1    median        Q3      min      max
0       High  3923.778802  3285.5100  4139.645  4639.4000  2300.56  4997.63
1     Medium  3668.222573  3060.3800  3511.100  3969.8100  1561.06  8000.00
2      Small  1762.942465  1495.4725  1889.495  2133.6225    33.00  2299.63
Observations¶
- As anticipated, store size significantly influences product sales. Larger stores tend to generate higher median sales than medium-sized and small-sized stores.
- Small stores consistently exhibit the lowest sales performance, likely due to factors such as a more limited inventory and reduced physical space.
9. Store location versus total product sales
# Group by Store_Location_City_Type and sum up the total sales
_revenue_x_location = _data_copy.groupby(["Store_Location_City_Type"], as_index=False)["Product_Store_Sales_Total"].sum()
colors = [plt.cm.Spectral(i / float(len(_revenue_x_location['Store_Location_City_Type']))) for i in range(len(_revenue_x_location['Store_Location_City_Type']))]
plt.figure(figsize=[8, 4])
squarify.plot(sizes=_revenue_x_location['Product_Store_Sales_Total'],
label=_revenue_x_location['Store_Location_City_Type'],
color=colors,
alpha=.8,
value=_revenue_x_location['Product_Store_Sales_Total'])
plt.title('Total revenue per store location')
plt.axis('off')
plt.show()
_revenue_x_location.sort_values(by='Product_Store_Sales_Total', ascending=False)
| Store_Location_City_Type | Product_Store_Sales_Total | |
|---|---|---|
| 1 | Tier 2 | 21650696.61 |
| 0 | Tier 1 | 6673457.57 |
| 2 | Tier 3 | 2030909.72 |
Observations¶
- Tier 2 represents the location with the highest total sales per product, significantly surpassing both Tier 1 and Tier 3 locations.
10. Store type versus total product sales
# Group by Store_Type and sum up the total sales
_revenue_store_type = _data_copy.groupby(["Store_Type"], as_index=False)["Product_Store_Sales_Total"].sum()
colors = [plt.cm.Spectral(i / float(len(_revenue_store_type['Store_Type']))) for i in range(len(_revenue_store_type['Store_Type']))]
plt.figure(figsize=[8, 4])
squarify.plot(sizes=_revenue_store_type['Product_Store_Sales_Total'],
label=_revenue_store_type['Store_Type'],
color=colors,
alpha=.8,
value=_revenue_store_type['Product_Store_Sales_Total'])
plt.title('Total revenue per store type')
plt.axis('off')
plt.show()
_revenue_store_type.sort_values(by='Product_Store_Sales_Total', ascending=False)
| Store_Type | Product_Store_Sales_Total | |
|---|---|---|
| 3 | Supermarket Type2 | 15427583.43 |
| 0 | Departmental Store | 6673457.57 |
| 2 | Supermarket Type1 | 6223113.18 |
| 1 | Food Mart | 2030909.72 |
Observations¶
- As previously observed, the "Supermarket Type2" store type accounts for the highest product sales, specifically totaling 15,427,583.43 USD.
11. Store ID versus Store establishment year
plt.figure(figsize=(6, 3))
sns.pointplot(x='Store_Id', y='Store_Establishment_Year', data=_data_copy, linestyle='none', errorbar=None, marker='o')
plt.xlabel('Store ID')
plt.ylabel('Establishment Year')
plt.title('Establishment Year - Store')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.tight_layout()
plt.show()
Observations¶
- The oldest store is OUT001, established in 1987. The newest store, inaugurated in 2009, is OUT004.
revenue_by_store_product_type = _data_copy.groupby(['Store_Id', 'Product_Type'])['Product_Store_Sales_Total'].sum().reset_index()
stores_group1 = ['OUT001','OUT002']
stores_group2 = ['OUT003','OUT004']
df_group1 = revenue_by_store_product_type[revenue_by_store_product_type['Store_Id'].isin(stores_group1)]
df_group2 = revenue_by_store_product_type[revenue_by_store_product_type['Store_Id'].isin(stores_group2)]
# First plot
plt.figure(figsize=(16, 8))
ax1 = sns.barplot(
data=df_group1, x='Store_Id', y='Product_Store_Sales_Total',hue='Product_Type',palette='tab20'
)
plt.title('Total product sales per store (OUT001 and OUT002)')
plt.xlabel('Store ID')
plt.ylabel('Product_Store_Sales_Total')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Product type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
for p in ax1.patches:
height = p.get_height()
if height > 50:
ax1.annotate(f'{height:.0f}',
(p.get_x() + p.get_width() / 2., height),
ha='center', va='center',
xytext=(0, 5), textcoords='offset points',
fontsize=7, color='black')
plt.show()
# Second plot
plt.figure(figsize=(16, 8))
ax2 = sns.barplot(
data=df_group2, x='Store_Id', y='Product_Store_Sales_Total',hue='Product_Type', palette='tab20'
)
plt.title('Total product sales per store (OUT003 and OUT004)')
plt.xlabel('Store ID')
plt.ylabel('Product_Store_Sales_Total')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Product type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
for p in ax2.patches:
height = p.get_height()
if height > 50:
ax2.annotate(f'{height:.0f}',
(p.get_x() + p.get_width() / 2., height),
ha='center', va='center',
xytext=(0, 5), textcoords='offset points',
fontsize=7, color='black')
plt.show()
Observations¶
At Store ID OUT001, the highest-selling product categories are "Fruits and Vegetables" and "Snack Foods", generating total sales of 792,993 USD and 806,142 USD respectively. The lowest-selling category is "Breakfast," with total sales of 38,161 USD. The second-lowest selling category is "Seafood," with total sales of 52,937 USD.
Similarly, for Store ID OUT002, "Fruits and Vegetables" and "Snack Foods" are the predominant categories. In this store, the least sold category is "Seafood," with sales totaling 17,663 USD.
For both Store ID OUT003 and OUT004, "Fruits and Vegetables" and "Snack Foods" also consistently emerge as the top-selling categories. Notably, Store OUT004 records the highest sales within these categories, with 2,311,900 USD from "Fruits and Vegetables" and 2,009,027 USD from "Snack Foods." In both OUT003 and OUT004, "Seafood" consistently represents the lowest-selling category.
Data Preprocessing¶
Feature engineering¶
Outlier check
Q1 = _data_copy.select_dtypes(include=["float64", "int64"]).quantile(0.25)
Q3 = _data_copy.select_dtypes(include=["float64", "int64"]).quantile(0.75)
# Inter Quantile Range
IQR = Q3 - Q1
lower = (Q1 - 1.5 * IQR) #Values outside these bounds are outliers
upper = (Q3 + 1.5 * IQR) #Values outside these bounds are outliers
((_data_copy.select_dtypes(include=["float64", "int64"]) < lower)|(_data_copy.select_dtypes(include=["float64", "int64"]) > upper)).sum() / len(_data_copy) * 100
| 0 | |
|---|---|
| Product_Weight | 0.616227 |
| Product_Allocated_Area | 1.186808 |
| Product_MRP | 0.650462 |
| Store_Establishment_Year | 0.000000 |
| Product_Store_Sales_Total | 1.357982 |
Observations¶
- The outliers are not significant, and therefore, no action will be taken.
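No action is taken here, but if the outliers ever needed treatment, a common alternative to dropping rows is to cap values at the same Tukey fences used above. A minimal sketch (hypothetical helper, not part of the notebook):

```python
import pandas as pd

def iqr_cap(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Clip values to the Tukey fences [Q1 - k*IQR, Q3 + k*IQR]."""
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

# The extreme value 100 is pulled back to the upper fence (Q3 + 1.5*IQR = 7.0)
print(iqr_cap(pd.Series([1, 2, 3, 4, 100])).max())  # 7.0
```

Capping preserves the row (and its other features) while limiting the leverage of the extreme value, which matters more for linear models than for the tree ensembles used here.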
Variable: Product_Sugar_Content. Unify "reg" with "Regular"
_data_copy['Product_Sugar_Content'] = _data_copy['Product_Sugar_Content'].replace("reg", "Regular")
_data_copy['Product_Sugar_Content'].unique()
['Low Sugar', 'Regular', 'No Sugar'] Categories (3, object): ['Low Sugar', 'No Sugar', 'Regular']
Combine variables to capture nonlinear or proportional relationships
_data_copy['_price_per_Gram'] = _data_copy['Product_MRP'] / _data_copy['Product_Weight']
_data_copy['_weight_per_area'] = _data_copy['Product_Weight'] / _data_copy['Product_Allocated_Area']
Observations¶
- The objective of creating new variables through their combination is to enable the model to learn non-linear relationships, thereby facilitating the identification of implicit patterns.
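One caveat with ratio features: a zero denominator silently produces `inf`, which most estimators reject. The dataset may contain no zero allocated areas, but a defensive variant could replace zeros with NaN before dividing (the zero below is contrived to illustrate the guard):

```python
import numpy as np
import pandas as pd

# Toy rows; the zero area is contrived to show the guard in action
df = pd.DataFrame({
    "Product_MRP": [117.08, 171.43],
    "Product_Weight": [12.66, 16.54],
    "Product_Allocated_Area": [0.027, 0.0],
})
df["_price_per_Gram"] = df["Product_MRP"] / df["Product_Weight"]
# Replace a zero denominator with NaN so the ratio does not become inf
df["_weight_per_area"] = df["Product_Weight"] / df["Product_Allocated_Area"].replace(0, np.nan)
print(df["_weight_per_area"].isna().tolist())  # [False, True]
```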
Context proxies
_avg_sales = _data_copy.groupby("Store_Id")["Product_Store_Sales_Total"].mean().reset_index()
_avg_sales.rename(columns={"Product_Store_Sales_Total": "_avg_sales_x_store"}, inplace=True)
_data_copy = _data_copy.merge(_avg_sales, on="Store_Id", how="left")
Observations¶
- The objective is to use a derived variable to indirectly represent an associated, unobserved factor. For example, the average sales per store can indirectly reflect each location's capacity to generate revenue, capturing both the store itself and its surroundings (e.g., the socio-economic status of the surrounding area).
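One caution with this proxy: it is derived from the target variable, so computing it on the full dataset before splitting lets information from validation/test rows leak into training. A leakage-safe variant fits the store averages on the training rows only and then merges them into held-out rows (toy data below, not the notebook's frames):

```python
import pandas as pd

# Toy training and validation frames
train = pd.DataFrame({"Store_Id": ["OUT001", "OUT001", "OUT002"],
                      "Product_Store_Sales_Total": [100.0, 300.0, 50.0]})
val = pd.DataFrame({"Store_Id": ["OUT001", "OUT002"]})

# Fit the proxy on training rows only, then apply it to held-out rows
proxy = (train.groupby("Store_Id")["Product_Store_Sales_Total"]
              .mean().rename("_avg_sales_x_store"))
val = val.merge(proxy, on="Store_Id", how="left")
print(val["_avg_sales_x_store"].tolist())  # [200.0, 50.0]
```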
Drop unnecessary column
_data_copy.drop(['Product_Id','Store_Id','Store_Establishment_Year'], axis=1, inplace=True)
_data_copy.head()
| Product_Weight | Product_Sugar_Content | Product_Allocated_Area | Product_Type | Product_MRP | Store_Size | Store_Location_City_Type | Store_Type | Product_Store_Sales_Total | _price_per_Gram | _weight_per_area | _avg_sales_x_store | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12.66 | Low Sugar | 0.027 | Frozen Foods | 117.08 | Medium | Tier 2 | Supermarket Type2 | 2842.40 | 9.248025 | 468.888889 | 3299.312111 |
| 1 | 16.54 | Low Sugar | 0.144 | Dairy | 171.43 | Medium | Tier 1 | Departmental Store | 4830.02 | 10.364571 | 114.861111 | 4946.966323 |
| 2 | 14.28 | Regular | 0.031 | Canned | 162.08 | High | Tier 2 | Supermarket Type1 | 4130.16 | 11.350140 | 460.645161 | 3923.778802 |
| 3 | 12.10 | Low Sugar | 0.112 | Baking Goods | 186.31 | High | Tier 2 | Supermarket Type1 | 4132.18 | 15.397521 | 108.035714 | 3923.778802 |
| 4 | 9.57 | No Sugar | 0.010 | Health and Hygiene | 123.67 | Small | Tier 3 | Food Mart | 2279.36 | 12.922675 | 957.000000 | 1762.942465 |
Preprocessing pipeline for encoding categorical features
_categorical_transformer = OneHotEncoder(handle_unknown='ignore')
_pipeline_preprocessor = ColumnTransformer(
transformers=[
('cat', _categorical_transformer, _cat)
]
)
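Note that `ColumnTransformer` defaults to `remainder='drop'`, which discards every column not listed in a transformer. If the numeric features (weights, prices, the engineered ratios) are meant to reach the model, `remainder='passthrough'` keeps them. A minimal sketch with an assumed one-column `_cat` list:

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

_cat = ["Store_Size"]  # assumed categorical column list for this sketch
pre = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), _cat)],
    remainder="passthrough",  # keep the numeric columns instead of dropping them
)
X = pd.DataFrame({"Store_Size": ["Small", "High"], "Product_MRP": [10.0, 20.0]})
out = pre.fit_transform(X)
print(out.shape)  # (2, 3): two one-hot columns plus the passthrough numeric
```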
Additional observations on feature engineering¶
- Imputation is not necessary as there are no missing values in the dataset.
- The models to be implemented, Random Forest and Gradient Boosting, do not require feature scaling.
- With "Product_MRP" and "Product_Store_Sales_Total" displaying normal (symmetrical) distributions, additional transformations to mitigate bias or extreme outliers are unnecessary.
- Overall, the dataset is clean and well-organized, thus extensive feature engineering tasks are not required.
Data splitting
X = _data_copy.drop(["Product_Store_Sales_Total"], axis=1)
y = _data_copy["Product_Store_Sales_Total"]
# Splitting data into training, validation and test set:
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=1
)
# Split the temporary set into train and validation
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=1
)
print(X_train.shape, X_val.shape, X_test.shape)
(5257, 11) (1753, 11) (1753, 11)
print("Shape of training: ", X_train.shape)
print("Shape of validation: ", X_val.shape)
print("Shape of test: ", X_test.shape)
Shape of training: (5257, 11) Shape of validation: (1753, 11) Shape of test: (1753, 11)
Model Building¶
Metric¶
Problem Type and Objective
The problem type is regression, with the objective of forecasting the Product_Store_Sales_Total variable. The aim is to quantify the difference between the values generated by the model and the actual sales figures.
Key Metric: RMSE: the primary metric of focus will be RMSE (Root Mean Squared Error). This metric is chosen to penalize errors with larger deviations, thereby mitigating errors that could significantly impact the prediction of high-profit products. In other words, not all sales contribute equally to business value.
Because the outliers detected earlier are minimal (under 1.4% per variable), they are unlikely to distort the RMSE here, even though RMSE is in general sensitive to large errors.
RMSE will indicate the average magnitude of difference between the predictions and the actual values.
Complementary Metric: R-Squared. The coefficient of determination will be used to identify the percentage of sales variation explained by the implemented models.
Models: Random Forest Regressor and Gradient Boosting Regressor¶
Tasks¶
Fit: training and validation
- Pipeline: make_pipeline (Preprocessing pipeline for encoding categorical features, model), then fit.
Predict: pipeline, X_train, y_train
Compute metrics: r2 score, adjr2, RMSE, MAE, MAPE.
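The metric helper called throughout (`model_performance_regression`) is not shown in this section; a plausible sketch consistent with the reported columns, under the assumption that it returns a one-row DataFrame, might look like:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             mean_squared_error, r2_score)

def model_performance_regression(model, X, y):
    """Return a one-row DataFrame with RMSE, MAE, R2, adjusted R2 and MAPE."""
    pred = model.predict(X)
    n, k = X.shape
    r2 = r2_score(y, pred)
    return pd.DataFrame({
        "RMSE": [np.sqrt(mean_squared_error(y, pred))],
        "MAE": [mean_absolute_error(y, pred)],
        "R-squared": [r2],
        "Adj. R-squared": [1 - (1 - r2) * (n - 1) / (n - k - 1)],
        "MAPE": [mean_absolute_percentage_error(y, pred)],
    })

# Tiny smoke test: a stand-in model that predicts its single input feature exactly
class _Identity:
    def predict(self, X):
        return X.iloc[:, 0].to_numpy()

perf = model_performance_regression(_Identity(),
                                    pd.DataFrame({"f": [1.0, 2.0, 4.0]}),
                                    pd.Series([1.0, 2.0, 4.0]))
print(perf["RMSE"].iloc[0])  # 0.0 for a perfect fit
```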
_models.append(("Random Forest Regressor - Before tuning", RandomForestRegressor(random_state=1)))
_models.append(("Gradient Boost Regressor - Before tuning", GradientBoostingRegressor(random_state=1)))
Performance - Training
_perf_train
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 599.302247 | 471.734564 | 0.683112 | 0.682447 | 0.166443 | Random Forest Regressor - Before tuning |
| 1 | 602.784108 | 474.329441 | 0.679419 | 0.678747 | 0.167860 | Gradient Boost Regressor - Before tuning |
Performance - Validation
_perf_val
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 589.020035 | 459.391759 | 0.690892 | 0.688939 | 0.188043 | Random Forest Regressor - Before tuning |
| 1 | 576.429627 | 451.981812 | 0.703966 | 0.702095 | 0.186927 | Gradient Boost Regressor - Before tuning |
Observations¶
RMSE (Root Mean Squared Error): This metric quantifies the average magnitude by which predictions deviate from actual values.
- While the difference between the two models isn't substantial, the Gradient Boost model demonstrates a lower RMSE of 576.43 on the validation set, compared to Random Forest's 589.02. Consequently, Gradient Boost exhibits superior predictive accuracy on unseen data.
- There's no concern regarding overfitting.
R-Squared: This metric represents the proportion of variability accounted for by the model.
- The R-Squared value on the validation set is also marginally superior for the Gradient Boost model (0.70 versus 0.69); still, both models do an acceptable job of explaining the data's variability.
Model Performance Improvement - Hyperparameter Tuning¶
Random forest
Hyperparameters to consider¶
n_estimators: [50, 100, 150, 200],
max_depth: [10, 15, 20, None],
min_samples_leaf: [10, 15, 25, 30],
max_features: ['sqrt', 'log2'],
max_samples: [0.3, 0.5, 0.7, 0.9],
criterion: ['squared_error']
RandomizedSearchCV:
- estimator=_pipeline
- Hyperparameters
- jobs = -1
- iterations = 60
- scoring = neg_root_mean_squared_error
- cv = 5
- random state = 1
_tuned_RF = RandomForestRegressor(random_state=1)
_pipeline_RF = make_pipeline(_pipeline_preprocessor, _tuned_RF)
print("Best params:", _rf_randomized_cv.best_params_)
print("Best CV RMSE:", -_rf_randomized_cv.best_score_)
Best params: {'randomforestregressor__n_estimators': 100, 'randomforestregressor__min_samples_leaf': 30, 'randomforestregressor__max_samples': 0.9, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__max_depth': 15, 'randomforestregressor__criterion': 'squared_error'}
Best CV RMSE: 609.5711902921707
best_pipeline_rf = _rf_randomized_cv.best_estimator_
best_pipeline_rf
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Product_Sugar_Content',
                                                   'Product_Type', 'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=15, max_features='sqrt',
                                       max_samples=0.9, min_samples_leaf=30,
                                       random_state=1))])
_predictions_train_rf = model_performance_regression(best_pipeline_rf, X_train, y_train)
_predictions_train_rf["Model"] = 'Random Forest Regressor - After tuning'
_perf_train = pd.concat([_perf_train,_predictions_train_rf], ignore_index=True)
_predictions_train_rf
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 605.067619 | 476.487114 | 0.676986 | 0.676308 | 0.169244 | Random Forest Regressor - After tuning |
_predictions_val_rf = model_performance_regression(best_pipeline_rf, X_val, y_val)
_predictions_val_rf["Model"] = 'Random Forest Regressor - After tuning'
_perf_val = pd.concat([_perf_val,_predictions_val_rf], ignore_index=True)
_predictions_val_rf
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 573.516807 | 450.611094 | 0.70695 | 0.705098 | 0.184908 | Random Forest Regressor - After tuning |
Gradient boosting
Hyperparameters to consider¶
n_estimators: [100, 150, 200, 250],
learning_rate: [0.01, 0.05, 0.1],
subsample: [0.5, 0.6, 0.7, 0.8, 1],
max_features: ['sqrt', 'log2', None],
max_depth: [3, 5, 8, 10,],
min_samples_split: [6, 8, 10, 12]
min_samples_leaf:[4, 8,10]
RandomizedSearchCV:
- estimator=_pipeline
- Hyperparameters
- jobs = -1
- iterations = 60
- scoring = neg_root_mean_squared_error
- cv = 5
- random state = 1
_tuned_GB = GradientBoostingRegressor(random_state=1)
_pipeline = make_pipeline(_pipeline_preprocessor, _tuned_GB)
# Best Hyperparameters scores (positive RMSE)
print("Best params:", _GB_randomized_cv.best_params_)
print("Best CV RMSE:", -_GB_randomized_cv.best_score_)
Best params: {'gradientboostingregressor__subsample': 0.7, 'gradientboostingregressor__n_estimators': 100, 'gradientboostingregressor__min_samples_split': 6, 'gradientboostingregressor__min_samples_leaf': 8, 'gradientboostingregressor__max_features': 'sqrt', 'gradientboostingregressor__max_depth': 3, 'gradientboostingregressor__learning_rate': 0.05}
Best CV RMSE: 610.1829332102039
best_pipeline_gb = _GB_randomized_cv.best_estimator_
best_pipeline_gb
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Product_Sugar_Content',
                                                   'Product_Type', 'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type'])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(learning_rate=0.05,
                                           max_features='sqrt',
                                           min_samples_leaf=8,
                                           min_samples_split=6, random_state=1,
                                           subsample=0.7))])
_predictions_train_gb = model_performance_regression(best_pipeline_gb, X_train, y_train)
_predictions_train_gb["Model"] = 'Gradient boost - After tuning'
_perf_train = pd.concat([_perf_train,_predictions_train_gb], ignore_index=True)
_predictions_train_gb
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 604.465527 | 476.781998 | 0.677628 | 0.676952 | 0.169115 | Gradient boost - After tuning |
_predictions_val_gb = model_performance_regression(best_pipeline_gb, X_val, y_val)
_predictions_val_gb["Model"] = 'Gradient boost - After tuning'
_perf_val = pd.concat([_perf_val,_predictions_val_gb], ignore_index=True)
_predictions_val_gb
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 573.8879 | 451.414201 | 0.706571 | 0.704717 | 0.186055 | Gradient boost - After tuning |
Observations¶
After hyperparameter tuning, an improvement in the RMSE was observed on the validation set. Specifically, the Random Forest model's RMSE decreased from 589.02 to 573.52, while the Gradient Boost model's RMSE improved from 576.43 to 573.89.
Although the difference is marginal, the enhancement resulting from the tuning process is evident.
Model Performance Comparison, Final Model Selection, and Serialization¶
Performance comparison - Training¶
_perf_train
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 599.302247 | 471.734564 | 0.683112 | 0.682447 | 0.166443 | Random Forest Regressor - Before tuning |
| 1 | 602.784108 | 474.329441 | 0.679419 | 0.678747 | 0.167860 | Gradient Boost Regressor - Before tuning |
| 2 | 605.067619 | 476.487114 | 0.676986 | 0.676308 | 0.169244 | Random Forest Regressor - After tuning |
| 3 | 604.465527 | 476.781998 | 0.677628 | 0.676952 | 0.169115 | Gradient boost - After tuning |
Observations¶
Training set
For the Random Forest model, a slight deterioration in the RMSE was observed, increasing from 599.30 to 605.07. A similar trend was noted for the R-Squared value.
The Gradient Boost model also exhibited a deterioration in its RMSE, rising from 602.78 to 604.46.
However, in both instances, the difference is not substantial. The values obtained from the validation set will be reviewed, as they are of greater interest for assessing whether an improvement in the models' generalization capabilities for predicting unseen data has occurred.
Performance comparison - Validation¶
_perf_val
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | Model | |
|---|---|---|---|---|---|---|
| 0 | 589.020035 | 459.391759 | 0.690892 | 0.688939 | 0.188043 | Random Forest Regressor - Before tuning |
| 1 | 576.429627 | 451.981812 | 0.703966 | 0.702095 | 0.186927 | Gradient Boost Regressor - Before tuning |
| 2 | 573.516807 | 450.611094 | 0.706950 | 0.705098 | 0.184908 | Random Forest Regressor - After tuning |
| 3 | 573.887900 | 451.414201 | 0.706571 | 0.704717 | 0.186055 | Gradient boost - After tuning |
Observations¶
Validation set
For the Random Forest model, the RMSE improved from 589.02 to 573.52. Similarly, the R-Squared also showed improvement, increasing from 0.69 to 0.71.
In the case of the Gradient Boost model, an improved performance in the RMSE was also observed, with values decreasing from 576.43 to 573.88.
While the results on the training set did not show improvement, the validation set results did improve. Therefore, it can be interpreted that the models with hyperparameter tuning are indeed better at predicting unseen cases, which is the scenario of greater interest.
Final model selection¶
Both models yielded very similar results. However, the Random Forest model was selected after applying hyperparameter tuning due to its superior RMSE value of 573.52 on the validation set, and its better R-Squared metric of 0.71.
Additionally, Random Forest models offer greater interpretability compared to Gradient Boost models, and their training time is shorter.
Performance check on the test set¶
_predictions_test = model_performance_regression(best_pipeline_rf, X_test, y_test)
_predictions_test
| RMSE | MAE | R-squared | Adj. R-squared | MAPE | |
|---|---|---|---|---|---|
| 0 | 621.011706 | 489.714707 | 0.66544 | 0.663326 | 0.166553 |
Observations¶
RMSE and MAE: Given that the statistical summary indicates the third quartile (Q3) is USD 4,145.16, the RMSE and MAE errors represent a small fraction of the variable's value. This suggests the model demonstrates relatively good performance.
R-Squared: The model explains 66.5% of the variability in the product's total sales across stores, which can be considered acceptable.
MAPE: The model's average error is 16.65% of the actual value, which is also an acceptable figure.
Conclusion: the selected model possesses the capability to predict product sales per store with an average error of 16.6%, explaining 66.5% of sales variability. Nevertheless, this capability could be enhanced through more detailed analysis and by evaluating alternative models or further fine-tuning hyperparameters.
Serialization¶
Tasks¶
- Define the file path to serialize the trained model along with the data preprocessing steps
- Save the trained model pipeline using joblib
- joblib.dump(best model pipeline, saved model path)
- Load the saved model pipeline from the file
- joblib.load
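The steps above amount to a dump/load round trip with joblib. A self-contained sketch, using a trivial stand-in estimator and a temporary path rather than the actual deployment file:

```python
import os
import tempfile

import joblib
import numpy as np
from sklearn.linear_model import LinearRegression  # stand-in for the tuned pipeline

# Fit a trivial estimator: y = 2x
model = LinearRegression().fit(np.array([[0.0], [1.0]]), np.array([0.0, 2.0]))

# Serialize to disk, then restore; the notebook's real file is
# "SuperKart_Model_Deployment.joblib" in the backend folder
saved_model_path = os.path.join(tempfile.gettempdir(), "SuperKart_Model_Deployment.joblib")
joblib.dump(model, saved_model_path)
saved_model = joblib.load(saved_model_path)
print(saved_model.predict(np.array([[2.0]]))[0])
```

Because the whole pipeline (preprocessor plus model) is serialized, the loaded object accepts raw feature frames directly, exactly as `best_pipeline_rf` does.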
Saved model check
saved_model
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Product_Sugar_Content',
                                                   'Product_Type', 'Store_Size',
                                                   'Store_Location_City_Type',
                                                   'Store_Type'])])),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=15, max_features='sqrt',
                                       max_samples=0.9, min_samples_leaf=30,
                                       random_state=1))])
Testing predictions on the test set using the serialized/saved model
saved_model.predict(X_test)
array([3272.21098797, 4893.73359389, 4885.75254721, ..., 4881.93593986,
3286.629245 , 3307.20963861])
Deployment - Backend¶
Flask Web Framework¶
Tasks¶
- Store backend server deployment files
- %%writefile backend_files/app.py
- Initialize Flask app with a name
- Load the trained prediction model
- joblib.load("SuperKart_Model_Deployment.joblib")
- Define a route for the home page @_superkart_api.get('/')
- Define an endpoint @_superkart_api.post('/v1/forecast')
- Get JSON data from the request
- Extract features from the input data
_input = {
'Product_Id': _data['Product_Id'],
'Product_Weight': _data['Product_Weight'],
'Product_Sugar_Content': _data['Product_Sugar_Content'],
'Product_Allocated_Area': _data['Product_Allocated_Area'],
'Product_Type': _data['Product_Type'],
'Product_MRP': _data['Product_MRP'],
'Store_Id': _data['Store_Id'],
'Store_Establishment_Year': _data['Store_Establishment_Year'],
'Store_Size': _data['Store_Size'],
'Store_Location_City_Type': _data['Store_Location_City_Type'],
'Store_Type': _data['Store_Type']
}
# Convert the extracted data into a DataFrame
input_data = pd.DataFrame([_input])
# Make a prediction using the trained model
prediction = saved_model.predict(input_data).tolist()[0]
# Return the prediction as a JSON response
return jsonify({'Sales forescast': prediction})
# Run the Flask app in debug mode
if __name__ == '__main__':
_superkart_api.run(debug=True)
Overwriting backend_files/app.py
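A quick smoke test of the `/v1/forecast` endpoint can build a payload with the eleven fields the route extracts (the field values below are hypothetical) and POST it; the request lines are commented out since they require the backend to be running:

```python
# Hypothetical field values drawn from categories seen in the dataset
payload = {
    "Product_Id": "FDX01",
    "Product_Weight": 12.66,
    "Product_Sugar_Content": "Low Sugar",
    "Product_Allocated_Area": 0.027,
    "Product_Type": "Frozen Foods",
    "Product_MRP": 117.08,
    "Store_Id": "OUT004",
    "Store_Establishment_Year": 2009,
    "Store_Size": "Medium",
    "Store_Location_City_Type": "Tier 2",
    "Store_Type": "Supermarket Type2",
}

# Requires the Flask server to be up; uncomment to exercise the endpoint
# import requests
# r = requests.post("http://127.0.0.1:5000/v1/forecast", json=payload)
# print(r.json())
print(len(payload))  # the endpoint extracts all 11 fields
```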
Dependencies File¶
%%writefile backend_files/requirements.txt
numpy==2.0.2
pandas==2.2.2
scikit-learn==1.6.1
joblib==1.4.2
xgboost==2.1.4
requests==2.32.3
Werkzeug==2.2.2
flask==2.2.2
gunicorn==20.1.0
uvicorn[standard]
streamlit==1.43.2
Overwriting backend_files/requirements.txt
Dockerfile¶
%%writefile backend_files/Dockerfile
# Use a minimal base image with Python 3.9 installed
FROM python:3.9-slim
# Set the working directory inside the container to /app
WORKDIR /app
# Copy all files from the current directory on the host to the container's /app directory
COPY . .
# Install Python dependencies listed in requirements.txt
RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
# Define the command to start the application using Gunicorn with 4 worker processes
# - `-w 4`: Uses 4 worker processes for handling requests
# - `-b 0.0.0.0:7860`: Binds the server to port 7860 on all network interfaces
# - `app:app`: Runs the Flask app (assuming `app.py` contains the Flask instance named `app`)
CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:7860", "app:_superkart_api"]
Overwriting backend_files/Dockerfile
Setting up a Hugging Face Docker Space for the Backend¶
- The space was created via the Hugging Face website: https://huggingface.co/
- Repository id = JosueQB12/SuperKart_JQB
- Link of the Hugging Face space = https://huggingface.co/spaces/JosueQB12/superkart
- Base link for specifying endpoints = https://josueqb12-superkart.hf.space/
Uploading Files to Hugging Face Space (Docker Space)¶
Tasks¶
- Create access token
- Get repo id (Hugging Face space ID)
- Login to Hugging Face platform with the access token
- Initialize the API
- Upload Streamlit app files stored in the folder called deployment_files
- api.upload_folder(folder_path (Local folder path), repo_id, repo_type="space")
CommitInfo(commit_url='https://huggingface.co/spaces/JosueQB12/superkart/commit/39fcb5d1ee278f67be5d97c535fa40ef17df0594', commit_message='Upload folder using huggingface_hub', commit_description='', oid='39fcb5d1ee278f67be5d97c535fa40ef17df0594', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/JosueQB12/superkart', endpoint='https://huggingface.co', repo_type='space', repo_id='JosueQB12/superkart'), pr_revision=None, pr_num=None)
Share the link of the Hugging Face space - Backend¶
- https://huggingface.co/spaces/JosueQB12/superkart
- Base link for specifying endpoints = https://josueqb12-superkart.hf.space/
Deployment - Frontend¶
Streamlit for Interactive UI¶
Tasks¶
- Create a folder for storing the files needed for frontend UI deployment.
- %%writefile frontend_files/app.py
- os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
- os.environ["XDG_CONFIG_HOME"] = "/tmp"
- os.makedirs("/tmp/.streamlit", exist_ok=True)
# Input fields
Product_Id = st.text_input("Product Id.", max_chars=10, help="Enter the product Id.")
Product_Weight = st.number_input("Product Weight", min_value=0.0, help="Enter the product weight (lbs).")
Product_Sugar_Content = st.selectbox("Product sugar content", ["Low Sugar", "Regular", "No Sugar"], help="Enter the product sugar content.")
Product_Allocated_Area = st.number_input("Product allocated area", min_value=0.0, help="Enter the ratio of the allocated display area of the product.")
Product_Type = st.selectbox("Product type", ["Baking Goods","Breads","Breakfast","Canned","Dairy","Frozen Foods","Fruits and Vegetables","Hard Drinks",
"Health and Hygiene","Household","Meat","Others","Seafood","Snack Foods","Soft Drinks","Starchy Foods"],
help="Select the product type.")
Product_MRP = st.number_input("Product MRP", min_value=0.0, help="Enter the maximum retail price of the product.")
Store_Id = st.text_input("Store Id.", max_chars=10, help="Enter the store Id.")
Store_Establishment_Year = st.number_input("Store establishment year", min_value=1987, max_value=2025, help="Enter the year in which the store was established.")
Store_Size = st.selectbox("Store size", ["High", "Medium", "Small"], help="Enter the size of the store.")
Store_Location_City_Type = st.selectbox("Store location city type", ["Tier 1", "Tier 2", "Tier 3"], help="Select the type of city in which the store is located.")
Store_Type = st.selectbox("Store type", ["Departmental Store", "Food Mart", "Supermarket Type1", "Supermarket Type2"],
help="Select the type of store depending on the products being sold.")
product_data = {
"Product_Id": Product_Id,
"Product_Weight": Product_Weight,
"Product_Sugar_Content": Product_Sugar_Content,
"Product_Allocated_Area": Product_Allocated_Area,
"Product_Type": Product_Type,
"Product_MRP": Product_MRP,
"Store_Id": Store_Id,
"Store_Establishment_Year": Store_Establishment_Year,
"Store_Size": Store_Size,
"Store_Location_City_Type": Store_Location_City_Type,
"Store_Type": Store_Type
}
if st.button("Predict", type='primary'):
    response = requests.post("https://josueqb12-superkart.hf.space/v1/forecast", json=product_data)
    if response.status_code == 200:
        result = response.json()
        _sales_forecast = result["Sales forescast"]  # key spelling matches the backend API's response
        st.write(f"Based on the information provided, the total predicted product store sales are: {_sales_forecast:.2f}.")
    else:
        st.error("Error in API request")
Overwriting frontend_files/app.py
Dependencies File¶
%%writefile frontend_files/requirements.txt
pandas==2.2.2
requests==2.32.3
streamlit==1.45.0
Overwriting frontend_files/requirements.txt
Dockerfile¶
%%writefile frontend_files/Dockerfile
# Use a minimal base image with Python 3.9 installed
FROM python:3.9-slim
# Set the working directory inside the container to /app
WORKDIR /app
# Copy all files from the current directory on the host to the container's /app directory
COPY . .
# Install Python dependencies listed in requirements.txt
RUN pip3 install -r requirements.txt
# Define the command to run the Streamlit app on port 8501 and make it accessible externally
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]
# NOTE: Disable XSRF protection for easier external access in order to make batch predictions
Overwriting frontend_files/Dockerfile
Uploading Files to Hugging Face Space (Streamlit Space)¶
Tasks¶
- Set access token: a Hugging Face access token created in write mode
- Set repo id: the Hugging Face space id
- Log in to the Hugging Face platform with the access token
- Initialize the API
- Upload the Streamlit app files stored in the folder called frontend_files
- api.upload_folder(folder_path=&lt;local folder path&gt;, repo_id=&lt;space id&gt;, repo_type="space")
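The upload steps above can be sketched with the `huggingface_hub` library; the token and space id below are placeholders, and the folder name is assumed to be the `frontend_files` folder created earlier:

```python
def upload_frontend(token: str, repo_id: str, folder: str = "frontend_files") -> None:
    """Authenticate and push the local Streamlit files to a Hugging Face Space."""
    from huggingface_hub import HfApi, login  # requires: pip install huggingface_hub

    login(token=token)        # log in with a write-mode access token
    api = HfApi()             # initialize the API client
    api.upload_folder(        # upload the whole folder to the space repo
        folder_path=folder,
        repo_id=repo_id,
        repo_type="space",
    )

# Example call (placeholder token; do not commit real tokens to source control):
# upload_frontend("hf_xxx", "JosueQB12/superkart_frontend")
```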
CommitInfo(commit_url='https://huggingface.co/spaces/JosueQB12/superkart_frontend/commit/d6d162f0d45f545aac1cf2fc4790fe7e31497718', commit_message='Upload folder using huggingface_hub', commit_description='', oid='d6d162f0d45f545aac1cf2fc4790fe7e31497718', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/JosueQB12/superkart_frontend', endpoint='https://huggingface.co', repo_type='space', repo_id='JosueQB12/superkart_frontend'), pr_revision=None, pr_num=None)
Share the link of the Hugging Face space - Frontend¶
Test
_url = "https://josueqb12-superkart.hf.space/v1/forecast"
payload = {
"Product_Id": "FD1961",
"Product_Weight": 9.45,
"Product_Sugar_Content": "Low Sugar",
"Product_Allocated_Area": 0.047,
"Product_Type": "Snack Foods",
"Product_MRP": 95.95,
"Store_Id": "OUT002",
"Store_Establishment_Year": 1998,
"Store_Size": "Small",
"Store_Location_City_Type": "Tier 3",
"Store_Type": "Food Mart"
}
response = requests.post(_url, json=payload)
response
<Response [200]>
print(response.json())
{'Sales forescast': 1736.006160411349}
Actionable Insights and Business Recommendations¶
Insights from the analysis conducted¶
Interpretation of metrics
- RMSE (Root Mean Squared Error): On average, predictions deviate by 621 USD from the actual sales.
- MAE (Mean Absolute Error): On average, the absolute difference between predicted and actual sales is 489.71 USD.
- R-squared: About 66.5% of the variance in sales is explained by the model.
- MAPE (Mean Absolute Percentage Error): On average, the predictions are off by 16.7% of the actual sales value.
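As an illustration of how these four metrics are computed, here is a minimal pure-Python sketch applied to a toy example (not the project's actual test set):

```python
import math

def regression_metrics(y_true, y_pred):
    """Compute RMSE, MAE, R-squared, and MAPE for two equal-length sequences."""
    n = len(y_true)
    errors = [t - p for t, p in zip(y_true, y_pred)]
    rmse = math.sqrt(sum(e ** 2 for e in errors) / n)             # root mean squared error
    mae = sum(abs(e) for e in errors) / n                         # mean absolute error
    mean_t = sum(y_true) / n
    ss_res = sum(e ** 2 for e in errors)                          # residual sum of squares
    ss_tot = sum((t - mean_t) ** 2 for t in y_true)               # total sum of squares
    r2 = 1 - ss_res / ss_tot                                      # coefficient of determination
    mape = sum(abs(e) / abs(t) for e, t in zip(errors, y_true)) / n * 100  # in percent
    return {"RMSE": rmse, "MAE": mae, "R2": r2, "MAPE": mape}

# Toy example: three actual values vs. three predictions
metrics = regression_metrics([100.0, 200.0, 300.0], [110.0, 190.0, 310.0])
```

In practice, `sklearn.metrics` provides equivalent functions (`mean_squared_error`, `mean_absolute_error`, `r2_score`, `mean_absolute_percentage_error`).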
Evaluation
- Product_Id: FD1961
- Weight: 9.45
- Sugar content: Low Sugar
- Allocated area: 0.047
- Type: Snack foods
- MRP: 95.95
- Sold in a Food Mart
- Product_Store_Sales_Total = 1684.82 USD
Based on the trained model:
The actual sales of product FD1961 in OUT002 were 1,684.82 USD
The forecasted sales from the selected model were 1,736 USD.
Absolute Error: |1736 - 1684.82| = 51.18 USD
Percentage Error: (51.18 / 1684.82) × 100 ≈ 3.04%
In this case, the error was much lower than average, indicating an excellent performance.
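The error figures in this worked example can be verified directly:

```python
actual = 1684.82    # actual sales of product FD1961 in OUT002 (USD)
forecast = 1736.00  # forecast from the selected model (USD)

abs_error = abs(forecast - actual)    # absolute error in USD
pct_error = abs_error / actual * 100  # percentage error relative to actual sales

print(f"Absolute error: {abs_error:.2f} USD")
print(f"Percentage error: {pct_error:.2f}%")
```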
- With a MAPE of 16.7%, the model generally predicts sales within ±17% of actual values. The R-squared of 66.5% shows it captures many of the trends in the data, though improvements are possible through more detailed analysis, evaluating alternative models, or further hyperparameter tuning. Nevertheless, this level of accuracy is generally sufficient for inventory, purchasing, and marketing decisions for most products.
Business recommendations¶
Fruits and vegetables represent the highest-selling product category, accounting for 14.3% of all products sold and generating the largest revenue at 4,300,833.27 USD. However, due to their seasonal nature, some produce items may not always be consistently available.
It would be beneficial to strengthen the acquisition and sales efforts for the second and third most significant product groups: Snack Foods, which generated 3,988,996.95 USD, and Dairy, with 2,811,918.04 USD. This strategy would reduce reliance on perishable products with unpredictable availability. Additionally, "Snack Foods" should not exclusively imply unhealthy options; rather, healthy alternatives can be offered to health-conscious customers, who represent the identified target demographic for the stores.
The least sold product category is "Seafood," with only 76 units. It is advisable to assess whether to continue offering these products. Due to their nature, they require special in-store placement and specific maintenance, such as refrigeration, which may incur costs that do not justify the return on investment. Furthermore, these products are subject to a shorter expiration period.
Over 70% of stores are located in cities where the standard of living is middle class (Tier 2). Both areas with a higher standard of living (Tier 1) and those with lower purchasing power (Tier 3) are represented in smaller, yet very similar, proportions at 15.4% and 13.1%, respectively.
Marketing efforts and product selections should match the local purchasing power in each zone. This strategy aims to boost sales of products genuinely appealing to the local demographic. It is recommended that Key Performance Indicators (KPIs) specific to each Store Type be developed to effectively measure these efforts.
This approach could also benefit from utilizing "Context Proxies," which use a derived variable to indirectly represent an associated variable. For instance, calculating the average sales per store could indirectly reflect each location's capacity to generate profit, capturing characteristics of both the store itself and its surrounding area (e.g., its socio-economic status).
An interesting finding is that despite the majority of products sold being low in sugar, all sugar content categories generate very similar revenues. In other words, the fact that over 50% of products are low in sugar does not translate to higher sales compared to products with a regular sugar level or sugar-free products, at least when interpreting the median values.
This could mean that products with a regular sugar level might sell well, perhaps even better than low-sugar options. This is because customers actively seek these products, even if they are not the predominant items offered in stores.
Finally, it would be beneficial to begin recording data on the seasonal sales peaks for each product category. The aim is to forecast sales more accurately based on this information. Additionally, a marketing strategy that includes discounts and promotions for these products at specific times of the year could be developed.