Linear Regression from Scratch: Understanding the Mathematics Behind the Model¶
Introduction¶
What is Linear Regression?
Linear regression is one of the simplest yet most powerful algorithms in machine learning.
It is used to model the relationship between an independent variable (X) and a dependent variable (Y) by fitting a linear equation.
If you have ever tried to learn anything about data science and machine learning, one concept you must have come across at least once is "Linear Regression". For those of us who still remember our high school arithmetic, the easiest way to understand it is as "a model that draws the line of best fit for a given set of points."
Almost all data scientists use Scikit-Learn's LinearRegression model, but understanding how it works under the hood is essential for mastering the fundamentals of data science.
The main objective of this project is to walk you through the mathematics behind linear regression and to implement it from scratch using the closed-form solution, also known as the normal equation.
In this notebook, we will:
- Explore the mathematical foundation of linear regression.
- Derive the closed-form solution (also known as the normal equation).
- Implement linear regression from scratch using NumPy and Pandas.
- Evaluate the model's performance by implementing the metrics using only NumPy and Pandas.
- Discuss the drawbacks of the closed-form solution.
The Mathematical Foundation of Linear Regression¶
As you probably already know, the goal of linear regression is to find the best-fitting straight line (or hyperplane in higher dimensions) that describes how the dependent variable changes as the independent variables change.
In simple linear regression, we only have one independent variable, so the aim is to fit a straight line.
The linear regression equation is given by the general equation of a straight line:
$$ y = mx + c $$
Where:
- $ y $ is the dependent variable.
- $ x $ is the independent variable.
- $ m $ is the gradient.
- $ c $ is the y-intercept.
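As a quick refresher, this simple one-variable form can be fitted directly with the classic least-squares formulas for the slope and intercept. A minimal sketch on made-up numbers:

```python
import numpy as np

# Made-up points lying roughly on y = 2x + 1
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([1.1, 2.9, 5.2, 7.0, 9.1])

# Least-squares slope and intercept for a single feature
m = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
c = y.mean() - m * x.mean()

print(m, c)  # slope close to 2, intercept close to 1
```

The rest of this notebook works with the general matrix form, which handles any number of features at once.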
In machine learning, however, it is not so simplified because:
- $ y $ is a vector of the target variables (the variable we want to predict).
- $ X $ is the feature matrix (a matrix of all input features) - we can have one or more independent features.
- $ m $ becomes $ \beta $ (beta), the vector of model coefficients/weights, one per independent feature.
- $ c $ is generalized to an intercept term (introduced below), and an error term $ \epsilon $ (noise) is added.
The linear regression equation therefore becomes:
$$ y = X\beta + \epsilon $$
The error term $ \epsilon $ represents the portion of $ y $ that cannot be explained by the linear relationship with $ X $. It accounts for noise in the data, measurement errors, and unmodeled influences.
To simplify the linear regression model, we introduce an intercept term $ \beta_0 $ (bias), which shifts the regression line so it best fits the data. We rewrite the equation as:
$$ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_d x_d + \epsilon $$
where:
- $ \beta_0 $ is the intercept term.
- $ \beta_1, \beta_2, \dots, \beta_d $ are the coefficients of the independent variables.
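To make the notation concrete, here is a minimal sketch (all numbers made up) showing how prepending a column of ones to $ X $ folds the intercept $ \beta_0 $ into a single matrix product:

```python
import numpy as np

# Made-up example: 3 samples, 2 features
X = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])
beta0 = 0.5                    # intercept term
betas = np.array([2.0, -1.0])  # one coefficient per feature

# Prepend a column of ones so the intercept rides along in the product
X_b = np.c_[np.ones(X.shape[0]), X]
b = np.r_[beta0, betas]

y_hat = X_b @ b  # identical to beta0 + X @ betas
print(y_hat)
```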
Deriving The Closed Form Equation¶
The whole idea of a best-fit line is to minimize the errors between the predicted y-value (on the regression line) and the actual y-value for every value of x. This is also how we measure the performance of a linear regression model: we calculate the absolute differences (residuals) between the predicted and actual y-values and take their mean. This is called the Mean Absolute Error (MAE). We also use the Mean Squared Error (MSE), which, as the name implies, is the mean of the squared residuals.
It follows that the lower our MAE or MSE, the better our model performs, because the differences between the predicted and actual values are smaller.
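Both metrics take only a couple of lines with NumPy; a minimal sketch on made-up actual and predicted values:

```python
import numpy as np

# Hypothetical actual and predicted values, purely for illustration
y_true = np.array([3.0, 5.0, 2.0, 7.0])
y_pred = np.array([2.5, 5.0, 2.0, 8.0])

residuals = y_true - y_pred
mae = np.mean(np.abs(residuals))  # Mean Absolute Error
mse = np.mean(residuals ** 2)     # Mean Squared Error

print(mae, mse)  # 0.375 0.3125
```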
In a linear regression model, we aim to fit the line that minimizes this error. To do this, we must find the value (or vector of values) of $ \beta $ that minimizes it.
To estimate $ \beta $, we minimize the Mean Squared Error (MSE) by differentiating with respect to $ \beta $ and equating the derivative to 0:
Recall, from the definition above,
$$ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $$
Where:
- $ n $ = number of data points
- $ y_i $ = actual y-values
- $ \hat{y}_i $ = predicted y-values
In the simple one-feature case, $ \hat{y}_i = \beta_0 + \beta_1 x_i $, so
$$ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - (\beta_0 + \beta_1 x_i))^2 $$
In matrix form, we define:
- $ y $ as an $ n \times 1 $ vector of observed values.
- $ X $ as an $ n \times (d+1) $ matrix where each row corresponds to a sample, with the first column being ones for the intercept.
- $ \beta $ as a $ (d+1) \times 1 $ vector of coefficients.
Transforming and expressing the MSE equation in matrix form,
$$ \text{MSE} = \frac{1}{n} (y - X\beta)^T (y - X\beta) $$
Expanding the MSE expression,
$$ \text{MSE} = \frac{1}{n} \left[ y^T y - y^T X\beta - \beta^T X^T y + \beta^T X^T X \beta \right] $$
Since $ y^T X\beta $ and $ \beta^T X^T y $ are scalars (each is the transpose of the other), they are equal, so the expression simplifies to:
$$ \text{MSE} = \frac{1}{n} \left[ y^T y - 2y^T X\beta + \beta^T X^T X \beta \right] $$
Taking the derivative with respect to $ \beta $,
$$ \frac{\partial \text{MSE}}{\partial \beta} = \frac{1}{n} \left[ -2X^T y + 2X^T X \beta \right] $$
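This differentiation step relies on two standard matrix-calculus identities, stated here for reference (the second uses the fact that $ X^T X $ is symmetric):

$$ \frac{\partial}{\partial \beta} \left( y^T X \beta \right) = X^T y, \qquad \frac{\partial}{\partial \beta} \left( \beta^T X^T X \beta \right) = 2 X^T X \beta $$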
Setting the derivative to zero to find the minimum,
$$ -2X^T y + 2X^T X \beta = 0 $$
Solving for $ \beta $,
$$ X^T X \beta = X^T y $$
$$ \beta = (X^T X)^{-1} X^T y $$
This is known as the Normal Equation.
By minimizing the MSE, we derived the closed-form solution for the linear regression model. This solution allows us to find the optimal coefficients $ \beta $ that best fit the data.
This is the equation we will use to train the model in this project instead of the Scikit-Learn LinearRegression model.
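Before using it on real data, it is worth sanity-checking the normal equation on synthetic data with known coefficients (a minimal sketch; the coefficients and noise level below are made up for illustration):

```python
import numpy as np

rng = np.random.default_rng(42)

# Synthetic data: y = 1.5 + 3*x1 - 2*x2 + small noise
n = 200
X = rng.normal(size=(n, 2))
true_beta = np.array([1.5, 3.0, -2.0])
X_b = np.c_[np.ones(n), X]  # add intercept column
y = X_b @ true_beta + rng.normal(scale=0.01, size=n)

# Normal equation: beta = (X^T X)^-1 X^T y
beta_hat = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y
print(beta_hat)  # close to [1.5, 3.0, -2.0]
```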
Implementing Linear Regression from Closed Form Equation¶
We will now implement the Linear Regression on a dataset using the closed form equation we derived above.
Let's start by importing all the libraries we will use for this project
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Load the Dataset¶
For this project, we will use the Energy Consumption Dataset from kaggle.
This dataset is all about predicting how much energy buildings use based on different features and environmental factors. It includes information on various types of buildings, their square footage, the number of people inside, the appliances they use, the average temperature, and even the day of the week. The aim is to create a linear regression model that can estimate energy consumption using these details.
Data Dictionary:
- Building Type - The type of building (residential, commercial, etc.)
- Square Footage - The total size of the building in square feet
- Number of Occupants - The number of people occupying the building
- Appliances Used - The number of appliances used in the building
- Average Temperature - The average temperature of the building or climate area (in Celsius)
- Day of Week - Indicates whether the data point corresponds to a weekday or weekend
- Energy Consumption - The energy consumption of the building in kWh (kilowatt-hours). This is the target variable
The dataset is originally split into two files (train and test), so we will load both files and merge them.
# For downloading datasets from Kaggle
import kaggle
# Download the dataset
kaggle.api.dataset_download_files("govindaramsriram/energy-consumption-dataset-linear-regression", path="./datasets/", unzip=True)
Dataset URL: https://www.kaggle.com/datasets/govindaramsriram/energy-consumption-dataset-linear-regression
# Load train and test datasets
df_train = pd.read_csv('./datasets/train_energy_data.csv')
df_test = pd.read_csv('./datasets/test_energy_data.csv')
# merge both datasets
df = pd.concat([df_train, df_test], axis=0)
# Preview the data
df.head()
| | Building Type | Square Footage | Number of Occupants | Appliances Used | Average Temperature | Day of Week | Energy Consumption |
|---|---|---|---|---|---|---|---|
| 0 | Residential | 7063 | 76 | 10 | 29.84 | Weekday | 2713.95 |
| 1 | Commercial | 44372 | 66 | 45 | 16.72 | Weekday | 5744.99 |
| 2 | Industrial | 19255 | 37 | 17 | 14.30 | Weekend | 4101.24 |
| 3 | Residential | 13265 | 14 | 41 | 32.82 | Weekday | 3009.14 |
| 4 | Commercial | 13375 | 26 | 18 | 11.92 | Weekday | 3279.17 |
Explore the dataset¶
Checking for null values ...
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Building Type        1100 non-null   object
 1   Square Footage       1100 non-null   int64
 2   Number of Occupants  1100 non-null   int64
 3   Appliances Used      1100 non-null   int64
 4   Average Temperature  1100 non-null   float64
 5   Day of Week          1100 non-null   object
 6   Energy Consumption   1100 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 68.8+ KB
✓ The dataset contains no null values.
Inspecting the summary statistics of the numerical columns ...
df.describe()
| | Square Footage | Number of Occupants | Appliances Used | Average Temperature | Energy Consumption |
|---|---|---|---|---|---|
| count | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 |
| mean | 25500.527273 | 48.268182 | 25.730000 | 22.559745 | 4168.191273 |
| std | 14236.955632 | 29.127624 | 14.116209 | 7.122357 | 924.278723 |
| min | 560.000000 | 1.000000 | 1.000000 | 10.050000 | 1683.950000 |
| 25% | 13203.750000 | 22.000000 | 13.000000 | 16.365000 | 3510.460000 |
| 50% | 25785.500000 | 47.000000 | 26.000000 | 22.810000 | 4189.690000 |
| 75% | 37536.750000 | 73.000000 | 38.000000 | 28.760000 | 4859.510000 |
| max | 49997.000000 | 99.000000 | 49.000000 | 34.990000 | 6530.600000 |
✓ The quantiles seem to be evenly spaced around the median, which is close to the mean for all features. This suggests there are no significant outliers.
We can therefore use the standard scaling technique to scale the values.
Inspecting the unique values in each categorical column ...
# Check unique values in each categorical column
for col in df.select_dtypes(include='object').columns:
print(f"{col}: {df[col].unique()}")
Building Type: ['Residential' 'Commercial' 'Industrial']
Day of Week: ['Weekday' 'Weekend']
Modify column names for uniformity and to avoid errors (replace spaces with underscores and unify case)
new_cols = ['building_type', 'square_footage', 'number_of_occupants', 'appliances_used', 'average_temperature', 'day_of_week', 'energy_consumption']
df.columns = new_cols
df.head()
| | building_type | square_footage | number_of_occupants | appliances_used | average_temperature | day_of_week | energy_consumption |
|---|---|---|---|---|---|---|---|
| 0 | Residential | 7063 | 76 | 10 | 29.84 | Weekday | 2713.95 |
| 1 | Commercial | 44372 | 66 | 45 | 16.72 | Weekday | 5744.99 |
| 2 | Industrial | 19255 | 37 | 17 | 14.30 | Weekend | 4101.24 |
| 3 | Residential | 13265 | 14 | 41 | 32.82 | Weekday | 3009.14 |
| 4 | Commercial | 13375 | 26 | 18 | 11.92 | Weekday | 3279.17 |
Data Pre-Processing¶
Since the data is already clean, getting this dataset ready for the model just requires making sure it is in the right format.
We will perform the following pre-processing operations:
- Scaling: to ensure fairness so that the model does not disproportionately assign weight to some features just because they generally have higher values
- Encoding: to convert categorical values to numerical values, because linear regression using the closed form will only accept numerical values (because of the matrix operations)
- Splitting: to divide the data into training and testing sets
Note: All these operations will be performed using Numpy and Pandas (No Scikit-Learn)
Encoding: Using pd.get_dummies()¶
# Encoding for `building_type` column
df = pd.get_dummies(df, columns=['building_type'], drop_first=True)
# Encoding for `day_of_week` column
df = pd.get_dummies(df, columns=['day_of_week'], drop_first=True)
# convert to integer
df[['building_type_Industrial', 'building_type_Residential', 'day_of_week_Weekend']] = df[['building_type_Industrial', 'building_type_Residential', 'day_of_week_Weekend']].astype(int)
df.head()
| | square_footage | number_of_occupants | appliances_used | average_temperature | energy_consumption | building_type_Industrial | building_type_Residential | day_of_week_Weekend |
|---|---|---|---|---|---|---|---|---|
| 0 | 7063 | 76 | 10 | 29.84 | 2713.95 | 0 | 1 | 0 |
| 1 | 44372 | 66 | 45 | 16.72 | 5744.99 | 0 | 0 | 0 |
| 2 | 19255 | 37 | 17 | 14.30 | 4101.24 | 1 | 0 | 1 |
| 3 | 13265 | 14 | 41 | 32.82 | 3009.14 | 0 | 1 | 0 |
| 4 | 13375 | 26 | 18 | 11.92 | 3279.17 | 0 | 0 | 0 |
Scaling: Using Standard Scaling Method¶
X = df.drop(columns=['energy_consumption'])
y = df['energy_consumption']
# Perform scaling for all numerical columns
num_cols = X.select_dtypes(include='number').columns
for col in num_cols:
X[col] = (X[col] - X[col].mean()) / X[col].std()
# Preview the data
X.head()
| | square_footage | number_of_occupants | appliances_used | average_temperature | building_type_Industrial | building_type_Residential | day_of_week_Weekend |
|---|---|---|---|---|---|---|---|
| 0 | -1.295047 | 0.952080 | -1.114322 | 1.022169 | -0.684251 | 1.356725 | -0.997730 |
| 1 | 1.325527 | 0.608763 | 1.365097 | -0.819918 | -0.684251 | -0.736399 | -0.997730 |
| 2 | -0.438684 | -0.386856 | -0.618438 | -1.159693 | 1.460124 | -0.736399 | 1.001364 |
| 3 | -0.859420 | -1.176484 | 1.081735 | 1.440570 | -0.684251 | 1.356725 | -0.997730 |
| 4 | -0.851694 | -0.764504 | -0.547597 | -1.493852 | -0.684251 | -0.736399 | -0.997730 |
Implementing the Model¶
Now that we have preprocessed the data, we will implement the Linear Regression model using the closed-form equation
# Split the data into train and test sets
# Get index for splitting the data 80% train
train_split_index = int(0.8 * len(df))
# Split data into train and test
train_data_X = X.iloc[:train_split_index]
test_data_X = X.iloc[train_split_index:]
train_data_y = y.iloc[:train_split_index]
test_data_y = y.iloc[train_split_index:]
# Get X and y arrays for the train and test sets
X_train = train_data_X[X.columns]
y_train = train_data_y.values
X_test = test_data_X[X.columns]
y_test = test_data_y.values
Implement the closed-form solution to linear regression: $ \beta = (X^T X)^{-1} X^T y $
- a. Where $ \beta $ is the vector of weights (including the bias $ \beta_0 $), $ X $ is the feature matrix with a column of ones added to represent the intercept, and $ y $ is the vector of target values.
- b. We use this implementation to compute the coefficients of the linear regression model on the training split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((880, 7), (880,), (220, 7), (220,))
# Add a column of ones to X_train for the bias term
X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]
# Compute different parts of closed form Solution
XTX_inv = np.linalg.inv(X_train.T @ X_train)
XTy = X_train.T @ y_train
# Compute beta (coefficients)
coefficients = XTX_inv @ XTy
Printing the learned coefficients (weights) of the model.
features_intercept_list = ['intercept'] + list(df.columns.drop('energy_consumption'))
# Print the computed coefficients
for feature, weight in zip(features_intercept_list, coefficients):
print(f'{feature} : {weight}')
intercept : 4168.191390734602
square_footage : 711.8474075727273
number_of_occupants : 291.276209188346
appliances_used : 282.324251173425
average_temperature : -35.61210470642426
building_type_Industrial : 233.1683008733712
building_type_Residential : -238.8776550204007
day_of_week_Weekend : -25.011398286890056
Evaluating the model's performance¶
Utilize the learned coefficients to generate predictions on the test dataset split, where:¶
$$ \hat{y} = X\beta $$
# Compute y_pred using the equation
y_pred = X_test @ coefficients
# Preview y_pred
y_pred[:10]
array([2924.84992404, 2495.95032149, 4873.00038804, 5088.20019472,
5518.15034471, 3542.20047749, 3372.40042206, 4895.60064123,
3771.10044665, 1915.34997022])
We will apply the following evaluation metrics using NumPy functions only:
- a. Mean Absolute Error (MAE)
- b. Mean Squared Error (MSE)
Mean Absolute Error (MAE)
# Calculate MAE
mae = np.mean(np.abs(y_test - y_pred))
mae
np.float64(0.011712122589022494)
Mean Squared Error (MSE)
# Calculate MSE
mse = np.mean((y_test - y_pred)**2)
mse
np.float64(0.00018989334638738073)
These low values indicate that the model is making very accurate predictions, with average prediction errors close to zero. The small MSE also means that large errors are rare.
In short: The model is performing very well. 🎯👏
Now, we'll calculate R² (coefficient of determination).
R² (Coefficient of Determination)
# Calculate R²
ss_res = np.sum((y_test - y_pred)**2) # Residual Sum of Squares
ss_tot = np.sum((y_test - np.mean(y_test))**2) # Total Sum of Squares
r2_score = 1 - (ss_res / ss_tot)
r2_score
np.float64(0.9999999997701359)
The R² score is ~0.999999997, which is extremely close to 1.
This means that the model explains almost 100% of the variance in the target variable.
The predictions are highly accurate, with almost negligible error.
It's an excellent fit, suggesting that the closed-form linear regression model performed exceptionally well.
Drawbacks of the Closed-Form Equation¶
While the closed-form solution (also known as the normal equation) provides an exact answer to the linear regression problem, it comes with several important drawbacks that make it less ideal in many real-world scenarios:
1. Poor Scalability with Large Datasets:
The closed-form approach requires forming $ X^T X $ (cost O(nd²) for n samples and d features) and inverting it (cost O(d³)). This becomes computationally expensive and inefficient as the number of features grows, making it unsuitable for high-dimensional or large-scale data.
2. Inversion Issues with Singular Matrices:
If $ X^T X $ is singular or nearly singular—often caused by multicollinearity (highly correlated features) or having more features than samples—the matrix cannot be inverted, causing the closed-form solution to break down.
3. No Built-in Regularization:
Unlike techniques such as Ridge or Lasso regression, the closed-form method does not include any form of regularization. As a result, it may overfit the training data, leading to poor performance on unseen data.
4. Memory Intensive:
The process of calculating and storing large matrices, especially for datasets with many features, can be very memory-demanding. On typical machines, this could lead to slow performance or memory errors.
5. Numerical Instability:
Computing the inverse of a matrix can introduce floating-point precision errors, particularly if the matrix is ill-conditioned. This can result in unreliable or unstable predictions.
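In practice, the inversion and stability issues above are usually side-stepped by not forming the inverse at all. A minimal sketch (on made-up data) of two common alternatives: `np.linalg.lstsq`, which solves the least-squares problem via an SVD, and the ridge-regularized normal equation $ \beta = (X^T X + \lambda I)^{-1} X^T y $, solved with `np.linalg.solve` rather than an explicit inverse:

```python
import numpy as np

rng = np.random.default_rng(0)

# Made-up data: 100 samples, intercept column plus 3 features
X = np.c_[np.ones(100), rng.normal(size=(100, 3))]
y = X @ np.array([1.0, 2.0, -1.0, 0.5]) + rng.normal(scale=0.1, size=100)

# 1) SVD-based least squares: stable even when X^T X is singular
beta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)

# 2) Ridge closed form: the lambda*I term keeps the matrix invertible
#    and shrinks the coefficients slightly
lam = 1.0
I = np.eye(X.shape[1])
I[0, 0] = 0.0  # by convention, do not penalize the intercept
beta_ridge = np.linalg.solve(X.T @ X + lam * I, X.T @ y)

print(beta_lstsq)
print(beta_ridge)
```

Both approaches avoid `np.linalg.inv` entirely, which is the usual recommendation whenever $ X^T X $ may be ill-conditioned.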