Explain how to implement Naïve Bayes in Python from scratch.
To implement Naïve Bayes from scratch, let us take the following example.
First, we import the required libraries and load the dataset.
'''libraries'''
import numpy as np
from math import sqrt
from math import pi
from math import exp
'''linnerud dataset'''
from sklearn.datasets import load_linnerud
# Load the Linnerud dataset bundle; the following steps index it by key.
linnerud = load_linnerud()
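Now we select the feature and target variables. Note that the dataset's 'target' entry (the physiological measurements) is used as the features here, while its 'data' entry (the exercise results) supplies the value from which the class labels are derived.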
# Attributes and target.
# NOTE: the variable names are crossed relative to the dataset keys: `data`
# holds the 'target' entry and `target` holds the 'data' entry.
# Presumably the physiological measurements act as features while the
# exercise results supply the label -- verify against the dataset docs.
data, target = linnerud['target'], linnerud['data']
Let us convert the target values into binary class labels using the following condition:
# Threshold on chin-ups: the first element of every `target` row.
outcome = [row[0] for row in target]

# NOTE: despite its name, `median` holds the arithmetic mean of `outcome`.
# The name is kept so any later code that refers to it keeps working.
# Divide by the actual sample count so the threshold stays correct for any
# number of rows.
median = np.sum(outcome) / len(outcome)

# Label is 0 when chin-ups exceed the threshold, otherwise 1.
label = [0 if x > median else 1 for x in outcome]
Now we group the rows by class label (the last element of each row) and return the groups as a dictionary.
# Split the data by class and return a dict.
def separateclass(dataset):
    """Group the rows of *dataset* by their class label.

    The last element of each row is used as the class label.

    Args:
        dataset: Iterable of rows. Each row must support ``row[-1]`` and
            that last element must be hashable.

    Returns:
        A dict mapping each class label to the list of rows (in input
        order) that carry that label. An empty dataset yields ``{}``.
    """
    separated = {}
    for vector in dataset:
        # setdefault creates the per-class list the first time a label is seen.
        separated.setdefault(vector[-1], []).append(vector)
    return separated
Now we define helper functions that compute the mean, the standard deviation, and the row count of each feature, per class.
# Mean and stdev.
def mean(numbers):
    """Return the arithmetic mean of *numbers*.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If *numbers* is empty.
    """
    # True division already yields a float; no explicit cast is needed.
    return sum(numbers) / len(numbers)
def stdev(numbers):
    """Return the sample standard deviation of *numbers*.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If *numbers* holds fewer than two plain Python
            numbers, because the divisor ``len(numbers) - 1`` is then zero
            (or the mean itself cannot be computed).
    """
    avg = mean(numbers)
    # A generator avoids building a throwaway list just to sum it.
    variance = sum((x - avg) ** 2 for x in numbers) / (len(numbers) - 1)
    return sqrt(variance)
# Mean, stdev, count.
def summarize_dataset(dataset):
    """Summarize every feature column of *dataset*.

    The last column of each row is treated as the class label and is left
    out of the result.

    Args:
        dataset: Iterable of equally long rows whose last element is the
            class label.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per feature column,
        in column order. An empty dataset yields an empty list.
    """
    columns = list(zip(*dataset))
    # Drop the label column up front instead of computing its statistics and
    # then deleting them; this also lets the labels be non-numeric.
    return [
        (mean(column), stdev(column), len(column))
        for column in columns[:-1]
    ]
# Calculation of statistics.
def summarization(dataset):
    """Compute per-class feature statistics for *dataset*.

    The rows are first grouped by class label with ``separateclass`` and
    each group is then summarized with ``summarize_dataset``.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the list of per-feature
        summaries returned by ``summarize_dataset`` for that class's rows.
    """
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separateclass(dataset).items()
    }
Finally, we define the Gaussian probability density function and the function that computes the class probabilities for a given row.
# Gaussian naive Bayes probability function.
def gaussian_probab_func(x, mean, stdev):
    """Return the Gaussian (normal) probability density evaluated at *x*.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: If *stdev* is zero (for plain Python numbers),
            since both the exponent and the normalizer divide by it.
    """
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent
# Probability function.
def probab_pred(summaries, row):
    """Return the unnormalized class scores for *row*.

    Args:
        summaries: Dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence holding the feature values at the same indices as
            the per-feature tuples; any further trailing elements are not
            read.

    Returns:
        A dict mapping each class label to the class prior multiplied by
        the Gaussian density of every feature value. The values are not
        normalized, so they need not sum to 1.
    """
    # Every feature tuple of a class stores that class's row count, so the
    # first tuple is enough to read it.
    total_rows = sum(stats[0][2] for stats in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        # Prior: share of the rows that belong to this class.
        probabilities[class_value] = class_summaries[0][2] / total_rows
        # mu/sigma avoid shadowing the module-level mean() and stdev().
        for i, (mu, sigma, _) in enumerate(class_summaries):
            probabilities[class_value] *= gaussian_probab_func(row[i], mu, sigma)
    return probabilities
Let us enter the data and compute the per-class summaries.
# Probability of data.
# Each row holds three feature values followed by a 0/1 class label.
# Presumably the features are the Linnerud physiological measurements
# (weight, waist, pulse) -- verify against the dataset documentation.
linnerud_data = [
    [191.0, 36.0, 50.0, 1],
    [189.0, 37.0, 52.0, 1],
    [193.0, 38.0, 58.0, 0],
    [162.0, 35.0, 62.0, 0],
    [189.0, 35.0, 46.0, 0],
    [182.0, 36.0, 56.0, 1],
    [211.0, 38.0, 56.0, 1],
    [167.0, 34.0, 60.0, 1],
    [176.0, 31.0, 74.0, 0],
    [154.0, 33.0, 56.0, 0],
    [169.0, 34.0, 50.0, 0],
    [166.0, 33.0, 52.0, 0],
    [154.0, 34.0, 64.0, 0],
    [247.0, 46.0, 50.0, 1],
    [193.0, 36.0, 46.0, 1],
    [202.0, 37.0, 62.0, 0],
    [176.0, 37.0, 54.0, 1],
    [157.0, 32.0, 52.0, 0],
    [156.0, 33.0, 54.0, 0],
    [138.0, 33.0, 68.0, 1],
]

# Per-class (mean, stdev, count) statistics for every feature column.
summaries = summarization(linnerud_data)