# Explain how to implement Naïve Bayes in Python from scratch.

319    Asked by LarryHuffer in Data Science , Asked on Dec 17, 2019

To implement Naïve Bayes from scratch, let us take the following example.

Initially we import the following libraries and the dataset.

'''libraries'''

import numpy as np

from math import sqrt

from math import pi

from math import exp

'''linnerud dataset'''

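Now we select the features and target variables. Here `linnerud` is assumed to be the already-loaded Linnerud dataset (for example, the object returned by scikit-learn's `load_linnerud()`); its `target` entry is used as the feature matrix and its `data` entry as the source of the class labels.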

'''attributes and target'''

# The feature matrix is read from the dataset's 'target' entry and the
# label source from its 'data' entry (the swap is deliberate: the class
# labels below are derived from the first column of `target`).
data, target = linnerud['target'], linnerud['data']

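Let us convert the target into binary class labels based on the following condition: a row gets label 0 when its chin-up count is above the threshold and label 1 otherwise. Note that the threshold computed below is the average chin-up count, even though the code names it `median`.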

'''median of chinup'''

# First column of every target row (the chin-up count, going by the
# section labels used in this script).
outcome = [row[0] for row in target]

# NOTE: despite its name, `median` holds the arithmetic mean of the
# chin-up counts (sum / count). The name and the value are kept as they
# were; only the divisor changed: it is now the actual number of samples
# instead of a hard-coded 20, so the threshold stays correct for any
# number of rows.
median = np.sum(outcome) / len(outcome)

# if chinup > threshold, label 0, else label 1
label = [0 if x > median else 1 for x in outcome]

Now we separate the rows by class label, so that statistics can be computed for each class.

'''split data and return a dict'''

def separateclass(dataset):
    """Group the rows of ``dataset`` by their last element.

    Returns a dict mapping each distinct last-element value (the class
    label) to the list of complete rows carrying it, in input order.
    """
    by_class = {}
    for row in dataset:
        # The last entry of a row is its class value; create the bucket
        # on first sight, then append the whole row (label included).
        by_class.setdefault(row[-1], []).append(row)
    return by_class

Now we define the following functions

'''mean and stdev'''

def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    ``numbers`` must support both ``sum`` and ``len``; the division is a
    true (floating-point) division, and an empty input divides by zero.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from ``mean(numbers)`` are summed and divided
    by ``len(numbers) - 1`` before taking the square root, so a
    single-element input divides by zero.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    denominator = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / denominator)

'''mean,stdev,count'''

def summarize_dataset(dataset):
    """Return per-feature statistics for the rows in ``dataset``.

    Every row is expected to end with its class label. The result holds
    one ``(mean, stdev, count)`` tuple per feature column, in column
    order; the trailing label column is not summarized. An empty
    ``dataset`` yields an empty list.
    """
    # Transpose rows into columns and drop the label column up front, so
    # mean/stdev are never applied to the (possibly non-numeric) labels
    # and no throwaway statistics are computed for them.
    feature_columns = list(zip(*dataset))[:-1]
    return [
        (mean(column), stdev(column), len(column))
        for column in feature_columns
    ]

'''calculation of statistics'''

def summarization(dataset):
    """Build the per-class feature statistics for ``dataset``.

    Rows are grouped with ``separateclass`` and each group is handed to
    ``summarize_dataset``; the returned dict maps every class value to
    that group's list of summaries.
    """
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separateclass(dataset).items()
    }

Finally we will create the Bayes function

'''gauss naive bayes func'''

def gaussian_probab_func(x, mean, stdev):
    """Return the Gaussian (normal) probability density at ``x``.

    ``mean`` and ``stdev`` are the parameters of the distribution. A zero
    ``stdev`` leads to a division by zero.
    """
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * stdev ** 2
    exponent = exp(-(squared_distance / twice_variance))
    # 1 / (sqrt(2*pi) * sigma) scales the bell curve to unit area.
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * exponent

'''probab func'''

def probab_pred(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    ``summaries`` maps a class value to a list of ``(mean, stdev, count)``
    tuples, one per feature. For each class the score starts at the
    class's share of all rows and is multiplied by the Gaussian density
    of each feature value of ``row``. The returned dict maps class value
    to that (unnormalized) product.
    """
    # The row count of a class is stored in the third slot of its first
    # feature summary.
    class_counts = {cls: stats[0][2] for cls, stats in summaries.items()}
    total_rows = sum(class_counts.values())

    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_counts[class_value] / float(total_rows)
        for index, (mu, sigma, _count) in enumerate(class_summaries):
            score *= gaussian_probab_func(row[index], mu, sigma)
        probabilities[class_value] = score
    return probabilities

Let us plug in the data and compute the per-class summaries.

'''probab of data'''

# Each row: three float feature values followed by the integer class
# label (0 or 1) in the last position.
linnerud_data = [
    [191.0, 36.0, 50.0, 1],
    [189.0, 37.0, 52.0, 1],
    [193.0, 38.0, 58.0, 0],
    [162.0, 35.0, 62.0, 0],
    [189.0, 35.0, 46.0, 0],
    [182.0, 36.0, 56.0, 1],
    [211.0, 38.0, 56.0, 1],
    [167.0, 34.0, 60.0, 1],
    [176.0, 31.0, 74.0, 0],
    [154.0, 33.0, 56.0, 0],
    [169.0, 34.0, 50.0, 0],
    [166.0, 33.0, 52.0, 0],
    [154.0, 34.0, 64.0, 0],
    [247.0, 46.0, 50.0, 1],
    [193.0, 36.0, 46.0, 1],
    [202.0, 37.0, 62.0, 0],
    [176.0, 37.0, 54.0, 1],
    [157.0, 32.0, 52.0, 0],
    [156.0, 33.0, 54.0, 0],
    [138.0, 33.0, 68.0, 1],
]

# Group the rows by their class label (last element) and compute the
# per-feature (mean, stdev, count) summaries for each class.
summaries = summarization(linnerud_data)