# A user wants to compute the entropy of a 0-1 vector many times. Is there a faster way to do this?

831    Asked by SanjanaShah in Data Science , Asked on Nov 17, 2019

The user started from the following code:

def entropy(labels):
    """Compute the normalized entropy of a 0-1 (or small-integer) label vector.

    The Shannon entropy of the empirical label distribution is divided by
    ``log(n_classes)``, where ``n_classes`` is the number of distinct labels
    actually present, so a uniform distribution over the present classes
    scores 1.

    Args:
        labels: Sized 1-D sequence of non-negative integers; it is handed to
            ``np.bincount`` to count occurrences.

    Returns:
        The normalized entropy as a float, or ``0.0`` when there is at most
        one label or only one distinct class.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0

    counts = np.bincount(labels)
    # Drop empty bins so log(0) is never evaluated.
    probs = counts[counts > 0] / n_labels
    n_classes = len(probs)
    if n_classes <= 1:
        return 0.0

    return -np.sum(probs * np.log(probs)) / np.log(n_classes)

We can try four different approaches and see which combination of libraries performs fastest. Those approaches are:

scipy/numpy, numpy/math, pandas/numpy, numpy

import numpy as np

from scipy.stats import entropy

from math import log, e

import pandas as pd

import timeit

def entropy1(labels, base=None):
    """Compute the Shannon entropy of a label sequence via ``scipy.stats.entropy``.

    Args:
        labels: Array-like of class labels; occurrences of each distinct
            value are counted with ``np.unique``.
        base: Logarithm base forwarded to ``scipy.stats.entropy``. ``None``
            leaves SciPy's default in effect.

    Returns:
        The entropy of the empirical label distribution.
    """
    # Only the counts matter; the raw counts are passed on unnormalized and
    # SciPy is relied upon to turn them into probabilities.
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)

def entropy2(labels, base=None):
    """Compute the entropy of a label distribution with NumPy counts and ``math.log``.

    Args:
        labels: Sized array-like of class labels.
        base: Logarithm base; ``None`` selects the natural logarithm.

    Returns:
        The Shannon entropy of the empirical label distribution as a float,
        or ``0.0`` when there is at most one label or one distinct class.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    # np.unique only reports values that occur, so every count is positive
    # and the number of classes is simply the number of counts.
    if len(counts) <= 1:
        return 0.0

    base = e if base is None else base
    # Convert once to plain Python floats: math.log on native floats avoids
    # per-element NumPy scalar overhead inside the Python-level loop.
    probs = (counts / n_labels).tolist()
    return -sum(p * log(p, base) for p in probs)

def entropy3(labels, base=None):
    """Compute the Shannon entropy of a label sequence using pandas.

    Args:
        labels: Data accepted by ``pd.Series``; relative frequencies are
            obtained from ``value_counts(normalize=True)``.
        base: Logarithm base; ``None`` selects the natural logarithm.

    Returns:
        The entropy of the empirical label distribution.
    """
    freqs = pd.Series(labels).value_counts(normalize=True, sort=False)
    result = -(freqs * np.log(freqs)).sum()
    if base is not None:
        # Change of base applied once to the scalar total instead of to
        # every element of the frequency vector.
        result = result / np.log(base)
    # Adding 0.0 turns the -0.0 produced for a single-class input into 0.0.
    return result + 0.0

def entropy4(labels, base=None):
    """Compute the Shannon entropy of a label sequence using NumPy only.

    Args:
        labels: Array-like of class labels; occurrences of each distinct
            value are counted with ``np.unique``.
        base: Logarithm base; ``None`` selects the natural logarithm.

    Returns:
        The entropy of the empirical label distribution.
    """
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    result = -(probs * np.log(probs)).sum()
    if base is not None:
        # Change of base applied once to the scalar total instead of to
        # every element of the probability vector.
        result = result / np.log(base)
    # Adding 0.0 turns the -0.0 produced for a single-class input into 0.0.
    return result + 0.0

Now let's set up the timeit benchmarks:

# Number of calls executed per timing repeat.
repeat_number = 1000000


def _time_entropy(func_name):
    """Time ``func_name(labels)`` on the shared sample input.

    The sample list is created and the function imported in the timeit
    ``setup`` string, so neither is part of the timed statement.

    Args:
        func_name: Name of a function importable from ``__main__``.

    Returns:
        The list of three timings returned by ``timeit.repeat``.
    """
    return timeit.repeat(
        stmt='{}(labels)'.format(func_name),
        setup='labels=[1,3,5,2,3,5,3,2,1,3,4,5];from __main__ import {}'.format(func_name),
        repeat=3,
        number=repeat_number,
    )


a = _time_entropy('entropy1')

b = _time_entropy('entropy2')

c = _time_entropy('entropy3')

d = _time_entropy('entropy4')

Here are the timeit results:

# Print the mean of the timeit repeats for each approach. Each repeat is the
# time for all `repeat_number` calls together, not a per-call time.
for approach, timeit_results in zip(['scipy/numpy', 'numpy/math', 'pandas/numpy', 'numpy'], [a, b, c, d]):
    print('Method: {}, Avg.: {:.6f}'.format(approach, np.array(timeit_results).mean()))

Method: scipy/numpy, Avg.: 63.315312

Method: numpy/math, Avg.: 49.256894

Method: pandas/numpy, Avg.: 884.644023

Method: numpy, Avg.: 60.026938