-
Notifications
You must be signed in to change notification settings - Fork 0
/
naive_bayes.py
169 lines (143 loc) · 6.55 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import numpy as np
import math
class NaiveBayes(object):
    """Categorical naive Bayes classifier with Laplace smoothing.

    The model keeps raw occurrence counts (``prior_counts`` and
    ``likelihood_counts``) and derives the smoothed probability tables
    (``prior`` and ``likelihood``) from them, so it can be trained in one
    batch with :meth:`train` and refined one example at a time with
    :meth:`update`.

    Labels may be arbitrary hashable values; each distinct label is mapped to
    a class index in order of first appearance (``label_table`` maps
    label -> index, ``real_labels`` maps index -> label).
    """

    # Pseudo-count added to every (feature, value, class) cell and to every
    # class so that no probability is ever exactly zero.
    LAPLACE_SMOOTHING = 1

    def __init__(self, num_class, feature_dim, num_value):
        """Initialize an untrained naive Bayes model.

        prior is P(class) with a dimension of (# of class,).
        likelihood is P(F_i = f | class) with a dimension of
        (# of features per example, # of possible values per feature,
        # of class).

        Args:
            num_class(int): number of classes to classify
                (ex: num of unique picture labels)
            feature_dim(int): feature dimension for each example
                (ex: num of pixels per picture)
            num_value(int): max number of possible values for any feature
                (ex: num of colors in a pixel)
        """
        self.num_value = num_value
        self.num_class = num_class
        self.feature_dim = feature_dim

        self.label_table = dict()   # label -> class index
        self.real_labels = dict()   # class index -> label
        self.num_examples = 0

        table_shape = (feature_dim, num_value, num_class)
        self.prior_counts = np.zeros(num_class)
        self.prior = np.zeros(num_class)            # P(class)
        self.likelihood_counts = np.zeros(table_shape)
        self.likelihood = np.zeros(table_shape)     # P(F_i = f | class)

    def _label_index(self, label):
        """Return the class index of *label*, registering it on first sight.

        Raises:
            ValueError: if registering *label* would exceed ``num_class``.
        """
        index = self.label_table.get(label)
        if index is None:
            index = len(self.label_table)
            if index >= self.num_class:
                raise ValueError(
                    "label {!r} would be class #{} but the model only has "
                    "room for {} classes".format(
                        label, index + 1, self.num_class))
            self.label_table[label] = index
            self.real_labels[index] = label
        return index

    def _refresh_probabilities(self):
        """Recompute ``prior`` and ``likelihood`` from the raw counts."""
        k = self.LAPLACE_SMOOTHING
        total = self.prior_counts.sum()
        self.num_examples = int(total)
        # P(class) = (N_class + k) / (N + k * #classes)
        self.prior = (self.prior_counts + k) / (total + k * self.num_class)
        # P(F_i = f | class) = (N_{i,f,class} + k) / (N_class + k * #values)
        # prior_counts broadcasts over the trailing class axis.
        self.likelihood = (self.likelihood_counts + k) / (
            self.prior_counts + k * self.num_value)

    def _log_tables(self):
        """Return (log prior, log likelihood) restricted to seen classes.

        Class slots that never received a label are excluded so that a
        prediction can always be translated back to a real label.

        Raises:
            ValueError: if the model has not seen any training example.
        """
        num_seen = len(self.label_table)
        if num_seen == 0:
            raise ValueError(
                "NaiveBayes model has no training data; call train() first")
        return (np.log(self.prior[:num_seen]),
                np.log(self.likelihood[:, :, :num_seen]))

    def train(self, train_set, train_label):
        """Train the model (self.prior and self.likelihood) on a dataset.

        Counts are accumulated into ``prior_counts`` / ``likelihood_counts``
        and the probability tables are then rebuilt with Laplace smoothing.
        ``prior`` and ``likelihood`` hold plain probabilities; logarithms are
        taken at prediction time.

        Args:
            train_set(numpy.ndarray): training examples with a dimension of
                (# of examples, feature_dim); entries are integer feature
                values in [0, num_value)
            train_label(numpy.ndarray): training labels with a dimension of
                (# of examples)

        Raises:
            ValueError: if more than ``num_class`` distinct labels are seen.
        """
        samples = np.asarray(train_set, dtype=np.intp)
        # Resolve every label first so an over-full label set is rejected
        # before any count is touched.
        class_indices = [self._label_index(label)
                         for label in train_label[:samples.shape[0]]]
        positions = np.arange(samples.shape[1])
        for row, class_index in zip(samples, class_indices):
            self.prior_counts[class_index] += 1
            # One (feature, value) cell per feature position: indices are
            # unique, so the in-place fancy-indexed add is exact.
            self.likelihood_counts[positions, row, class_index] += 1
        self._refresh_probabilities()

    def test(self, test_set, test_label):
        """Classify a dataset by maximum a posteriori (MAP) and score it.

        Args:
            test_set(numpy.ndarray): testing examples with a dimension of
                (# of examples, feature_dim)
            test_label(numpy.ndarray): testing labels with a dimension of
                (# of examples, )

        Returns:
            accuracy(float): fraction of examples whose predicted label
                equals the true label (0.0 for an empty test set)
            pred_label(numpy.ndarray): predicted labels with a dimension of
                (# of examples, )

        Raises:
            ValueError: if the model has not been trained.
        """
        samples = np.asarray(test_set, dtype=np.intp)
        number_of_examples = samples.shape[0]
        positions = np.arange(samples.shape[1])
        log_prior, log_likelihood = self._log_tables()

        pred_label = np.zeros(number_of_examples)
        for example, row in enumerate(samples):
            scores = log_prior + log_likelihood[positions, row].sum(axis=0)
            # argmax keeps the first maximum, i.e. the earliest-seen class.
            pred_label[example] = self.real_labels[int(np.argmax(scores))]

        correct_predictions = sum(
            1 for predicted, truth in zip(pred_label, test_label)
            if predicted == truth)
        if number_of_examples:
            accuracy = correct_predictions / number_of_examples
        else:
            accuracy = 0.0
        print(accuracy)
        return accuracy, pred_label

    def guess(self, test_set):
        """Predict the label of a single example.

        Prints the log-posterior score of every seen class, then returns the
        label with the highest score.

        Args:
            test_set(numpy.ndarray): current play with a dimension of
                (feature_dim)

        Returns:
            The predicted label, as originally passed to train()/update().

        Raises:
            ValueError: if the model has not been trained.
        """
        values = np.asarray(test_set, dtype=np.intp)
        positions = np.arange(values.shape[0])
        log_prior, log_likelihood = self._log_tables()
        scores = log_prior + log_likelihood[positions, values].sum(axis=0)
        for classes, current_prediction in enumerate(scores):
            print(str(classes) + " prediction: "
                  + str(float(current_prediction)))
        return self.real_labels[int(np.argmax(scores))]

    def update(self, test_set, test_label):
        """Fold one more labelled example into the model.

        A label that has not been seen before is registered as a new class
        as long as the model still has a free class slot.

        Args:
            test_set(numpy.ndarray): one example with a dimension of
                (feature_dim)
            test_label: the true label of that example

        Raises:
            ValueError: if *test_label* is new and all ``num_class`` slots
                are already taken.
        """
        values = np.asarray(test_set, dtype=np.intp)
        # Counts are indexed by class index, never by the raw label value.
        class_index = self._label_index(test_label)
        self.prior_counts[class_index] += 1
        self.likelihood_counts[
            np.arange(values.shape[0]), values, class_index] += 1
        self._refresh_probabilities()