I'm trying to implement the deep belief network for the Semantic Hashing article (http://www.cs.toronto.edu/~hinton/absps/sh.pdf) by Geoffrey Hinton and Ruslan Salakhutdinov. I have a hard time figuring out how to implement the constrained poisson model in the restricted boltzmann machine (RBM), so that my model will take real valued word-count vectors and update the weights correctly?
Below you find the essential code for my RBM:
for epoch in range(epochs):
errsum = 0
batch_index = 0
for _ in self.batches:
# Positive phase - generate data from visible to hidden units.
pos_vis = self.__get_input_data__(batch_index)
N = sum(pos_vis, axis = 1)[newaxis].T
batch_size = len(pos_vis)
if self.final_layer:
pos_hid_prob = dot(pos_vis,self.weights) + tile(self.hidden_biases,(batch_size,1))
elif self.first_layer:
pos_hid_prob = dbn.bernoulli(dot(pos_vis,self.weights) + tile(self.hidden_biases,(batch_size,1)))
else:
pos_hid_prob = dbn.bernoulli(dot(pos_vis,self.weights) + tile(self.hidden_biases,(batch_size,1)))
pos_values = dot(pos_vis.T,pos_hid_prob)
pos_hid_act = sum(pos_hid_prob,axis = 0)
pos_vis_act = sum(pos_vis,axis = 0)
# If probabilities are higher than random generated, the states are 1
if self.final_layer:
pos_hid = pos_hid_prob + randn(batch_size,self.num_hid)
else:
randoms = rand(batch_size,self.num_hid)
pos_hid = array(randoms < pos_hid_prob,dtype = int)
# Negative phase - generate data from hidden to visible units and then again to hidden units.
if self.first_layer: # To represent the count data in the visible units for the first layer RBM we use the constrained poisson model.
neg_vis = dbn.constrained_poisson(pos_hid, self.visible_biases, self.weights.T, batch_size, pos_vis)
else: # For all RBM's besides the first where the data are binary.
neg_vis = dbn.bernoulli(dot(pos_hid,self.weights.T) + tile(self.visible_biases,(batch_size,1)))
if self.final_layer:
neg_hid_prob = dot(neg_vis,self.weights) + tile(self.hidden_biases,(batch_size,1))
else:
neg_hid_prob = dbn.bernoulli(dot(neg_vis,self.weights) + tile(self.hidden_biases,(batch_size,1)))
neg_values = dot(neg_vis.T, neg_hid_prob)
neg_hid_act = sum(neg_hid_prob,axis=0)
neg_vis_act = sum(neg_vis,axis=0)
# Set the error
errsum += sum((pos_vis-neg_vis)**2)
# Update weights and biases
self.delta_weights = self.learning_momentum * self.delta_weights + self.epsilon_weights*((pos_values-neg_values)/batch_size - self.weight_cost*self.weights)
self.delta_visible_biases = self.learning_momentum * self.delta_visible_biases + (self.epsilon_visibleBiases/batch_size) * (pos_vis_act-neg_vis_act)
self.delta_hidden_biases = self.learning_momentum * self.delta_hidden_biases + (self.epsilon_hiddenBiases/batch_size) * (pos_hid_act-neg_hid_act)
self.weights += self.delta_weights
self.visible_biases += self.delta_visible_biases
self.hidden_biases += self.delta_hidden_biases
batch_index += 1
print 'Epoch ',epoch+1,' Error ',errsum
Where the methods for the bernoulli distribution and the constrained poisson model are stated below:
def constrained_poisson(states_hidden_units, biases_visible_units,weights, number_of_docs_in_batch,states_visible_units):
'''
Calculating the probabilities according to the constrained
Poisson model.
Parameters
----------
states_hidden_units: the binary values of the hidden units.
weights: the weight matrices.
number_of_docs_in_batch: number of documents in the batch.
Returns
__________
The batch after calculating the contrained poisson values.
'''
_lambda = dot(states_hidden_units,weights)+tile(biases_visible_units,(number_of_docs_in_batch,1))
# If overflow is encountered in exp.
m = amax(_lambda,axis=1)[newaxis].T
_lambda = exp(_lambda-m)
# Divide each element by the sum of the words in the document.
num_of_rows = sum(_lambda,axis = 1)
for i in range(len(_lambda)):
for j in range(len(_lambda[i])):
_lambda[i,j] = _lambda[i,j]/num_of_rows[i]
# Compute the sum of each row in the initial input data.
N = array(sum(states_visible_units,axis = 1))[newaxis].T
_lambda = _lambda * N
_lambda_power = zeros((len(_lambda),len(_lambda[0])))
for (i,j),value in ndenumerate(_lambda):
_lambda_power[i,j] = power(value,states_visible_units[i,j])
return (exp(-_lambda)*(_lambda_power))/sc.factorial(states_visible_units,exact=False)
def bernoulli(x):
return 1./(1+exp(-x))