Data Science Asked by Arka Patra on March 13, 2021
# Neural Network Architecture
no_hid_layers = 1
hid = 3
no_out = 1
# Xavier Ininitialization of weights w
w1 = np.random.randn(hid, n+1)*np.sqrt(2/(hid+n+1))
w2 = np.random.randn(no_out, hid+1)*np.sqrt(2/(no_out+hid+1))
# Sigmoid Activation Function
def g(x):
sig = 1/(1+np.exp(-x))
return sig
def frwrd_prop(X, w1, w2):
z2 = w1 @ X.T
z2 = norm(z2, axis=0)
a2 = np.insert(g(z2), 0, 1, axis=0)
h = g((w2@a2))
return (h,a2)
# Calculating Cost and Gradient
def Cost(X, y, w1, w2, lmbda=0):
# Initializing Cost J and Gradients dw
J = 0
dw1 = np.zeros(w1.shape)
dw2 = np.zeros(w2.shape)
# Forward Propagation to calculate the value of the output
h, a2 = frwrd_prop(X, w1, w2)
# Calculate the Cost Function J
J = -(np.sum(y.T*np.log(h) + (1-y).T*np.log(1-h)) - lmbda/2*(np.sum(np.sum(w1[:,1:].T@w1[:,1:])) + np.sum(w2[:,1:].T@w2[:,1:])))/m
# Applying Back Propagation to calculate the Gradients dw
D3 = h-y
D2 = (w2.T@D3)*a2*(1-a2)
dw1[:,0] = (D2[1:]@X)[:,0]/m
dw2[:,0] = ([email protected])[:,0]/m
dw1[:, 1:] = ((D2[1:]@X)[:,1:] + lmbda*w1[:,1:])/m
dw2[:, 1:] = (([email protected])[:,1:] + lmbda*w2[:,1:])/m
# Gradient clipping
if(abs(np.linalg.norm(dw1))>4.5):
dw1 = dw1*4.5/(np.linalg.norm(dw1))
if(abs(np.linalg.norm(dw2))>4.5):
dw1 = dw1*4.5/(np.linalg.norm(dw2))
return (J, dw1, dw2)
# Adam's Optimization technique for training w
def Train(w1, w2, maxIter=50):
# Algorithm
a, b1, b2, e = 0.001, 0.9, 0.999, 10**(-8)
V1 = np.zeros(w1.shape)
V2 = np.zeros(w2.shape)
S1 = np.zeros(w1.shape)
S2 = np.zeros(w2.shape)
for i in range(maxIter):
J, dw1, dw2 = Cost(X, y, w1, w2)
V1 = b1*V1 + (1-b1)*dw1
S1 = b2*S1 + (1-b2)*(dw1**2)
V2 = b1*V2 + (1-b1)*dw2
S2 = b2*S2 + (1-b2)*(dw2**2)
if i!=0:
V1 = V1/(1-b1**i)
S1 = S1/(1-b2**i)
V2 = V2/(1-b1**i)
S2 = S2/(1-b2**i)
w1 = w1 - a*V1/(np.sqrt(S1)+e)*dw1
w2 = w2 - a*V2/(np.sqrt(S2)+e)*dw2
print("tttIteration : ", i+1, " tCost : ", J)
return (w1, w2)
# Training Neural Network
w1, w2 = Train(w1,w2)
I’m using Adam’s Optimization to converge Gradient Descent to a global minima but the cost is becoming stagnant (not changing) after around 15 iterations(the number is not fixed). The initial cost due to random initialization of weights is changing very minutely before becoming constant. And this is giving training accuracy from 45% to 70% for different runs of the exact same code. Can you help me with the reason behind this?
All stochastic gradient descent (SGD) optimizers, including Adam, have randomization built and have no guarantees of reaching a global minima. The randomization is a result of training on a sub-sample of the data at each step. There are no guarantees of reaching a global minima because gradient descent optimizers are first-order, iterative optimization techniques.
Answered by Brian Spiering on March 13, 2021
Get help from others!
Recent Answers
Recent Questions
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP