from micrograd.engine import Value

a = Value(-4.0)       # create a "Value" object
b = Value(2.0)
c = a + b             # transform a and b into c with "+"
c += c + 1
c += 1 + c + (-a)
print(c.data)         # prints -1.0
c.backward()
print(a.grad)         # prints 3.0
print(b.grad)         # prints 4.0
h = 0.00001

# This is the point (a, b, c)
# for which we want the derivative of d
a = 2.0
b = -3.0
c = 10.0

d1 = a*b + c   # function value at (a, b, c)

a += h         # bump up a by h
d2 = a*b + c   # function value at (a+h, b, c)
a -= h         # restore a

b += h         # bump up b by h
d3 = a*b + c   # function value at (a, b+h, c)
b -= h         # restore b

c += h         # bump up c by h
d4 = a*b + c   # function value at (a, b, c+h)

print('Function value for (a,b,c) d1:\t', d1)
print()
print('Function value for (a+h,b,c) d2:', d2)
# How much the function increased from bumping up a
print('slope', (d2 - d1) / h)
print('\nFunction value for (a,b+h,c) d3:', d3)
# How much the function increased from bumping up b
print('slope', (d3 - d1) / h)
print('\nFunction value of (a,b,c+h) d4:\t', d4)
# How much the function increased from bumping up c
print('slope', (d4 - d1) / h)
Function value for (a,b,c) d1: 4.0
Function value for (a+h,b,c) d2: 3.9999699999999994
slope -3.000000000064062
Function value for (a,b+h,c) d3: 4.00002
slope 2.0000000000131024
Function value of (a,b,c+h) d4: 4.00001
slope 0.9999999999621422
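These numerical slopes agree with the calculus: for d = a*b + c the partial derivatives are dd/da = b, dd/db = a and dd/dc = 1. A minimal check at the same point (a=2.0, b=-3.0, c=10.0):

# Analytic partial derivatives of d = a*b + c at a=2.0, b=-3.0, c=10.0
a, b, c = 2.0, -3.0, 10.0
print('dd/da =', b)    # -3.0, matches the slope from bumping a
print('dd/db =', a)    #  2.0, matches the slope from bumping b
print('dd/dc =', 1.0)  #  1.0, matches the slope from bumping c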
class Value:

    # Object initialization
    def __init__(self, data):
        self.data = data

    # Tells how to print this object nicely
    def __repr__(self):
        return f"Value(data={self.data})"

    # Addition, a+b == a.__add__(b)
    def __add__(self, other):
        out = Value(self.data + other.data)
        return out

    # Multiplication
    def __mul__(self, other):
        out = Value(self.data * other.data)
        return out

a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c   # this really is: a.__mul__(b).__add__(c)
print(d)      # Value(data=4.0)
class Value:

    # This got extended to take in _children
    def __init__(self, data, _children=()):
        self.data = data
        self._prev = set(_children)

    def __repr__(self):
        return f"Value(data={self.data})"

    # Addition, a+b == a.__add__(b)
    def __add__(self, other):
        # We initialize the result's _children to be self and other
        out = Value(self.data + other.data, (self, other))
        return out

    # Multiplication
    def __mul__(self, other):
        # We initialize the result's _children to be self and other
        out = Value(self.data * other.data, (self, other))
        return out

a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c

d        # Value(data=4.0)
d._prev  # {Value(data=-6.0), Value(data=10.0)}
from graphviz import Digraph

# Enumerates all the nodes and edges -> builds a set for them
def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

# Draw the graph
def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name=uid, label="{ %s | data %.4f }" % (n.label, n.data), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name=uid + n._op, label=n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot

draw_dot(L)
def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

# Draw the graph
def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name=uid, label="{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name=uid + n._op, label=n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot

draw_dot(L)
# This is just always the case
L.grad = 1.0

# As L = d * f is given we're certain that dL/dd = f
d.grad = f.data
# and therefore this is also true
f.grad = d.data
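The same bump-and-re-evaluate trick from earlier can confirm these gradients numerically. A minimal sketch with plain floats, assuming the values used for this graph (a=2.0, b=-3.0, c=10.0, f=-2.0, so d.data = 4.0):

# Numerical estimate of dL/df at the point used above (plain floats, not Value objects)
h = 0.0001
a, b, c, f = 2.0, -3.0, 10.0, -2.0
L1 = ((a * b) + c) * f        # L at the original point: -8.0
L2 = ((a * b) + c) * (f + h)  # L after bumping f by h
print((L2 - L1) / h)          # ~4.0, which is d.data, matching f.grad above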
# Moving leaf nodes in gradient direction
# This is gradient ascent (not descent)
a.data += 0.01 * a.grad
b.data += 0.01 * b.grad
c.data += 0.01 * c.grad
f.data += 0.01 * f.grad

# Forward pass
e = a * b
d = e + c
L = d * f

print(L.data)  # L increased
import math
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    a = []
    for i in x:
        a.append(1 / (1 + math.exp(-i)))
    return a

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Tanh vs. Sigmoid')

lower = -5
upper = 5
step = 0.2

ax1.grid()
ax1.set_title('Tanh')
ax1.plot(np.arange(lower, upper, step), np.tanh(np.arange(lower, upper, step)))    # Tanh activation function

ax2.grid()
ax2.set_title('Sigmoid')
ax2.plot(np.arange(lower, upper, step), sigmoid(np.arange(lower, upper, step)))    # Sigmoid activation function (alternative)

plt.show()
import numpy as np

def tanh(x):
    return np.tanh(x)

# Input and weights for the single neuron
x = np.array([0.5, -0.3])        # For simplicity, we'll use a vector of inputs here.
weights = np.array([-0.1, 0.2])

# Bias (b) is set to 0 for this simple example
bias = 0

# Implement the neuron's output using the tanh activation function
neuron_output = tanh(np.dot(weights, x) + bias)
print("Neuron Output:", neuron_output)
# Always a given
o.grad = 1.0

# o = tanh(n), what is do/dn?
# do/dn = 1 - tanh(n)**2
n.grad = 1 - o.data**2

# As addition "just" splits the gradient
x1w1x2w2.grad = n.grad
b.grad = n.grad

# And addition again
x1w1.grad = x1w1x2w2.grad
x2w2.grad = x1w1x2w2.grad

# And multiplication handles like in the example above
x1.grad = x1w1.grad * w1.data
w1.grad = x1w1.grad * x1.data
x2.grad = x2w2.grad * w2.data
w2.grad = x2w2.grad * x2.data  # this will be 0: changing this value does nothing, as it's multiplied by 0
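The do/dn = 1 - tanh(n)**2 step can also be checked numerically with the same bump trick as before. A small sketch, assuming the input/weight/bias values of this neuron (so n is roughly 0.8814):

import math

h = 0.0001
nval = 2.0*(-3.0) + 0.0*1.0 + 6.8813735870195432    # n for the inputs, weights and bias above
print((math.tanh(nval + h) - math.tanh(nval)) / h)  # ~0.5, matching 1 - o.data**2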
class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # Does nothing by default
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    # Addition, a+b == a.__add__(b)
    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            # Route gradient to parents
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad
        out._backward = _backward
        return out

    # Multiplication
    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            # Route gradient affected by data of other node
            self.grad = out.grad * other.data
            other.grad = out.grad * self.data
        out._backward = _backward
        return out

    # Tanh activation function
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            # Local derivative times gradient of child node
            self.grad = (1 - t**2) * out.grad
        out._backward = _backward
        return out

a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'

print(L)        # Value(data=-8.0)
print(L._prev)  # {Value(data=-2.0), Value(data=4.0)}
print(L._op)    # *
# Inputs
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# Weights
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# Bias
b = Value(6.8813735870195432, label='b')  # Making sure backprop numbers come out nice later on

# Neuron value n: x1*w1 + x2*w2 + b
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
# Squashed activation: tanh(n)
o = n.tanh(); o.label = 'o'

draw_dot(o)
o.grad = 1.0   # Base case for backprop multiplication to work

o._backward()
n._backward()
b._backward()  # Nothing happens, as this is a leaf
x1w1x2w2._backward()
x2w2._backward()
x1w1._backward()
# Topological sort
# Building the graph topologically
topo = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)  # Only add node if all preceding nodes were processed first
build_topo(o)

for t in topo:
    print(t)
o.grad = 1.0

topo = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)  # Only add node if all preceding nodes were processed first
build_topo(o)

for node in reversed(topo):
    node._backward()

draw_dot(o)
class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # Does nothing by default
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    # Addition
    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad
        out._backward = _backward
        return out

    # Multiplication
    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad = out.grad * other.data
            other.grad = out.grad * self.data
        out._backward = _backward
        return out

    # Tanh activation function
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad = (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)  # Only add node if all preceding nodes were processed first
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
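A minimal usage sketch of the new backward() on the small expression a*b + c from earlier; the gradients match the manual and numerical results above:

a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
d = a*b + c; d.label = 'd'

d.backward()                   # seeds d.grad = 1.0 and runs _backward() in reverse topological order
print(a.grad, b.grad, c.grad)  # -3.0 2.0 1.0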
class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # Does nothing by default
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    # Addition
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)  # Extension
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += 1.0 * out.grad   # Bugfix
            other.grad += 1.0 * out.grad  # Bugfix
        out._backward = _backward
        return out

    # Negation (special multiplication)
    def __neg__(self):  # -self
        return -1 * self

    # Subtraction (special addition)
    def __sub__(self, other):  # self - other
        return self + (-other)

    # Multiplication
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)  # Extension
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += out.grad * other.data   # Bugfix
            other.grad += out.grad * self.data   # Bugfix
        out._backward = _backward
        return out

    # Power (special multiplication)
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers (for now)"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            self.grad += other * (self.data**(other - 1)) * out.grad
        out._backward = _backward
        return out

    # Called if self is on right side of *
    def __rmul__(self, other):  # other * self
        return self * other

    # Called if self is on right side of +
    def __radd__(self, other):  # other + self
        return self + other

    # True division (special multiplication)
    def __truediv__(self, other):  # self / other
        return self * other**-1

    # Tanh activation function
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1) / (math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad += (1 - t**2) * out.grad  # Bugfix
        out._backward = _backward
        return out

    # Exponential function
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)  # Only add node if all preceding nodes were processed first
        build_topo(self)

        self.grad = 1.0  # Seed gradient always 1.0
        for node in reversed(topo):
            node._backward()
# Just a sanity check for the newly implemented arithmetic
a = Value(2.0)
b = Value(4.0)

print(a + 2)
print(2 + a)
print(a * 2)
print(2 * a)
print(-a)
print(a - b)
print(a.exp())
print(a / b)  # Division: a/b = a * (1/b) = a * (b**(-1)), so we use a function realizing x**k
import torch

x1 = torch.Tensor([2.0]).double();               x1.requires_grad = True  # single element tensors
x2 = torch.Tensor([0.0]).double();               x2.requires_grad = True  # tensor datatype is now double
w1 = torch.Tensor([-3.0]).double();              w1.requires_grad = True  # default dtype was float32
w2 = torch.Tensor([1.0]).double();               w2.requires_grad = True  # now it's float64 aka double
b  = torch.Tensor([6.8813735870195432]).double(); b.requires_grad = True
n = x1*w1 + x2*w2 + b   # perform arithmetic just like with micrograd
o = torch.tanh(n)

print(o.data.item())
o.backward()            # backward() is pytorch's autograd function

print('---')
# These values below are just like micrograd's leftmost layer
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())
# One neuron is able to take multiple inputs and produce one activation scalar
class Neuron:

    def __init__(self, nin):
        # nin -> number of inputs to this neuron
        # Random weight [-1, 1] per input
        self.w = [Value(np.random.uniform(-1, 1)) for _ in range(nin)]
        # Bias controls general "trigger happiness" of neuron
        self.b = Value(np.random.uniform(-1, 1))

    def __call__(self, x):
        # running neuron(x) -> __call__ triggered
        # w * x + b
        # zip() creates an iterator running over the tuples of two iterators
        # self.b is taken as the sum's start value and then added upon
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
        # Squash the activation with tanh
        out = act.tanh()
        return out

    # Convenience code to gather the neuron's parameter list
    def parameters(self):
        return self.w + [self.b]


# A set of neurons making up a (hidden/input/output) NN layer
# E.g. n = Layer(2, 3) -> 3 2-dimensional neurons
class Layer:

    # nout -> how many neurons/outputs should be in this layer
    # nin -> how many inputs are to be expected per neuron
    def __init__(self, nin, nout):
        # literally create a list of neurons as needed
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        # running layer(x) -> __call__ triggered
        # return all of the layer's neuron activations
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs

    # Convenience code to gather all parameters of the layer's neurons
    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]


# MLP -> Multi-layer perceptron -> NN
class MLP:

    # nin -> number of inputs to the NN
    # nouts -> list of numbers, defines sizes of all wanted layers
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        # mlp(x) -> call all layer(x)s values in NN
        for layer in self.layers:
            # Neat forward pass implementation
            x = layer(x)
        return x

    # Convenience code to gather all parameters of all layers' neurons
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]
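A minimal usage sketch of these classes; the [4, 4, 1] layer sizes below are just an example choice (the cells that follow only require 3 inputs and a single output):

n = MLP(3, [4, 4, 1])        # example architecture: 3 inputs -> two hidden layers of 4 neurons -> 1 output
print(n([2.0, 3.0, -1.0]))   # a single Value in (-1, 1), random at initialization
print(len(n.parameters()))   # 41 parameters: 4*(3+1) + 4*(4+1) + 1*(4+1)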
# Features/Inputs
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# Desired targets
ys = [1.0, -1.0, -1.0, 1.0]

# Get the NN's current prediction for xs
ypred = [n(x) for x in xs]
for i in range(len(ypred)):
    print(f'{ypred[i]}\t--> {ys[i]}')
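The next cell reads gradients off the network, which assumes a loss has already been computed and backpropagated in between. A minimal sketch of that step, using the same squared-error loss as the training loop further below:

# Sum of squared errors between predictions and targets (same expression as in the training loop below)
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print(loss)      # a single Value holding the total squared error over the 4 examples
loss.backward()  # populates .grad on every parameter of the MLP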
# Example neuron weight with now calculated gradient
print(n.layers[0].neurons[0].w[0].grad)  # First layer's first neuron's first weight's gradient
print(n.layers[0].neurons[0].w[0].data)  # First layer's first neuron's first weight's value
# Weight update with backpropagation's gradients
for p in n.parameters():
    p.data += -0.01 * p.grad  # Move a tiny bit in opposite direction of gradient to not overfit this single example
# Run epochs and show respective predictions
for t in range(5):
    # Forward pass and squared-error loss
    ypred = [n(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
    print(f'{loss.data}\t - {[y.data for y in ypred]}')

    # Reset gradients before backprop (they accumulate via += otherwise)
    for p in n.parameters():
        p.grad = 0.0
    loss.backward()

    # Gradient descent step
    for p in n.parameters():
        p.data += -0.01 * p.grad