In [13]:
import numpy as np
from collections import defaultdict

Build your own Pytorch - 2: Backpropagation¶

In this tutorial, we are going to learn the core of any automatic differentiation software: the backpropagation algorithm. The backpropagation algorithm allows us to compute gradients iteratively by going backwards from a leaf node $x_n$ to its ancestors $x_1,\dots,x_{n-1}$.

The goal is as follows: let $x_1,\dots,x_n$ be the values of nodes $i=1,\dots,n$ in a computation graph. Let's assume that $x_n$ is a leaf node, i.e. $x_1,\dots,x_{n-1}$ are ancestors of $x_n$. Then, we want to compute: $$\frac{d}{dx_i}x_n$$ for all $i=1,\dots,n-1$. A simple derivation that you can find in textbooks is as follows: if $x_{i+1}$ is only a function of $x_{i}$, then using the chain rule, we have: $$\frac{d}{dx_i}x_n = \frac{d}{dx_{i+1}}x_n \frac{d}{dx_i}x_{i+1}$$ So basically, by using the chain rule we can compute $\frac{d}{dx_i}x_{n}$ iteratively by going backwards over $i=n,n-1,n-2,\dots,1$. However, in reality, $x_{i+1}$ can be a complicated function of several nodes (e.g. in a recurrent neural network, nodes are re-used multiple times). In this tutorial, we will see how backpropagation is implemented in such complicated (but practically very important) scenarios.
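
As a concrete instance of this simple chain case: if $x_2=\sin(x_1)$ and $x_3=x_2^2$, then $$\frac{d}{dx_1}x_3 = \frac{d}{dx_2}x_3\,\frac{d}{dx_1}x_2 = 2x_2\cos(x_1) = 2\sin(x_1)\cos(x_1).$$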

Note: we are using the code developed in the previous tutorial that I have put into a package called compgraph.

In [14]:
import compgraph as cg
from compgraph.nodes import *

Optional: mathematical preparation. As we work with tensors of arbitrary shape (not just vectors and matrices), we quickly define common concepts such as Jacobians and dot products for tensors. Let $x,y$ be two tensors with $\text{shape}(x)=(n_1,\dots,n_r)$ and $\text{shape}(y)=(m_1,\dots,m_s)$. Then the Jacobian $\frac{d}{dx}y$ is the tensor of shape $(m_1,\dots,m_s,n_1,\dots,n_r)$ with $$\left[\frac{d}{dx}y\right]_{j_1,\dots,j_s,i_1,\dots,i_r}=\frac{d}{dx_{i_1,\dots,i_r}}y_{j_1,\dots,j_s}$$ If $x,y$ are two tensors with $\text{shape}(x)=(n_1,\dots,n_r,m_1,\dots,m_s)$ and $\text{shape}(y)=(m_1,\dots,m_s,l_1,\dots,l_t)$, then $\text{shape}(x\cdot y)=(n_1,\dots,n_r,l_1,\dots,l_t)$ with: $$\left[x\cdot y\right]_{i_1,\dots,i_r,j_1,\dots,j_t}=\sum\limits_{k_1,\dots,k_s} x_{i_1,\dots,i_r,k_1,\dots,k_s}\,y_{k_1,\dots,k_s,j_1,\dots,j_t}$$
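
To see this tensor dot product in action, numpy's tensordot performs exactly this contraction of the trailing axes of $x$ with the leading axes of $y$. The following snippet is only an illustration and is not part of compgraph:

x = np.random.randn(2, 3, 4, 5)   # shape (n_1, n_2, m_1, m_2)
y = np.random.randn(4, 5, 6)      # shape (m_1, m_2, l_1)
# contract the last two axes of x with the first two axes of y
z = np.tensordot(x, y, axes=2)
print(z.shape)                    # (2, 3, 6), i.e. (n_1, n_2, l_1)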

1. Mathematical Derivation of the backpropagation algorithm on computation graphs¶

I. A few definitions to get started. Let our computation graph $G$ have $n$ nodes $i=1,\dots,n$ and let $x_1,\dots,x_n$ be their computed node values. Let the index $i$ represent their order, i.e. $x_{i+1}$ was computed after $x_i$ (and $x_{i+1}$ is therefore a function of only $x_1,\dots,x_{i}$). For a node $i$, let $P_i$ be the parents of node $i$ and $C_i$ the children of node $i$. Let $A_i$ be the descendants of node $i$ (the nodes reachable from node $i$) and $B_i$ be the nodes that are neither node $i$ itself nor descendants of $i$ (i.e. the complement of $A_i\cup\{i\}$).

II. We know all elementary operations. In a computation graph, we know the elementary operations $f$ that convert two nodes $x,y$ into an operational node $z=f(x,y)$. For example, $f$ could be addition ($f(x,y)=x+y$) or matrix multiplication ($f(x,y)=x\cdot y$). For every operational node $i$, the value $x_i$ is a function of its parent nodes $x_{p_i},x_{q_i}$:

$$x_i = f_{j_i}(x_{p_i},x_{q_i})$$

where $f_{j_i}$ is one of the pre-defined elementary functions.

III. We know the gradient of all elementary operations. For every elementary operation $f$, we know the gradients with respect to each operand, which we call the left gradient and the right gradient: $$\text{lgrad}(f)(x,y)=\frac{d}{dx}f(x,y)$$ $$\text{rgrad}(f)(x,y)=\frac{d}{dy}f(x,y)$$ These definitions lead to the following adjoint functions: $$\text{ladj}(f)(x,y,z)=z \cdot \frac{d}{dx}f(x,y)$$ $$\text{radj}(f)(x,y,z)=z\cdot \frac{d}{dy}f(x,y)$$ where $z$ is a tensor with the same shape as $f(x,y)$. Note that $z\cdot \frac{d}{dx}f(x,y)$ uses the dot product and Jacobian for tensors as defined above.

IV. Deriving the gradient. Let $x_n$ be the leaf node and $x_i$ the ancestor node for which we want to compute the gradient $\frac{d}{dx_i}x_n$. For now, let's consider the values $x_{B_i}$ as fixed (i.e. all nodes that are neither node $i$ itself nor descendants of $i$ are treated as constants). Then, for some (unknown) function $G$:

$$x_n = G(x_{C_i}) = G(x_{c_1},\dots,x_{c_{k}}) = G(f_{j_1}(x_i,x_{j_1}),...,f_{j_k}(x_i,x_{j_k}))$$

for some $j_1<\dots<j_k$ and $c_1<c_2<\dots<c_k$, where we assume for the sake of simplicity that $x_i$ has always been the left operand. In other words, for fixed $x_{B_i}$, the value of $x_n$ only depends on the children $C_i=\{c_1,\dots,c_k\}$ of node $i$. If we take the gradient with respect to $x_i$, we get:

\begin{align*} \frac{d}{dx_i} x_n =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\frac{d}{dx_{i}}x_{c_l} \quad \text{[by the chain rule]}\\ =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\left(\text{lgrad}(f_{j_l})(x_i,x_{j_l})+\text{rgrad}(f_{j_l})(x_i,x_{j_l})\frac{d}{dx_i}x_{j_l}\right)\quad \text{[by the chain rule]}\\ =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\left(\text{lgrad}(f_{j_l})(x_i,x_{j_l})+\text{rgrad}(f_{j_l})(x_i,x_{j_l})\sum\limits_{m=1}^{l-1} \frac{d}{dx_{c_m}}x_{j_l}\frac{d}{dx_{i}}x_{c_m}\right) \quad [\text{because }x_{j_l}\text{ can only depend on } x_{c_1},\dots,x_{c_{l-1}}]\\ =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\left(\text{lgrad}(f_{j_l})(x_i,x_{j_l})+\text{rgrad}(f_{j_l})(x_i,x_{j_l})\sum\limits_{m=1}^{l-1} \frac{d}{dx_{c_m}}x_{j_l}\,\text{lgrad}(f_{j_m})(x_i,x_{j_m})\right) \quad [\text{by the definition of lgrad}(f_{j_m})]\\ =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\,\text{lgrad}(f_{j_l})(x_i,x_{j_l})+\sum\limits_{l = 1}^{k}\sum\limits_{m=1}^{l-1} \frac{d}{dx_{c_l}}G(x_{C_i})\,\text{rgrad}(f_{j_l})(x_i,x_{j_l})\frac{d}{dx_{c_m}}x_{j_l}\,\text{lgrad}(f_{j_m})(x_i,x_{j_m})\quad[\text{matrix algebra}]\\ =&\sum\limits_{l = 1}^{k}\frac{d}{dx_{c_l}}G(x_{C_i})\,\text{lgrad}(f_{j_l})(x_i,x_{j_l})+\sum\limits_{l = 1}^{k}\sum\limits_{m=l+1}^{k}\frac{d}{dx_{c_m}}G(x_{C_i})\,\text{rgrad}(f_{j_m})(x_i,x_{j_m})\frac{d}{dx_{c_l}}x_{j_m}\,\text{lgrad}(f_{j_l})(x_i,x_{j_l})\quad[\text{reindexing}]\\ =&\sum\limits_{l = 1}^{k}\left(\frac{d}{dx_{c_l}}G(x_{C_i})+\sum\limits_{m=l+1}^{k}\frac{d}{dx_{c_m}}G(x_{C_i})\,\text{rgrad}(f_{j_m})(x_i,x_{j_m})\frac{d}{dx_{c_l}}x_{j_m}\right)\text{lgrad}(f_{j_l})(x_i,x_{j_l})\quad[\text{matrix algebra}]\\ =&\sum\limits_{l = 1}^{k}\left(\frac{d}{dx_{c_l}}G(x_{C_i})+\sum\limits_{m=l+1}^{k}\frac{d}{dx_{c_m}}G(x_{C_i})\frac{d}{dx_{c_l}}x_{c_m}\right)\text{lgrad}(f_{j_l})(x_i,x_{j_l})\quad[\text{chain rule}]\\ =&\sum\limits_{l = 1}^{k}\left(\frac{d}{dx_{c_l}}x_n\right)\text{lgrad}(f_{j_l})(x_i,x_{j_l})\quad[\text{chain rule and because }x_{c_m}\text{ can only affect }x_{c_{m+1}},\dots,x_{c_{k}}]\\ =&\sum\limits_{l = 1}^{k}\text{ladj}(f_{j_l})\left(x_i,x_{j_l},\frac{d}{dx_{c_l}}x_n\right)\quad[\text{definition of ladj}] \end{align*}

In other words, the derivative of a leaf value $x_n$ with respect to an ancestor node $x_i$ equals the sum, over the children $C_i$ of $i$, of the adjoints evaluated at the children's gradients. Note the significance of this: to compute the derivative $\frac{d}{dx_i}x_n$, we only need to know the derivatives of the children nodes, $\frac{d}{dx_{c_l}}x_n$, and how to take derivatives of the elementary operations $f_{j_l}$. The only important thing is that we traverse the graph backwards in hierarchical order. As we have learnt in tutorial 1, this basically corresponds to breadth-first search. We only need to modify the breadth-first search such that we not only build the computation graph but also accumulate gradients.
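
As a quick sanity check of this formula, consider a tiny graph with one variable and one constant: $x_2 = x_1 + c$ and $x_3 = x_1 \cdot x_2$, so node $1$ has the two children $C_1=\{2,3\}$. The formula gives $$\frac{d}{dx_1}x_3 = \text{ladj}(\cdot)\left(x_1,x_2,\frac{d}{dx_3}x_3\right) + \text{ladj}(+)\left(x_1,c,\frac{d}{dx_2}x_3\right) = x_2\cdot 1 + 1\cdot x_1 = 2x_1+c,$$ which matches differentiating $x_3 = x_1(x_1+c)$ directly.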

2. Backpropagation with Breadth-First-Search¶

The above derivation leads to the following algorithm to compute gradients:

Input: leaf node $l$.

  1. Initialize NodesQueue with $l$
  2. Initialize gradient dict: grad_dict$(l)=1$ and grad_dict$(j)=0$ for any other node $j$.
  3. Initialize graph G with a single node $l$.
  4. While NodesQueue not empty:

    4.1. Pop the node $j$ in NodesQueue with the highest node_order.

    4.2. If $j$ is an OperationalNode with $x_j=f_j(x_{p_j},x_{q_j})$, then:

      4.2.1. Add the parent nodes $p_j,q_j$ to G.

      4.2.2. Add the edges $p_j\to j$ and $q_j\to j$ to G.

      4.2.3. Add gradient to $p_j$ if $p_j$ is not a ConstantNode: grad_dict$(p_j)$ += $\text{ladj}(f_j)(x_{p_j},x_{q_j},$ grad_dict$(j))$

      4.2.4. Add gradient to $q_j$ if $q_j$ is not a ConstantNode: grad_dict$(q_j)$ += $\text{radj}(f_j)(x_{p_j},x_{q_j},$ grad_dict$(j))$

  5. Return grad_dict

3. Implementing adjoints¶

For every elementary operation $f_j$ that we defined in our computation graph in tutorial 1, we have to define the adjoint function. We list here a few examples:

3.1. $f$ = sum¶

If $f(x,y) = x+y$, then: $$\frac{d}{dx}f(x,y) = \frac{d}{dy}f(x,y) = \mathbb{1}_d$$ $$\Rightarrow \text{ladj}(f)(x,y,z)=z, \text{radj}(f)(x,y,z)=z$$ where $d$ is the dimension of $x$.

In [2]:
def add_grad(prev_adjoint, node):
    return [prev_adjoint, prev_adjoint]

3.2. $f$ = multiply¶

If $f(x,y) = x*y$ (componentwise multiplication), then: $$\frac{d}{dx}f(x,y) = \text{diag}(y)$$ $$\frac{d}{dy}f(x,y) = \text{diag}(x)$$ $$\Rightarrow \text{ladj}(f)(x,y,z)=z*y, \text{radj}(f)(x,y,z)=z*x$$

In [3]:
def mul_grad(prev_adjoint, node):
    return [
        prev_adjoint * node.operand_b,
        prev_adjoint * node.operand_a
    ]

3.3. Matrix multiplication¶

If $f(x,y)=x \cdot y$ for matrices $x\in\mathbb{R}^{k\times l},y\in\mathbb{R}^{l\times m}$ we have that: $$\frac{d}{dx_{i,j}}f_{i',j'}(x,y) =\delta_{i=i'}y_{j,j'} \Rightarrow [z\cdot \frac{d}{dx}f(x,y)]_{ij}=\sum\limits_{j'}z_{ij'}y_{j,j'} \Rightarrow \text{ladj}(f)(x,y,z)=zy^T$$ $$\frac{d}{dy_{i,j}}f_{i',j'}(x,y) =\delta_{j=j'}x_{i',i} \Rightarrow [z\cdot \frac{d}{dy}f(x,y)]_{ij}=\sum\limits_{i'}z_{i'j}x_{i',i} \Rightarrow \text{radj}(f)(x,y,z)=x^Tz$$

In [4]:
def dot_grad(prev_adjoint, node):

    prev_adj = prev_adjoint
    op_a = node.operand_a
    op_b = node.operand_b

    # promote 1-d operands to 2-d column/row vectors so that the matrix
    # formulas ladj = z y^T and radj = x^T z apply directly
    if node.operand_b.ndim == 1:
        prev_adj = cg.reshape(prev_adjoint, (-1, 1))
        op_b = cg.reshape(op_b, (-1, 1))

    if node.operand_a.ndim == 1:
        prev_adj = cg.reshape(prev_adjoint, (1, -1))
        op_a = cg.reshape(op_a, (1, -1))

    adj_op_a = cg.dot(prev_adj, op_b.T)
    adj_op_b = cg.dot(op_a.T, prev_adj)

    # drop the axes we added above so the adjoints match the operand shapes
    if node.operand_a.ndim == 1:
        adj_op_a = adj_op_a.squeeze()

    if node.operand_b.ndim == 1:
        adj_op_b = adj_op_b.squeeze()

    return [adj_op_a, adj_op_b]
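
As a quick numerical sanity check of the formula $\text{ladj}(f)(x,y,z)=zy^T$ derived above, we can build the full Jacobian explicitly and contract it with $z$. This is a standalone numpy snippet, independent of compgraph:

k, l, m = 3, 4, 2
x = np.random.randn(k, l)
y = np.random.randn(l, m)
z = np.random.randn(k, m)                        # same shape as x.dot(y)
# full Jacobian d(xy)/dx of shape (k, m, k, l): J[i', j', i, j] = delta(i', i) * y[j, j']
jac_x = np.einsum('ai,jb->abij', np.eye(k), y)
# the tensor dot product z . J contracts the first two axes of J, as defined above
ladj_x = np.tensordot(z, jac_x, axes=2)
print(np.allclose(ladj_x, z.dot(y.T)))           # True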

3.4. Max operation¶

In neural networks, we also use operations that are not differentiable everywhere. An important example is the max operation, which is used, for example, in max pooling. For the max operation, we approximate the gradient as follows: we count the number of elements in the input tensor that equal the maximum, say there are $d$ such values. Then we set the gradient to $1/d$ for these entries and to $0$ for all others.

In [5]:
def max_grad(prev_adjoint, node):
    # mark the entries of the input that equal the maximum
    doperand_a = cg.where(node.operand_a == node.with_keepdims, 1, 0)
    # if d entries are tied for the maximum, each receives a share of 1/d
    normalizers = cg.sum(doperand_a, axis=node.axis, keepdims=True)
    normalized_doperand_a = doperand_a / normalizers

    if node.axis is not None:
        # re-insert the reduced axis so the adjoint broadcasts over the input shape
        return [np.expand_dims(prev_adjoint, node.axis) * normalized_doperand_a, None]
    else:
        return [prev_adjoint * normalized_doperand_a, None]
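
To make the tie handling concrete, here is what the chosen subgradient looks like for a small vector with a tied maximum, written in plain numpy (this mirrors the counting logic above and does not use the compgraph API):

x = np.array([3., 1., 3.])
mask = (x == x.max()).astype(float)   # entries that attain the maximum
print(mask / mask.sum())              # [0.5 0.  0.5], i.e. 1/d for each of the d maxima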

3.5. Packaging adjoints¶

I have written adjoint functions for all numerical functions that we defined to build computation graphs and put them into the autodiff package. Every function basically uses the techniques defined above. The only technical novelty is a function decorator called @define_grad_func that ensures that all inputs have the correct shape and deals with broadcasting edge cases, e.g. cases where tensors of different shapes are summed via broadcasting, such as $[2]+[4,2,1]=[6,4,3]$.
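
To give a flavour of what the decorator has to handle, here is a minimal sketch of the un-broadcasting step, assuming plain numpy arrays and a hypothetical helper name (the real @define_grad_func in the autodiff package may do this differently):

def _unbroadcast(grad, target_shape):
    # hypothetical helper, for illustration only: sum a gradient over the
    # axes that numpy broadcasting added so it matches the operand's shape
    while grad.ndim > len(target_shape):
        grad = grad.sum(axis=0)                        # drop axes broadcasting prepended
    for axis, size in enumerate(target_shape):
        if size == 1:
            grad = grad.sum(axis=axis, keepdims=True)  # collapse axes broadcast from size 1
    return grad

# example: a (2,1) tensor added to a (4,2,3) tensor is broadcast,
# so its gradient must be summed back down to shape (2,1)
g = np.ones((4, 2, 3))
print(_unbroadcast(g, (2, 1)).shape)   # (2, 1)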

In [6]:
import autodiff.grads as grads

4. Implementing Breadth-First-Search Backpropagation¶

Finally, we can also implement the breadth-first search backpropagation algorithm as defined above: the crucial line is

op_grad = getattr(grads, '{}_grad'.format(current_op))

that selects the right adjoint function for a specific operation from the autodiff.grads package.

In [7]:
def compute_backprop(node, verbose=False):
    """
    computes and returns the gradient of the given node wrt to VariableNodes
    the function implements a breadth-first-search (BFS) to traverse the
    computational graph from the gievn node back to VariableNodes

    Parameters:
    ----------
    node: Node
        the leaf node to compute its gradient
    """

    adjoint = defaultdict(int)
    grad = {} #to be computed for variable nodes:
    queue = NodesHeap()
    
    #Ensure that the node is a scalar:
    assert node.size == 1, "We can only compute gradients of scalar nodes."
    
    # put the given node in the queue and set its adjoint to one
    adjoint[node.name] = ConstantNode.create_using(np.ones(node.shape))
    queue.push(node)

    while len(queue) > 0:
        current_node = queue.pop()
        
        if verbose:
            print("Popped node: ", current_node.name)

        if isinstance(current_node, ConstantNode):
            continue
        if isinstance(current_node, VariableNode):
            grad[current_node.name] = adjoint[current_node.name]
            continue

        current_adjoint = adjoint[current_node.name]
        current_op = current_node.opname
        
        #Get gradient function:
        op_grad = getattr(grads, '{}_grad'.format(current_op))
        
        #Compute next adjoints:
        next_adjoints = op_grad(current_adjoint, current_node)

        #Add next_adjoint to old adjoint:
        adjoint[current_node.operand_a.name] = adjoint[current_node.operand_a.name] + next_adjoints[0]
        
        #Add new node to queue if not already inside the queue:
        if current_node.operand_a not in queue:
            queue.push(current_node.operand_a)

        #Do the same for operand_b if it exists:
        if current_node.operand_b is not None:
            adjoint[current_node.operand_b.name] = adjoint[current_node.operand_b.name] + next_adjoints[1]

            if current_node.operand_b not in queue:
                queue.push(current_node.operand_b)

    return grad

5. Computing a few example gradients¶

Finally, let's compute a few example gradients to see whether the algorithm above leads to correct results. We use a function that computes gradients by numerical approximation:

In [8]:
import itertools
def check_gradient(fx, x_input, suspect):
    """
    checks the correctness of the suspect derivative value against
    the value of the numerical approximation of the derivative

    Parameters:
    ----------
    fx: callable
        The function to check its derivative
    wrt: int
        0-based index of the variable to differntiate with respect to
    args: list
        the values of the function variables at the derivative point
    suspect: float
        the the suspected value of the derivative to check
    """
    h = 1.e-7
    approx_grad = np.zeros_like(x_input)
    fx_input = fx(x_input)
    
    for el in itertools.product(*[range(i) for i in x_input.shape]):
        x_input_shifted = x_input.copy()
        x_input_shifted[el] = x_input_shifted[el] + h
        approx_grad[el] = (fx(x_input_shifted) - fx_input) / h

    return approx_grad
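
Before applying it to compgraph nodes, we can sanity-check the checker itself on a plain numpy function whose gradient we know analytically, e.g. $f(x)=\sum_{ij} x_{ij}^2$ with gradient $2x$:

x0 = np.array([[1.0, -2.0], [0.5, 3.0]])
num_grad = check_gradient(lambda x: np.sum(x**2), x0, 2 * x0)
print(np.allclose(num_grad, 2 * x0, atol=1e-4))   # True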

5.1. Example A¶

In [11]:
cg.reset()
var_node = cg.VariableNode.create_using([[1,2,4],[2.0,4.0,5.0]])
const_node = cg.ConstantNode.create_using([[8,1,3],[4.0,2.0,4.0]])

def function_1(x_input):
    power_x = x_input**const_node
    x_sum = x_input + const_node
    x_cos = cg.cos(x_sum)
    labels = cg.ConstantNode.create_using(np.zeros_like(x_cos))
    labels[:,0] = 1
    softmax = cg.softmax_cross_entropy(x_cos,labels)
    return softmax

output = function_1(var_node)
cg.build_and_visualize_graph(output)
computed_gradient = compute_backprop(output)['_0']
approx_grad = check_gradient(function_1,var_node,computed_gradient)
assert np.allclose(approx_grad,computed_gradient)

5.2. Example B¶

In [12]:
cg.reset()
var_node = cg.VariableNode.create_using([[[1,-4,4],[1.0,4.0,5.0]],[[12,-34,44],[-2,-4,6.0]]])
const_node = cg.ConstantNode.create_using([[[-1,4,4],[2.0,-3.0,5.0]],[[12,-3,4],[-4,-4,2.0]]])
mat_node = cg.ConstantNode.create_using([[4,-1],[2,3]])

def function_2(x_input):
    x_sum = x_input - const_node
    x_cos = cg.cos(x_sum)
    x_sin = cg.sin(x_sum)
    x_reduced_1 = cg.sum(x_cos,axis=2)
    x_reduced_2 = cg.mean(x_sin,axis=2)
    prod_reduced = x_reduced_1*x_reduced_2
    max_prod = cg.max(prod_reduced,axis=1)
    output = cg.dot(mat_node,max_prod)
    return cg.sum(output)

output = function_2(var_node)
cg.build_and_visualize_graph(output)
computed_gradient = compute_backprop(output)['_0']
approx_grad = check_gradient(function_2,var_node,computed_gradient)
assert np.allclose(approx_grad,computed_gradient)