Official example:
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. ctx is a context object that can be used to
        stash information for backward computation. You can cache arbitrary objects
        for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)  # ctx stashes the data the backward pass needs, i.e. variables that backward() will use.
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input  # If this input is a parameter the network needs to update, the gradient is kept for the later parameter update / optimization step.


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()  # Backpropagation stores the gradients of the trainable parameters.

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()  # Zero the gradients.
    w2.grad.data.zero_()
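For reference, here is a minimal sketch of the same loop on a recent PyTorch (1.0 or later), where the Variable wrapper is deprecated and plain tensors carry requires_grad directly; it reuses the MyReLU class defined above:

import torch

device = torch.device("cpu")   # or torch.device("cuda") to run on GPU
N, D_in, H, D_out = 64, 1000, 100, 10
learning_rate = 1e-6

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

for t in range(500):
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)   # the custom Function is invoked via .apply
    loss = (y_pred - y).pow(2).sum()
    loss.backward()

    with torch.no_grad():                    # update the weights without recording the ops in autograd
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()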
Class inheritance: subclass torch.autograd.Function and define only a forward and a backward function. The forward pass is simply the computation your custom function performs; the backward pass computes its gradient. The ctx object can be treated just like self: use it to store whatever the backward computation needs, such as the forward result or the inputs.
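As a minimal illustration, here is a small hypothetical example (an Exp function, not from the original text) that stashes the forward result on ctx and reuses it in backward:

import torch

class Exp(torch.autograd.Function):
    """Hypothetical example: y = exp(x); the forward *result* is saved for backward."""

    @staticmethod
    def forward(ctx, x):
        y = x.exp()
        ctx.save_for_backward(y)   # dy/dx = y, so the output is all backward needs
        return y

    @staticmethod
    def backward(ctx, grad_output):
        y, = ctx.saved_tensors
        return grad_output * y     # chain rule: dL/dx = dL/dy * dy/dx

x = torch.randn(5, requires_grad=True)
Exp.apply(x).sum().backward()
print(x.grad)                      # equals exp(x)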
import torch
from torch.autograd import Function
import warnings
warnings.filterwarnings("ignore")


class LinearFunction1(Function):
    """Define a custom operation in PyTorch together with its gradient computation."""

    @staticmethod
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)  # shapes: input (n, m), weight (m, n_out)
        # ctx.needs_input_grad = (False, True, True)
        output = torch.mm(input, weight)  # (n, m) @ (m, c_out)
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
            # output += torch.unsqueeze(bias, dim=0).expand_as(output)
            # output += bias  # broadcasting
        # ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_outputs):
        input, weight, bias = ctx.saved_tensors
        grad_input = None
        grad_weight = None
        grad_bias = None
        if ctx.needs_input_grad[0]:
            grad_input = grad_outputs @ (weight.t())  # (n, c_out) @ (c_out, m)
        if ctx.needs_input_grad[1]:
            grad_weight = input.t() @ grad_outputs  # (m, n) @ (n, c_out)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_outputs.sum(0)
        return grad_input, grad_weight, grad_bias


# Inherit from Function
class LinearFunction(Function):
    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())  # (20, 20) @ (20, 30) -> (20, 30)
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)  # (20, 30) @ (30, 20) -> (20, 20)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)  # (30, 20) @ (20, 20) -> (30, 20)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias
In other words, you only need to define the forward and backward passes and save whatever data they require into the context. The math inside these functions does not have to consist of PyTorch operations; any computation Python can express is fine (that is my understanding, at least).
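To illustrate that point, here is a small hypothetical sketch (my own example) whose forward and backward math runs through NumPy rather than PyTorch ops; autograd only sees the tensors going in and out, and gradcheck still passes:

import numpy as np
import torch
from torch.autograd import Function

class NumpySquare(Function):
    """Hypothetical example: y = x**2, computed in NumPy instead of PyTorch."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        y = np.square(x.detach().cpu().numpy())            # plain NumPy math
        return torch.from_numpy(y).to(x.device)

    @staticmethod
    def backward(ctx, grad_output):
        x, = ctx.saved_tensors
        grad = 2.0 * x.detach().cpu().numpy()               # dy/dx = 2x, again in NumPy
        return grad_output * torch.from_numpy(grad).to(grad_output.device)

x = torch.randn(4, dtype=torch.double, requires_grad=True)
print(torch.autograd.gradcheck(NumpySquare.apply, (x,)))    # True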
Test whether the operation is correct:
from torch.autograd import gradcheck

# gradcheck needs a plain callable, so we alias the static apply method.
linear = LinearFunction.apply
# LinearFunction expects the weight as (out_features, in_features): here (30, 20).
input = (torch.randn(20, 20, dtype=torch.double, requires_grad=True),
         torch.randn(30, 20, dtype=torch.double, requires_grad=True))
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)

linear = LinearFunction1.apply
# LinearFunction1 expects the weight as (in_features, out_features): here (20, 30).
input = (torch.randn(20, 20, dtype=torch.double, requires_grad=True),
         torch.randn(20, 30, dtype=torch.double, requires_grad=True))
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)
Both versions return True; the only thing to keep an eye on is the shape convention your custom operation expects for its inputs (LinearFunction takes the weight as (out, in), while LinearFunction1 takes it as (in, out)).
import torch
import torch.nn as nn


class Linear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        self.weight = nn.Parameter(torch.randn(input_features, output_features))
        if bias:
            self.bias = nn.Parameter(torch.randn(output_features))
        else:
            self.register_parameter("bias", None)

        # self.weight.uniform(-0.1, 0.1)
        nn.init.kaiming_uniform_(self.weight)
        if bias:
            nn.init.uniform_(self.bias, -0.1, 0.1)  # kaiming init needs a >=2-D tensor, so use a simple uniform init for the bias

    def forward(self, x):
        return LinearFunction1.apply(x, self.weight, self.bias)  # call the custom operation via .apply
A layer built on top of the custom operation behaves just like any other layer: autograd handles its gradients and its parameters can be optimized, as the short training loop below shows.
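A minimal sketch of that claim, using the custom Linear module defined above (the data and hyperparameters here are made up for illustration):

import torch
import torch.nn as nn

# Hypothetical smoke test: fit random data with the custom Linear layer.
# Note that Linear here is the module defined above, not nn.Linear.
model = nn.Sequential(Linear(20, 30), nn.ReLU(), Linear(30, 1))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

x = torch.randn(64, 20)
y = torch.randn(64, 1)

for step in range(100):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()        # gradients flow through LinearFunction1.backward
    optimizer.step()       # the custom layer's parameters are updated like any other

print(loss.item())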
The linear transformation above is a differentiable operation: its gradient can be written out step by step from the output. What about operations that are not differentiable, where the derivative cannot be expressed explicitly? In that case you specify the gradient computation yourself, for example by approximating the derivative or by substituting another black-box function for it. Since that replacement is a black box, the gradients of the values inside it are hard to compute during backpropagation, so the backward function needs a small change:
"""当某个操作是不可导的,但是你却用了近似的方法来代替。""" from torch.autograd.function import once_differentiable def un_differentibale_function(grad_output): "一些列不可导的神奇操作" grad_output_changed = None return grad_output_changed @staticmethod @once_differentiable def backward(ctx, grad_output): print(type(grad_output)) grad_output_changed = un_differentibale_function(grad_output) grad_input = grad_output_changed return grad_input
What does @once_differentiable mean? Roughly: it declares that this backward can only be differentiated once, i.e. the backward itself cannot be differentiated again. The incoming gradients are detached, and trying to build higher-order gradients through it raises an error. That is exactly what you want when the gradient is approximated or produced by a black-box surrogate rather than derived analytically.
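To see what the decorator buys you, the following sketch (reusing the hypothetical SignSTE example above) shows that first-order gradients work normally while an attempt at a second-order gradient should be rejected:

import torch

x = torch.randn(5, requires_grad=True)
y = (SignSTE.apply(x) * x).sum()                      # the upstream mul makes the incoming grad differentiable
g, = torch.autograd.grad(y, x, create_graph=True)     # first-order gradient is fine
try:
    g.sum().backward()                                # second-order gradient goes through SignSTE.backward again
except RuntimeError as err:
    print(err)   # complains about differentiating twice a function marked with @once_differentiable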