- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我正在尝试实现无梯度优化器函数,以使用 Flux.jl 与 Julia 一起训练卷积神经网络。引用论文是这样的:https://arxiv.org/abs/2005.05955 .本文提出了 RSO,一种无梯度优化算法,在采样基础上一次更新单个权重。该算法的伪代码如下图所示。
我正在使用 MNIST 数据集。
function train(; kws...)
args = Args(; kws...) # collect options in a stuct for convinience
if CUDA.functional() && args.use_cuda
@info "Training on CUDA GPU"
CUDA.allwoscalar(false)
device = gpu
else
@info "Training on CPU"
device = cpu
end
# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)
# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)
# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters
best_param = ps
if args.optimiser == "SGD"
# Regular training step with SGD
elseif args.optimiser == "RSO"
# Run RSO function and update ps
best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end
以及对应的RSO函数:
function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""
# Normalize input data to have zero mean and unit standard deviation
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)
#println("model = $(typeof(model))")
std_prep = []
σ_d = Float64[]
D = 1
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
W = Flux.params(model)
# Weight update
for _ in 1:C
d = D
while d > 0
for id in 1:length(W[d])
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(w[d][id])
# Randomly sample mini-batch
(x, l) = train_loader[rand(1:length(train_loader))]
# Sample a weight from normal distribution
ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)
loss, acc = loss_and_accuracy(data_loader, model, device)
W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
end
end
d -= 1
end
end
return W
end
这里的问题是RSO函数的第二 block 。我试图在三种情况下评估单个权重变化的损失,它们是 F(w, l, W+gW), F(w, l, W), F(w, l, W -gW)
,选择损失最小的权重集。但是我如何使用 Flux.jl 做到这一点?我尝试使用的损失函数是 logitcrossentropy(ŷ, y, agg=sum)
。为了生成 y_hat,我们应该使用 model(W),但是改变 Zygote.Params() 形式中的单个权重参数已经很具有挑战性了....
最佳答案
根据您分享的论文,您似乎需要更改每一层的每个输出神经元的权重数组。不幸的是,这意味着优化例程的实现将取决于层类型,因为卷积层的“输出神经元”与全连接层完全不同。换句话说,只是循环 Flux.params(model)
这还不够,因为这只是模型中所有权重数组的集合,并且每个权重数组根据它来自哪个层而被不同地对待。
幸运的是,如果您使用单独的函数而不是一个巨大的循环,Julia 的多重分派(dispatch)确实使这更容易编写。我将使用下面的伪代码总结该算法:
for layer in model
for output_neuron in layer
for weight_element in parameters(output_neuron)
weight_element = sample(N(0, sqrt(2 / num_outputs(layer))))
end
end
sigmas[layer] = stddev(parameters(layer))
end
for c in 1 to C
for layer in reverse(model)
for output_neuron in layer
for weight_element in parameters(output_neuron)
x, y = sample(batches)
dw = N(0, sigmas[layer])
# optimize weights
end
end
end
end
这是for output_neuron ...
我们需要隔离成单独函数的部分。
在第一个 block 中,我们实际上并没有对每个 weight_element
做任何不同的事情。 ,它们都是从相同的正态分布中采样的。所以,我们实际上不需要迭代输出神经元,但我们确实需要知道有多少。
using Statistics: std
# this function will set the weights according to the
# normal distribution and the number of output neurons
# it also returns the standard deviation of the weights
function sample_weight!(layer::Dense)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 1)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
function sample_weight!(layer::Conv)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 4)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
sigmas = map(sample_weights!, model)
现在,对于第二个 block ,我们将通过为每一层定义不同的函数来实现类似的技巧。
function optimize_layer!(loss, layer::Dense, data, sigma)
for i in 1:size(layer.weight, 1)
for j in 1:size(layer.weight, 2)
wj = layer.weight[i, j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
layer.weight[i, j] = w
losses[k] = loss(x, y)
end
layer.weight[i, j] = ws[argmin(losses)]
end
end
end
function optimize_layer!(loss, layer::Conv, data, sigma)
for i in 1:size(layer.weight, 4)
# we use a view to reference the full kernel
# for this output channel
wid = view(layer.weight, :, :, :, i)
# each index let's us treat wid like a vector
for j in eachindex(wid)
wj = wid[j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
wid[j] = w
losses[k] = loss(x, y)
end
wid[j] = ws[argmin(losses)]
end
end
end
for c in 1:C
for (layer, sigma) in reverse(zip(model, sigmas))
optimize_layer!(layer, data, sigma) do x, y
logitcrossentropy(model(x), y; agg = sum)
end
end
end
请注意,我没有在任何地方使用 Flux.params
这对我们没有帮助。另外,Flux.params
将同时包括权重和偏差,并且该论文看起来根本不在乎偏差。如果您有一种优化方法,可以对任何参数进行一般优化,而不管层类型是否相同(即像梯度下降),那么您可以使用 for p in Flux.params(model) ...
.
关于machine-learning - 助焊剂.jl : Customizing optimizer,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/70641453/
我是一名优秀的程序员,十分优秀!