@@ -1259,6 +1259,7 @@ IF(ENABLE_TORCH MATCHES "ON") | |||
ADD_SUBDIRECTORY(contrib/torch/paths) | |||
ADD_SUBDIRECTORY(contrib/torch/torch7) | |||
ADD_SUBDIRECTORY(contrib/torch/nn) | |||
ADD_SUBDIRECTORY(contrib/torch/optim) | |||
ADD_SUBDIRECTORY(contrib/torch/decisiontree) | |||
SET(WITH_TORCH 1) | |||
ELSE() |
@@ -0,0 +1,5 @@ | |||
CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) | |||
SET(src) | |||
FILE(GLOB luasrc *.lua) | |||
ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") |
@@ -0,0 +1,35 @@ | |||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) | |||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) | |||
Copyright (c) 2011-2013 NYU (Clement Farabet) | |||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) | |||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio) | |||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in the | |||
documentation and/or other materials provided with the distribution. | |||
3. Neither the names of NEC Laboratories American and IDIAP Research | |||
Institute nor the names of its contributors may be used to endorse or | |||
promote products derived from this software without specific prior | |||
written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
POSSIBILITY OF SUCH DAMAGE. |
@@ -0,0 +1,361 @@ | |||
--[[ A Confusion Matrix class | |||
Example: | |||
conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix | |||
conf:zero() -- reset matrix | |||
for i = 1,N do | |||
conf:add( neuralnet:forward(sample), label ) -- accumulate errors | |||
end | |||
print(conf) -- print matrix | |||
image.display(conf:render()) -- render matrix | |||
]] | |||
local ConfusionMatrix = torch.class('optim.ConfusionMatrix') | |||
function ConfusionMatrix:__init(nclasses, classes) | |||
if type(nclasses) == 'table' then | |||
classes = nclasses | |||
nclasses = #classes | |||
end | |||
self.mat = torch.LongTensor(nclasses,nclasses):zero() | |||
self.valids = torch.FloatTensor(nclasses):zero() | |||
self.unionvalids = torch.FloatTensor(nclasses):zero() | |||
self.nclasses = nclasses | |||
self.totalValid = 0 | |||
self.averageValid = 0 | |||
self.classes = classes or {} | |||
-- buffers | |||
self._mat_flat = self.mat:view(-1) | |||
self._target = torch.FloatTensor() | |||
self._prediction = torch.FloatTensor() | |||
self._max = torch.FloatTensor() | |||
self._pred_idx = torch.LongTensor() | |||
self._targ_idx = torch.LongTensor() | |||
end | |||
-- takes scalar prediction and target as input | |||
function ConfusionMatrix:_add(p, t) | |||
assert(p and type(p) == 'number') | |||
assert(t and type(t) == 'number') | |||
-- non-positive values are considered missing | |||
-- and therefore ignored | |||
if t > 0 then | |||
self.mat[t][p] = self.mat[t][p] + 1 | |||
end | |||
end | |||
function ConfusionMatrix:add(prediction, target) | |||
if type(prediction) == 'number' then | |||
-- comparing numbers | |||
self:_add(prediction, target) | |||
else | |||
self._prediction:resize(prediction:size()):copy(prediction) | |||
assert(prediction:dim() == 1) | |||
if type(target) == 'number' then | |||
-- prediction is a vector, so target is assumed to be an index
self._max:max(self._pred_idx, self._prediction, 1) | |||
self:_add(self._pred_idx[1], target) | |||
else | |||
-- both prediction and target are vectors | |||
assert(target:dim() == 1) | |||
self._target:resize(target:size()):copy(target) | |||
self._max:max(self._targ_idx, self._target, 1) | |||
self._max:max(self._pred_idx, self._prediction, 1) | |||
self:_add(self._pred_idx[1], self._targ_idx[1]) | |||
end | |||
end | |||
end | |||
function ConfusionMatrix:batchAdd(predictions, targets) | |||
local preds, targs, __ | |||
self._prediction:resize(predictions:size()):copy(predictions) | |||
if predictions:dim() == 1 then | |||
-- predictions is a vector of classes | |||
preds = self._prediction | |||
elseif predictions:dim() == 2 then | |||
-- prediction is a matrix of class likelihoods | |||
if predictions:size(2) == 1 then | |||
-- or prediction just needs flattening | |||
preds = self._prediction:select(2,1) | |||
else | |||
self._max:max(self._pred_idx, self._prediction, 2) | |||
preds = self._pred_idx:select(2,1) | |||
end | |||
else | |||
error("predictions has invalid number of dimensions") | |||
end | |||
self._target:resize(targets:size()):copy(targets) | |||
if targets:dim() == 1 then | |||
-- targets is a vector of classes | |||
targs = self._target | |||
elseif targets:dim() == 2 then | |||
-- targets is a matrix of one-hot rows | |||
if targets:size(2) == 1 then | |||
-- or targets just needs flattening | |||
targs = self._target:select(2,1) | |||
else | |||
self._max:max(self._targ_idx, self._target, 2) | |||
targs = self._targ_idx:select(2,1) | |||
end | |||
else | |||
error("targets has invalid number of dimensions") | |||
end | |||
-- non-positive values are considered missing and therefore ignored | |||
local mask = targs:ge(1) | |||
targs = targs[mask] | |||
preds = preds[mask] | |||
self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility | |||
preds = preds:typeAs(targs) | |||
assert(self.mat:isContiguous() and self.mat:stride(2) == 1) | |||
local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) | |||
local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) | |||
self._mat_flat:indexAdd(1, indices, ones) | |||
end | |||
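--[[ Illustrative batch usage sketch ('output' and 'targets' are placeholder names):
'output' is an N x nclasses tensor of class scores and 'targets' is an N-element
vector of class indices (or an N x nclasses one-hot matrix).
   conf:batchAdd(output, targets)
   conf:updateValids()
   print(conf.totalValid * 100 .. '% globally correct')
--]]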
function ConfusionMatrix:zero() | |||
self.mat:zero() | |||
self.valids:zero() | |||
self.unionvalids:zero() | |||
self.totalValid = 0 | |||
self.averageValid = 0 | |||
end | |||
local function isNaN(number) | |||
return number ~= number | |||
end | |||
function ConfusionMatrix:updateValids() | |||
local total = 0 | |||
for t = 1,self.nclasses do | |||
self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() | |||
self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) | |||
total = total + self.mat[t][t] | |||
end | |||
self.totalValid = total / self.mat:sum() | |||
self.averageValid = 0 | |||
self.averageUnionValid = 0 | |||
local nvalids = 0 | |||
local nunionvalids = 0 | |||
for t = 1,self.nclasses do | |||
if not isNaN(self.valids[t]) then | |||
self.averageValid = self.averageValid + self.valids[t] | |||
nvalids = nvalids + 1 | |||
end | |||
if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then | |||
self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] | |||
nunionvalids = nunionvalids + 1 | |||
end | |||
end | |||
self.averageValid = self.averageValid / nvalids | |||
self.averageUnionValid = self.averageUnionValid / nunionvalids | |||
end | |||
-- Calculating FAR/FRR associated with the confusion matrix | |||
function ConfusionMatrix:farFrr() | |||
local cmat = self.mat | |||
local noOfClasses = cmat:size()[1] | |||
self._frrs = self._frrs or torch.zeros(noOfClasses) | |||
self._frrs:zero() | |||
self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) | |||
self._classFrrs:zero() | |||
self._classFrrs:add(-1) | |||
self._fars = self._fars or torch.zeros(noOfClasses) | |||
self._fars:zero() | |||
self._classFars = self._classFars or torch.zeros(noOfClasses) | |||
self._classFars:zero() | |||
self._classFars:add(-1) | |||
local classSamplesCount = cmat:sum(2) | |||
local indx = 1 | |||
for i=1,noOfClasses do | |||
if classSamplesCount[i][1] ~= 0 then | |||
self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] | |||
self._classFrrs[i] = self._frrs[indx] | |||
-- Calculating FARs | |||
local farNumerator = 0 | |||
local farDenominator = 0 | |||
for j=1, noOfClasses do | |||
if i ~= j then | |||
if classSamplesCount[j][1] ~= 0 then | |||
farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] | |||
farDenominator = farDenominator + 1 | |||
end | |||
end | |||
end | |||
self._fars[indx] = farNumerator/farDenominator | |||
self._classFars[i] = self._fars[indx] | |||
indx = indx + 1 | |||
end | |||
end | |||
indx = indx - 1 | |||
local returnFrrs = self._frrs[{{1, indx}}] | |||
local returnFars = self._fars[{{1, indx}}] | |||
return self._classFrrs, self._classFars, returnFrrs, returnFars | |||
end | |||
local function log10(n) | |||
if math.log10 then | |||
return math.log10(n) | |||
else | |||
return math.log(n) / math.log(10) | |||
end | |||
end | |||
function ConfusionMatrix:__tostring__() | |||
self:updateValids() | |||
local str = {'ConfusionMatrix:\n'} | |||
local nclasses = self.nclasses | |||
table.insert(str, '[') | |||
local maxCnt = self.mat:max() | |||
local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) | |||
for t = 1,nclasses do | |||
local pclass = self.valids[t] * 100 | |||
pclass = string.format('%2.3f', pclass) | |||
if t == 1 then | |||
table.insert(str, '[') | |||
else | |||
table.insert(str, ' [') | |||
end | |||
for p = 1,nclasses do | |||
table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p])) | |||
end | |||
if self.classes and self.classes[1] then | |||
if t == nclasses then | |||
table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') | |||
else | |||
table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') | |||
end | |||
else | |||
if t == nclasses then | |||
table.insert(str, ']] ' .. pclass .. '% \n') | |||
else | |||
table.insert(str, '] ' .. pclass .. '% \n') | |||
end | |||
end | |||
end | |||
table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') | |||
table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') | |||
table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%') | |||
return table.concat(str) | |||
end | |||
function ConfusionMatrix:render(sortmode, display, block, legendwidth) | |||
-- args | |||
local confusion = self.mat:double() | |||
local classes = self.classes | |||
local sortmode = sortmode or 'score' -- 'score' or 'occurrence' | |||
local block = block or 25 | |||
local legendwidth = legendwidth or 200 | |||
local display = display or false | |||
-- legends | |||
local legend = { | |||
['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', | |||
['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' | |||
} | |||
-- parse matrix / normalize / count scores | |||
local diag = torch.FloatTensor(#classes) | |||
local freqs = torch.FloatTensor(#classes) | |||
local unconf = confusion | |||
local confusion = confusion:clone() | |||
local corrects = 0 | |||
local total = 0 | |||
for target = 1,#classes do | |||
freqs[target] = confusion[target]:sum() | |||
corrects = corrects + confusion[target][target] | |||
total = total + freqs[target] | |||
confusion[target]:div( math.max(confusion[target]:sum(),1) ) | |||
diag[target] = confusion[target][target] | |||
end | |||
-- accuracies | |||
local accuracy = corrects / total * 100 | |||
local perclass = 0 | |||
local total = 0 | |||
for target = 1,#classes do | |||
if confusion[target]:sum() > 0 then | |||
perclass = perclass + diag[target] | |||
total = total + 1 | |||
end | |||
end | |||
perclass = perclass / total * 100 | |||
freqs:div(unconf:sum()) | |||
-- sort matrix | |||
if sortmode == 'score' then | |||
_,order = torch.sort(diag,1,true) | |||
elseif sortmode == 'occurrence' then | |||
_,order = torch.sort(freqs,1,true) | |||
else | |||
error('sort mode must be one of: score | occurrence') | |||
end | |||
-- render matrix | |||
local render = torch.zeros(#classes*block, #classes*block) | |||
for target = 1,#classes do | |||
for prediction = 1,#classes do | |||
render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] | |||
end | |||
end | |||
-- add grid | |||
for target = 1,#classes do | |||
render[{ {target*block},{} }] = 0.1 | |||
render[{ {},{target*block} }] = 0.1 | |||
end | |||
-- create rendering | |||
require 'image' | |||
require 'qtwidget' | |||
require 'qttorch' | |||
local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) | |||
image.display{image=render, win=win1} | |||
-- add legend | |||
for i in ipairs(classes) do | |||
-- background cell | |||
win1:setcolor{r=0,g=0,b=0} | |||
win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) | |||
win1:fill() | |||
-- % | |||
win1:setfont(qt.QFont{serif=false, size=fontsize}) | |||
local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 | |||
win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} | |||
win1:moveto((#render)[2]+10,i*block-block/3) | |||
win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) | |||
-- legend | |||
win1:setfont(qt.QFont{serif=false, size=fontsize}) | |||
local gscale = diag[order[i]]*0.8+0.2 | |||
win1:setcolor{r=gscale,g=gscale,b=gscale} | |||
win1:moveto(120+(#render)[2]+10,i*block-block/3) | |||
win1:show(classes[order[i]]) | |||
for j in ipairs(classes) do | |||
-- scores | |||
local score = confusion[order[j]][order[i]] | |||
local gscale = (1-score)*(score*0.8+0.2) | |||
win1:setcolor{r=gscale,g=gscale,b=gscale} | |||
win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) | |||
win1:show(string.format('%02.0f',math.floor(score*100+0.5))) | |||
end | |||
end | |||
-- generate tensor | |||
local t = win1:image():toTensor() | |||
-- display | |||
if display then | |||
image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} | |||
end | |||
-- return rendering | |||
return t | |||
end |
@@ -0,0 +1,190 @@ | |||
--[[ Logger: a simple class to log symbols during training, | |||
and automate plot generation | |||
Example: | |||
logger = optim.Logger('somefile.log') -- file to save stuff | |||
for i = 1,N do -- log some symbols during | |||
train_error = ... -- training/testing | |||
test_error = ... | |||
logger:add{['training error'] = train_error, | |||
['test error'] = test_error} | |||
end | |||
logger:style{['training error'] = '-', -- define styles for plots | |||
['test error'] = '-'} | |||
logger:plot() -- and plot | |||
---- OR --- | |||
logger = optim.Logger('somefile.log') -- file to save stuff | |||
logger:setNames{'training error', 'test error'} | |||
for i = 1,N do -- log some symbols during | |||
train_error = ... -- training/testing | |||
test_error = ... | |||
logger:add{train_error, test_error} | |||
end | |||
logger:style{'-', '-'} -- define styles for plots | |||
logger:plot() -- and plot | |||
----------- | |||
logger:setlogscale(true) -- enable logscale on Y-axis | |||
logger:plot() -- and plot | |||
]] | |||
require 'xlua' | |||
local Logger = torch.class('optim.Logger') | |||
function Logger:__init(filename, timestamp) | |||
if filename then | |||
self.name = filename | |||
os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') | |||
if timestamp then | |||
-- append timestamp to create unique log file | |||
filename = filename .. '-'..os.date("%Y_%m_%d_%X") | |||
end | |||
self.file = io.open(filename,'w') | |||
self.epsfile = self.name .. '.eps' | |||
else | |||
self.file = io.stdout | |||
self.name = 'stdout' | |||
print('<Logger> warning: no path provided, logging to std out') | |||
end | |||
self.empty = true | |||
self.symbols = {} | |||
self.styles = {} | |||
self.names = {} | |||
self.idx = {} | |||
self.figure = nil | |||
self.showPlot = true | |||
self.plotRawCmd = nil | |||
self.defaultStyle = '+' | |||
self.logscale = false | |||
end | |||
function Logger:setNames(names) | |||
self.names = names | |||
self.empty = false | |||
self.nsymbols = #names | |||
for k,key in pairs(names) do | |||
self.file:write(key .. '\t') | |||
self.symbols[k] = {} | |||
self.styles[k] = {self.defaultStyle} | |||
self.idx[key] = k | |||
end | |||
self.file:write('\n') | |||
self.file:flush() | |||
return self | |||
end | |||
function Logger:add(symbols) | |||
-- (1) first time ? print symbols' names on first row | |||
if self.empty then | |||
self.empty = false | |||
self.nsymbols = #symbols | |||
for k,val in pairs(symbols) do | |||
self.file:write(k .. '\t') | |||
self.symbols[k] = {} | |||
self.styles[k] = {self.defaultStyle} | |||
self.names[k] = k | |||
end | |||
self.idx = self.names | |||
self.file:write('\n') | |||
end | |||
-- (2) print all symbols on one row | |||
for k,val in pairs(symbols) do | |||
if type(val) == 'number' then | |||
self.file:write(string.format('%11.4e',val) .. '\t') | |||
elseif type(val) == 'string' then | |||
self.file:write(val .. '\t') | |||
else | |||
xlua.error('can only log numbers and strings', 'Logger') | |||
end | |||
end | |||
self.file:write('\n') | |||
self.file:flush() | |||
-- (3) save symbols in internal table | |||
for k,val in pairs(symbols) do | |||
table.insert(self.symbols[k], val) | |||
end | |||
end | |||
function Logger:style(symbols) | |||
for name,style in pairs(symbols) do | |||
if type(style) == 'string' then | |||
self.styles[name] = {style} | |||
elseif type(style) == 'table' then | |||
self.styles[name] = style | |||
else | |||
xlua.error('style should be a string or a table of strings','Logger') | |||
end | |||
end | |||
return self | |||
end | |||
function Logger:setlogscale(state) | |||
self.logscale = state | |||
end | |||
function Logger:display(state) | |||
self.showPlot = state | |||
end | |||
function Logger:plot(...) | |||
if not xlua.require('gnuplot') then | |||
if not self.warned then | |||
print('<Logger> warning: cannot plot with this version of Torch') | |||
self.warned = true | |||
end | |||
return | |||
end | |||
local plotit = false | |||
local plots = {} | |||
local plotsymbol = | |||
function(name,list) | |||
if #list > 1 then | |||
local nelts = #list | |||
local plot_y = torch.Tensor(nelts) | |||
for i = 1,nelts do | |||
plot_y[i] = list[i] | |||
end | |||
for _,style in ipairs(self.styles[name]) do | |||
table.insert(plots, {self.names[name], plot_y, style}) | |||
end | |||
plotit = true | |||
end | |||
end | |||
local args = {...} | |||
if not args[1] then -- plot all symbols | |||
for name,list in pairs(self.symbols) do | |||
plotsymbol(name,list) | |||
end | |||
else -- plot given symbols | |||
for _,name in ipairs(args) do | |||
plotsymbol(self.idx[name], self.symbols[self.idx[name]]) | |||
end | |||
end | |||
if plotit then | |||
if self.showPlot then | |||
self.figure = gnuplot.figure(self.figure) | |||
if self.logscale then gnuplot.logscale('on') end | |||
gnuplot.plot(plots) | |||
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end | |||
gnuplot.grid('on') | |||
gnuplot.title('<Logger::' .. self.name .. '>') | |||
end | |||
if self.epsfile then | |||
os.execute('rm -f "' .. self.epsfile .. '"') | |||
local epsfig = gnuplot.epsfigure(self.epsfile) | |||
if self.logscale then gnuplot.logscale('on') end | |||
gnuplot.plot(plots) | |||
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end | |||
gnuplot.grid('on') | |||
gnuplot.title('<Logger::' .. self.name .. '>') | |||
gnuplot.plotflush() | |||
gnuplot.close(epsfig) | |||
end | |||
end | |||
end |
@@ -0,0 +1,55 @@ | |||
--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `config` : a table of hyper-parameters | |||
- `config.rho` : interpolation parameter | |||
- `config.eps` : for numerical stability | |||
- `config.weightDecay` : weight decay | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.paramVariance` : vector of temporal variances of parameters | |||
- `state.accDelta` : vector of accumulated delta of gradients
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
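--[[ Illustrative usage sketch (placeholder names: 'feval' computes the loss and
gradient for the current minibatch, 'params' is a flattened parameter tensor,
e.g. from module:getParameters()). Reuse the same state table on every call so
that paramVariance and accDelta persist:
   local state = {rho = 0.9, eps = 1e-6}
   for i = 1, numIterations do
      local _, fs = optim.adadelta(feval, params, state)
      -- fs[1] is the loss evaluated before the update
   end
--]]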
function optim.adadelta(opfunc, x, config, state) | |||
-- (0) get/update state | |||
if config == nil and state == nil then | |||
print('no state table, ADADELTA initializing') | |||
end | |||
local config = config or {} | |||
local state = state or config | |||
local rho = config.rho or 0.9 | |||
local eps = config.eps or 1e-6 | |||
local wd = config.weightDecay or 0 | |||
state.evalCounter = state.evalCounter or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) parameter update | |||
if not state.paramVariance then | |||
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
end | |||
state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) | |||
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() | |||
state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) | |||
x:add(-1, state.delta) | |||
state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) | |||
-- (4) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
@@ -0,0 +1,55 @@ | |||
--[[ ADAGRAD implementation for SGD | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.learningRate` : learning rate | |||
- `state.paramVariance` : vector of temporal variances of parameters | |||
- `state.weightDecay` : scalar that controls weight decay | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
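--[[ Illustrative sketch on a toy problem (all names below are placeholders):
minimize f(x) = 0.5 * ||x||^2, whose gradient is x itself; the same table is
passed as both config and state, which is the common single-table calling pattern.
   local x = torch.randn(10)
   local feval = function(x) return 0.5 * x:dot(x), x:clone() end
   local state = {learningRate = 0.1}
   for i = 1, 100 do
      optim.adagrad(feval, x, state)
   end
--]]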
function optim.adagrad(opfunc, x, config, state) | |||
-- (0) get/update state | |||
if config == nil and state == nil then | |||
print('no state table, ADAGRAD initializing') | |||
end | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay with a single parameter | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (4) parameter update with single or individual learning rates | |||
if not state.paramVariance then | |||
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.paramVariance:addcmul(1,dfdx,dfdx) | |||
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() | |||
x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) | |||
-- (5) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
@@ -0,0 +1,72 @@ | |||
--[[ An implementation of Adam https://arxiv.org/abs/1412.6980 | |||
ARGS: | |||
- 'opfunc' : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point | |||
- 'config` : a table with configuration parameters for the optimizer | |||
- 'config.learningRate' : learning rate | |||
- `config.learningRateDecay` : learning rate decay | |||
- 'config.beta1' : first moment coefficient | |||
- 'config.beta2' : second moment coefficient | |||
- 'config.epsilon' : for numerical stability | |||
- 'config.weightDecay' : weight decay | |||
- 'state' : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
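--[[ Illustrative training-loop sketch ('feval', 'params' and 'maxIter' are
placeholders, as in the other optimizers). The state table keeps the moment
estimates m, v and the step counter t, so the same table must be passed on
every iteration:
   local optimState = {learningRate = 1e-3, beta1 = 0.9, beta2 = 0.999}
   for iter = 1, maxIter do
      local _, fs = optim.adam(feval, params, optimState)
   end
--]]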
function optim.adam(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 0.001 | |||
local lrd = config.learningRateDecay or 0 | |||
local beta1 = config.beta1 or 0.9 | |||
local beta2 = config.beta2 or 0.999 | |||
local epsilon = config.epsilon or 1e-8 | |||
local wd = config.weightDecay or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- Initialization | |||
state.t = state.t or 0 | |||
-- Exponential moving average of gradient values | |||
state.m = state.m or x.new(dfdx:size()):zero() | |||
-- Exponential moving average of squared gradient values | |||
state.v = state.v or x.new(dfdx:size()):zero() | |||
-- A tmp tensor to hold the sqrt(v) + epsilon | |||
state.denom = state.denom or x.new(dfdx:size()):zero() | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + state.t*lrd) | |||
state.t = state.t + 1 | |||
-- Decay the first and second moment running average coefficient | |||
state.m:mul(beta1):add(1-beta1, dfdx) | |||
state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) | |||
state.denom:copy(state.v):sqrt():add(epsilon) | |||
local biasCorrection1 = 1 - beta1^state.t | |||
local biasCorrection2 = 1 - beta2^state.t | |||
local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1 | |||
-- (4) update x | |||
x:addcdiv(-stepSize, state.m, state.denom) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
@@ -0,0 +1,66 @@ | |||
--[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf | |||
ARGS: | |||
- 'opfunc' : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point | |||
- 'config` : a table with configuration parameters for the optimizer | |||
- 'config.learningRate' : learning rate | |||
- 'config.beta1' : first moment coefficient | |||
- 'config.beta2' : second moment coefficient | |||
- 'config.epsilon' : for numerical stability | |||
- 'state' : a table describing the state of the optimizer; | |||
after each call the state is modified. | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
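--[[ Illustrative sketch: same calling convention as optim.adam; here the
hyper-parameters (config) and the evolving state are kept in separate tables
('feval' and 'params' are placeholders):
   local config = {learningRate = 2e-3, beta1 = 0.9, beta2 = 0.999}
   local state = {}
   optim.adamax(feval, params, config, state)
--]]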
function optim.adamax(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 0.002 | |||
local beta1 = config.beta1 or 0.9 | |||
local beta2 = config.beta2 or 0.999 | |||
local epsilon = config.epsilon or 1e-38 | |||
local wd = config.weightDecay or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- Initialization | |||
state.t = state.t or 0 | |||
-- Exponential moving average of gradient values | |||
state.m = state.m or x.new(dfdx:size()):zero() | |||
-- Exponential moving average of the infinity norm | |||
state.u = state.u or x.new(dfdx:size()):zero() | |||
-- A tmp tensor to hold the input to max() | |||
state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() | |||
state.t = state.t + 1 | |||
-- Update biased first moment estimate. | |||
state.m:mul(beta1):add(1-beta1, dfdx) | |||
-- Update the exponentially weighted infinity norm. | |||
state.max[1]:copy(state.u):mul(beta2) | |||
state.max[2]:copy(dfdx):abs():add(epsilon) | |||
state.u:max(state.max, 1) | |||
local biasCorrection1 = 1 - beta1^state.t | |||
local stepSize = lr/biasCorrection1 | |||
-- (2) update x | |||
x:addcdiv(-stepSize, state.m, state.u) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
@@ -0,0 +1,73 @@ | |||
--[[ An implementation of ASGD | |||
ASGD: | |||
x := (1 - lambda eta_t) x - eta_t df/dx(z,x) | |||
a := a + mu_t [ x - a ] | |||
eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 | |||
mu_t = 1/max(1,t-t0) | |||
implements the ASGD algorithm as in L. Bottou's sgd-2.0
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.eta0` : learning rate | |||
- `state.lambda` : decay term | |||
- `state.alpha` : power for eta update | |||
- `state.t0` : point at which to start averaging | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
- `ax` : the averaged x vector | |||
(Clement Farabet, 2012) | |||
--]] | |||
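--[[ Illustrative sketch ('feval' and 'params' are placeholders). Note the third
return value: the averaged iterate ax, which is usually what you evaluate with
once averaging has started (t > t0):
   local state = {eta0 = 1e-2, lambda = 1e-4, t0 = 1e4}
   local newx, fs, ax = optim.asgd(feval, params, state)
--]]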
function optim.asgd(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
config.eta0 = config.eta0 or 1e-4 | |||
config.lambda = config.lambda or 1e-4 | |||
config.alpha = config.alpha or 0.75 | |||
config.t0 = config.t0 or 1e6 | |||
-- (hidden state) | |||
state.eta_t = state.eta_t or config.eta0 | |||
state.mu_t = state.mu_t or 1 | |||
state.t = state.t or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) decay term | |||
x:mul(1 - config.lambda*state.eta_t) | |||
-- (3) update x | |||
x:add(-state.eta_t, dfdx) | |||
-- (4) averaging | |||
state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() | |||
state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) | |||
if state.mu_t ~= 1 then | |||
state.tmp:copy(x) | |||
state.tmp:add(-1,state.ax):mul(state.mu_t) | |||
state.ax:add(state.tmp) | |||
else | |||
state.ax:copy(x) | |||
end | |||
-- (5) update eta_t and mu_t | |||
state.t = state.t + 1 | |||
state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) | |||
state.mu_t = 1 / math.max(1, state.t - config.t0) | |||
-- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) | |||
return x,{fx},state.ax | |||
end |
@@ -0,0 +1,208 @@ | |||
--[[ | |||
This cg implementation is a rewrite of minimize.m written by Carl | |||
E. Rasmussen. It is supposed to produce exactly the same results (give
or take numerical accuracy due to some changed order of
operations). You can compare the result on rosenbrock with minimize.m.
http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html | |||
[x fx c] = minimize([0 0]', 'rosenbrock', -25) | |||
Note that we limit only the number of function evaluations, since this seems much
more important in practical use.
ARGS: | |||
- `opfunc` : a function that takes a single input, the point of evaluation. | |||
- `x` : the initial point | |||
- `state` : a table of parameters and temporary allocations. | |||
- `state.maxEval` : max number of function evaluations | |||
- `state.maxIter` : max number of iterations | |||
- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage | |||
- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage | |||
RETURN: | |||
- `x*` : the new x vector, at the optimal point | |||
- `f` : a table of all function values where | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at x* | |||
(Koray Kavukcuoglu, 2012) | |||
--]] | |||
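--[[ Illustrative sketch: cg expects a deterministic (full-batch) opfunc and is
driven mainly by maxEval. 'rosenbrock' is a placeholder for any function that
returns f(x) and df/dx:
   local x = torch.Tensor(2):zero()
   local state = {maxEval = 25}
   local xstar, fx, neval = optim.cg(rosenbrock, x, state)
   print('initial f = ' .. fx[1] .. ', final f = ' .. fx[#fx])
--]]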
function optim.cg(opfunc, x, config, state) | |||
-- parameters | |||
local config = config or {} | |||
local state = state or config | |||
local rho = config.rho or 0.01 | |||
local sig = config.sig or 0.5 | |||
local int = config.int or 0.1 | |||
local ext = config.ext or 3.0 | |||
local maxIter = config.maxIter or 20 | |||
local ratio = config.ratio or 100 | |||
local maxEval = config.maxEval or maxIter*1.25 | |||
local red = 1 | |||
local verbose = config.verbose or 0 | |||
local i = 0 | |||
local ls_failed = 0 | |||
local fx = {} | |||
-- we need three points for the interpolation/extrapolation stuff | |||
local z1,z2,z3 = 0,0,0 | |||
local d1,d2,d3 = 0,0,0 | |||
local f1,f2,f3 = 0,0,0 | |||
local df1 = state.df1 or x.new() | |||
local df2 = state.df2 or x.new() | |||
local df3 = state.df3 or x.new() | |||
local tdf | |||
df1:resizeAs(x) | |||
df2:resizeAs(x) | |||
df3:resizeAs(x) | |||
-- search direction | |||
local s = state.s or x.new() | |||
s:resizeAs(x) | |||
-- we need a temp storage for X | |||
local x0 = state.x0 or x.new() | |||
local f0 = 0 | |||
local df0 = state.df0 or x.new() | |||
x0:resizeAs(x) | |||
df0:resizeAs(x) | |||
-- evaluate at initial point | |||
f1,tdf = opfunc(x) | |||
fx[#fx+1] = f1 | |||
df1:copy(tdf) | |||
i=i+1 | |||
-- initial search direction | |||
s:copy(df1):mul(-1) | |||
d1 = -s:dot(s ) -- slope | |||
z1 = red/(1-d1) -- initial step | |||
while i < math.abs(maxEval) do | |||
x0:copy(x) | |||
f0 = f1 | |||
df0:copy(df1) | |||
x:add(z1,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
d2 = df2:dot(s) | |||
f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 | |||
local m = math.min(maxIter,maxEval-i) | |||
local success = 0 | |||
local limit = -1 | |||
while true do | |||
while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do | |||
limit = z1 | |||
if f2 > f1 then | |||
z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) | |||
else | |||
local A = 6*(f2-f3)/z3+3*(d2+d3) | |||
local B = 3*(f3-f2)-z3*(d3+2*d2) | |||
z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A | |||
end | |||
if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then | |||
z2 = z3/2; | |||
end | |||
z2 = math.max(math.min(z2, int*z3),(1-int)*z3); | |||
z1 = z1 + z2; | |||
x:add(z2,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
m = m - 1 | |||
d2 = df2:dot(s) | |||
z3 = z3-z2; | |||
end | |||
if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then | |||
break | |||
elseif d2 > sig*d1 then | |||
success = 1; | |||
break; | |||
elseif m == 0 then | |||
break; | |||
end | |||
local A = 6*(f2-f3)/z3+3*(d2+d3); | |||
local B = 3*(f3-f2)-z3*(d3+2*d2); | |||
z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) | |||
if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then | |||
if limit < -0.5 then | |||
z2 = z1 * (ext -1) | |||
else | |||
z2 = (limit-z1)/2 | |||
end | |||
elseif (limit > -0.5) and (z2+z1) > limit then | |||
z2 = (limit-z1)/2 | |||
elseif limit < -0.5 and (z2+z1) > z1*ext then | |||
z2 = z1*(ext-1) | |||
elseif z2 < -z3*int then | |||
z2 = -z3*int | |||
elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then | |||
z2 = (limit-z1)*(1-int) | |||
end | |||
f3=f2; d3=d2; z3=-z2; | |||
z1 = z1+z2; | |||
x:add(z2,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
m = m - 1 | |||
d2 = df2:dot(s) | |||
end | |||
if success == 1 then | |||
f1 = f2 | |||
fx[#fx+1] = f1; | |||
local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) | |||
s:mul(ss) | |||
s:add(-1,df2) | |||
local tmp = df1:clone() | |||
df1:copy(df2) | |||
df2:copy(tmp) | |||
d2 = df1:dot(s) | |||
if d2> 0 then | |||
s:copy(df1) | |||
s:mul(-1) | |||
d2 = -s:dot(s) | |||
end | |||
z1 = z1 * math.min(ratio, d1/(d2-1e-320)) | |||
d1 = d2 | |||
ls_failed = 0 | |||
else | |||
x:copy(x0) | |||
f1 = f0 | |||
df1:copy(df0) | |||
if ls_failed or i>maxEval then | |||
break | |||
end | |||
local tmp = df1:clone() | |||
df1:copy(df2) | |||
df2:copy(tmp) | |||
s:copy(df1) | |||
s:mul(-1) | |||
d1 = -s:dot(s) | |||
z1 = 1/(1-d1) | |||
ls_failed = 1 | |||
end | |||
end | |||
state.df0 = df0 | |||
state.df1 = df1 | |||
state.df2 = df2 | |||
state.df3 = df3 | |||
state.x0 = x0 | |||
state.s = s | |||
return x,fx,i | |||
end |
@@ -0,0 +1,52 @@ | |||
--[[ An implementation of a simple numerical gradient checker. | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `eps` : the epsilon to use for the numerical check (default is 1e-7) | |||
RETURN: | |||
- `diff` : relative error of the gradient; should be very close to 0
- `dC` : exact gradient at the point
- `dC_est` : numerically estimated gradient at the point
]]-- | |||
-- numerically check the gradient of an arbitrary objective function:
function optim.checkgrad(opfunc, x, eps) | |||
-- compute true gradient: | |||
local Corg,dC = opfunc(x) | |||
dC:resize(x:size()) | |||
local Ctmp -- temporary value | |||
local isTensor = torch.isTensor(Corg) | |||
if isTensor then | |||
Ctmp = Corg.new(Corg:size()) | |||
end | |||
-- compute numeric approximations to gradient: | |||
local eps = eps or 1e-7 | |||
local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) | |||
for i = 1,dC:size(1) do | |||
local tmp = x[i] | |||
x[i] = x[i] + eps | |||
local C1 = opfunc(x) | |||
if isTensor then | |||
Ctmp:copy(C1) | |||
C1 = Ctmp | |||
end | |||
x[i] = x[i] - 2 * eps | |||
local C2 = opfunc(x) | |||
x[i] = tmp | |||
dC_est[i] = (C1 - C2) / (2 * eps) | |||
end | |||
-- estimate error of gradient: | |||
local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) | |||
return diff,dC,dC_est | |||
end |
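--[[ Illustrative sketch on a toy objective (placeholder names): f(x) = 0.5*||x||^2
has gradient x, so the reported relative error should be tiny:
   local feval = function(x) return 0.5 * x:dot(x), x:clone() end
   local x = torch.randn(5)
   local err, dC, dC_est = optim.checkgrad(feval, x, 1e-6)
   print('relative gradient error: ' .. err)
--]]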
@@ -0,0 +1,270 @@ | |||
require 'torch' | |||
require 'math' | |||
local BestSolution = {} | |||
--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), | |||
ported from https://www.lri.fr/~hansen/barecmaes2.html. | |||
Parameters | |||
---------- | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX. Note that df/dX is not used | |||
- `x` : the initial point | |||
- `state.sigma` | |||
float, initial step-size (standard deviation in each | |||
coordinate) | |||
- `state.maxEval` | |||
int, maximal number of function evaluations | |||
- `state.ftarget` | |||
float, target function value | |||
- `state.popsize` | |||
population size. If this is left empty, | |||
4 + int(3 * log(|x|)) will be used | |||
- `state.ftarget` | |||
stop if fitness < ftarget | |||
- `state.verb_disp` | |||
int, display on console every verb_disp iteration, 0 for never | |||
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
--]] | |||
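--[[ Illustrative sketch (placeholder objective): CMA-ES uses only function values,
so opfunc need not return a gradient. sigma must be supplied; verb_disp = 0
keeps the console quiet:
   local x = torch.DoubleTensor(5):fill(1)
   local state = {sigma = 0.5, maxEval = 5000, verb_disp = 0}
   local xstar, fhist = optim.cmaes(function(x) return x:dot(x) end, x, state)
--]]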
function optim.cmaes(opfunc, x, config, state) | |||
if (x.triu == nil or x.diag == nil) then | |||
error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x") | |||
end | |||
-- process input parameters | |||
local config = config or {} | |||
local state = state or config | |||
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy | |||
local N = xmean:size(1) -- number of objective variables/problem dimension | |||
local sigma = state.sigma -- coordinate wise standard deviation (step size) | |||
local ftarget = state.ftarget -- stop if fitness < ftarget | |||
local maxEval = tonumber(state.maxEval) or 1e3*N^2 | |||
local objfunc = opfunc | |||
local verb_disp = state.verb_disp -- display step size | |||
local min_iterations = state.min_iterations or 1 | |||
local lambda = state.popsize -- population size, offspring number | |||
-- Strategy parameter setting: Selection | |||
if state.popsize == nil then | |||
lambda = 4 + math.floor(3 * math.log(N)) | |||
end | |||
local mu = lambda / 2 -- number of parents/points for recombination | |||
local weights = torch.range(0,mu-1):apply(function(i) | |||
return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights | |||
weights:div(weights:sum()) -- normalize recombination weights array | |||
local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i | |||
weights = weights:typeAs(x) | |||
-- Strategy parameter setting: Adaptation | |||
local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C | |||
local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control | |||
local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C | |||
local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update | |||
local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 | |||
-- Initialize dynamic (internal) state variables | |||
local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C | |||
local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma | |||
local B = torch.eye(N):typeAs(x) -- B defines the coordinate system | |||
local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling | |||
local C = torch.eye(N):typeAs(x) -- covariance matrix | |||
if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig | |||
error('torch.symeig not available for ' .. x:type() .. | |||
" please use Float- or DoubleTensor for x") | |||
end | |||
local candidates = torch.Tensor(lambda,N):typeAs(x) | |||
local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 | |||
local eigeneval = 0 -- tracking the update of B and D | |||
local counteval = 0 | |||
local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination | |||
local fitvals = torch.Tensor(lambda)-- fitness values | |||
local best = BestSolution.new(nil,nil,counteval) | |||
local iteration = 0 -- iteration of the optimize loop | |||
local function ask() | |||
--[[return a list of lambda candidate solutions according to | |||
m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) | |||
--]] | |||
-- Eigendecomposition: first update B, D and invsqrtC from C | |||
-- postponed here so that the amortized cost stays O(N^2)
if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then | |||
eigeneval = counteval | |||
C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry | |||
D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) | |||
D = torch.sqrt(D) -- D contains standard deviations now | |||
invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) | |||
end | |||
for k=1,lambda do --repeat lambda times | |||
local z = D:clone():normal(0,1):cmul(D) | |||
candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) | |||
end | |||
return candidates | |||
end | |||
local function tell(arx) | |||
--[[update the evolution paths and the distribution parameters m, | |||
sigma, and C within CMA-ES. | |||
Parameters | |||
---------- | |||
`arx` | |||
a list of solutions, presumably from `ask()` | |||
`fitvals` | |||
the corresponding objective function values --]] | |||
-- bookkeeping, preparation | |||
counteval = counteval + lambda -- slightly artificial to do here | |||
local xold = xmean:clone() | |||
-- Sort by fitness and compute weighted mean into xmean | |||
local arindex = nil --sorted indices | |||
fitvals, arindex = torch.sort(fitvals) | |||
arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions | |||
table.insert(f_hist, fitvals[1]) --append best fitness to history | |||
best:update(arx[1], fitvals[1], counteval) | |||
xmean:zero() | |||
xmean:addmv(arx:t(), weights) --dot product | |||
-- Cumulation: update evolution paths | |||
local y = xmean - xold | |||
local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) | |||
local c = (cs * (2-cs) * mueff)^0.5 / sigma | |||
ps = ps - ps * cs + z * c -- exponential decay on ps | |||
local hsig = (torch.sum(torch.pow(ps,2)) / | |||
(1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) | |||
hsig = hsig and 1.0 or 0.0 --use binary numbers | |||
c = (cc * (2-cc) * mueff)^0.5 / sigma | |||
pc = pc - pc * cc + y * c * hsig -- exponential decay on pc | |||
-- Adapt covariance matrix C | |||
local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) | |||
-- for a minor adjustment to the variance loss by hsig | |||
for i=1,N do | |||
for j=1,N do | |||
local r = torch.range(1,mu) | |||
r:apply(function(k) | |||
return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) | |||
local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update | |||
C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + | |||
c1 * pc[i]*pc[j] + cmu * Cmuij) | |||
end | |||
end | |||
-- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 | |||
sigma = sigma * math.exp(math.min(0.6, | |||
(cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) | |||
end | |||
local function stop() | |||
--[[return satisfied termination conditions in a table like | |||
{'termination reason':value, ...}, for example {'tolfun':1e-12}, | |||
or the empty table {}--]] | |||
local res = {} | |||
if counteval > 0 then | |||
if counteval >= maxEval then | |||
res['evals'] = maxEval | |||
end | |||
if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then | |||
res['ftarget'] = ftarget | |||
end | |||
if torch.max(D) > 1e7 * torch.min(D) then | |||
res['condition'] = 1e7 | |||
end | |||
if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then | |||
res['tolfun'] = 1e-12 | |||
end | |||
if sigma * torch.max(D) < 1e-11 then | |||
-- remark: max(D) >= max(diag(C))^0.5 | |||
res['tolx'] = 1e-11 | |||
end | |||
end | |||
return res | |||
end | |||
local function disp(verb_modulo) | |||
--[[display some iteration info--]] | |||
if verb_disp == 0 then | |||
return nil | |||
end | |||
local iteration = counteval / lambda | |||
if iteration == 1 or iteration % (10*verb_modulo) == 0 then | |||
print('evals:\t ax-ratio max(std) f-value') | |||
end | |||
if iteration <= 2 or iteration % verb_modulo == 0 then | |||
local max_std = math.sqrt(torch.max(torch.diag(C))) | |||
print(tostring(counteval).. ': ' .. | |||
string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) | |||
.. tostring(fitvals[1])) | |||
end | |||
return nil | |||
end | |||
while next(stop()) == nil or iteration < min_iterations do | |||
iteration = iteration + 1 | |||
local X = ask() -- deliver candidate solutions | |||
for i=1, lambda do | |||
-- put candidate tensor back in input shape and evaluate in opfunc | |||
local candidate = X[i]:viewAs(x) | |||
fitvals[i] = objfunc(candidate) | |||
end | |||
tell(X) | |||
disp(verb_disp) | |||
end | |||
local bestmu, f, c = best:get() | |||
if verb_disp > 0 then | |||
for k, v in pairs(stop()) do | |||
print('termination by', k, '=', v) | |||
end | |||
print('best f-value =', f) | |||
print('solution = ') | |||
print(bestmu) | |||
print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) | |||
end | |||
table.insert(f_hist, f) | |||
return bestmu, f_hist, counteval | |||
end | |||
BestSolution.__index = BestSolution | |||
function BestSolution.new(x, f, evals) | |||
local self = setmetatable({}, BestSolution) | |||
self.x = x | |||
self.f = f | |||
self.evals = evals | |||
return self | |||
end | |||
function BestSolution.update(self, arx, arf, evals) | |||
--[[update the best solution with `arx`, `arf`, and `evals` if it improves on the
current one. Better solutions have smaller `f`-values.--]]
if self.f == nil or arf < self.f then | |||
self.x = arx:clone() | |||
self.f = arf | |||
self.evals = evals | |||
end | |||
return self | |||
end | |||
function BestSolution.get(self) | |||
return self.x, self.f, self.evals | |||
end |
@@ -0,0 +1,109 @@ | |||
--[[ An implementation of `DE` (Differential Evolution), | |||
ARGS: | |||
-`opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX. Note that df/dX is not used | |||
-`x` : the initial point | |||
-`state.popsize`: population size. If this is left empty, 10*d will be used | |||
-`state.scaleFactor`: float, usually between 0.4 and 1 | |||
-`state.crossoverRate`: float, usually between 0.1 and 0.9 | |||
-`state.maxFEs`: int, maximal number of function evaluations
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
]] | |||
require 'torch' | |||
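--[[ Illustrative sketch (placeholder objective): DE also uses only function values.
scaleFactor, crossoverRate and maxFEs have no defaults and must be supplied:
   local x = torch.DoubleTensor(10):zero()
   local config = {popsize = 50, scaleFactor = 0.5, crossoverRate = 0.9, maxFEs = 20000}
   local xstar, fx = optim.de(function(x) return x:dot(x) end, x, config)
--]]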
function optim.de(opfunc, x, config, state) | |||
-- process input parameters | |||
local config = config or {} | |||
local state = state | |||
local popsize = config.popsize -- population size | |||
local scaleFactor = config.scaleFactor -- scale factor | |||
local crossoverRate = config.crossoverRate -- crossover rate | |||
local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations | |||
local maxRegion = config.maxRegion -- upper bound of search region | |||
local minRegion = config.minRegion -- lower bound of search region | |||
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy | |||
local D = xmean:size(1) -- number of objective variables/problem dimension | |||
if config.popsize == nil then | |||
popsize = 10 * D | |||
end | |||
if config.maxRegion == nil then | |||
maxRegion = 30 | |||
end | |||
if config.minRegion == nil then | |||
minRegion = -30 | |||
end | |||
-- Initialize population | |||
local fx = x.new(maxFEs) | |||
local pop = x.new(popsize, D) | |||
local children = x.new(popsize, D) | |||
local fitness = x.new(popsize) | |||
local children_fitness = x.new(popsize) | |||
local fes = 1 -- number of function evaluations | |||
local best_fitness | |||
local best_solution = x.new(D) | |||
-- Initialize the population and evaluate its fitness values
local gen = torch.Generator() | |||
torch.manualSeed(gen, 1) | |||
pop:uniform(gen, minRegion, maxRegion) | |||
for i = 1, popsize do | |||
fitness[i] = opfunc(pop[i]) | |||
fx[fes] = fitness[i] | |||
fes = fes + 1 | |||
end | |||
-- Find the best (lowest-fitness) solution in the initial population
local index
best_fitness, index = fitness:min(1)
best_fitness = best_fitness[1] | |||
index = index[1] | |||
best_solution:copy(pop[index]) | |||
-- Main loop | |||
while fes < maxFEs do | |||
local r1, r2 | |||
for i = 1, popsize do | |||
repeat | |||
r1 = torch.random(gen, 1, popsize) | |||
until(r1 ~= i) | |||
repeat | |||
r2 = torch.random(gen, 1, popsize) | |||
until(r2 ~= r1 and r2 ~= i) | |||
local jrand = torch.random(gen, 1, D) | |||
for j = 1, D do | |||
if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then | |||
children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j]) | |||
else | |||
children[i][j] = pop[i][j] | |||
end | |||
end | |||
children_fitness[i] = opfunc(children[i]) | |||
fx[fes] = children_fitness[i] | |||
fes = fes + 1 | |||
end | |||
for i = 1, popsize do | |||
if children_fitness[i] <= fitness[i] then | |||
pop[i]:copy(children[i]) | |||
fitness[i] = children_fitness[i] | |||
if fitness[i] < best_fitness then | |||
best_fitness = fitness[i] | |||
best_solution:copy(children[i]) | |||
end | |||
end | |||
end | |||
end | |||
return best_solution, fx | |||
end |
@@ -0,0 +1,192 @@ | |||
--[[ FISTA with backtracking line search | |||
- `f` : smooth function | |||
- `g` : non-smooth function | |||
- `pl` : minimizer of intermediate problem Q(x,y) | |||
- `xinit` : initial point | |||
- `params` : table of parameters (**optional**) | |||
- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) | |||
- `params.Lstep` : step size multiplier at each iteration (1.5) | |||
- `params.maxiter` : max number of iterations (50) | |||
- `params.maxline` : max number of line search iterations per iteration (20) | |||
- `params.errthres`: Error threshold for convergence check (1e-4)
- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) | |||
- `params.verbose` : store each iteration solution and print detailed info (false) | |||
On output, `params` will contain these additional fields that can be reused. | |||
- `params.L` : last used L value will be written. | |||
These are temporary storages needed by the algorithm; if the same params object is
passed a second time, the same storages will be reused without new allocation.
- `params.xkm` : previous iteration point
- `params.y` : fista iteration | |||
- `params.ply` : ply = pl(y - 1/L grad(f)) | |||
Returns the solution x and a history of {function evals, number of line search steps, ...}
Algorithm is published in | |||
@article{beck-fista-09, | |||
Author = {Beck, Amir and Teboulle, Marc}, | |||
Journal = {SIAM J. Img. Sci.}, | |||
Number = {1}, | |||
Pages = {183--202}, | |||
Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems}, | |||
Volume = {2}, | |||
Year = {2009}} | |||
]] | |||
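--[[ Illustrative sketch of the three callbacks for an l1-regularized least-squares
problem (A, b and lambda are placeholders). f is the smooth term and must return
(f, df/dx) when called with a second argument 'dx'; g is the non-smooth term;
pl modifies its first argument in place into the proximal (soft-thresholded) point:
   local function f(x, mode)
      local r = A * x - b
      local fx = 0.5 * r:dot(r)
      if mode == 'dx' then return fx, A:t() * r end
      return fx
   end
   local function g(x) return lambda * x:norm(1) end
   local function pl(x, L)
      local th = lambda / L
      x:apply(function(v)
         if v > th then return v - th
         elseif v < -th then return v + th
         else return 0 end
      end)
   end
   local xinit = torch.zeros(A:size(2))
   local xstar, history = optim.FistaLS(f, g, pl, xinit,
      {maxiter = 100, doFistaUpdate = true})
--]]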
function optim.FistaLS(f, g, pl, xinit, params) | |||
local params = params or {} | |||
local L = params.L or 0.1 | |||
local Lstep = params.Lstep or 1.5 | |||
local maxiter = params.maxiter or 50 | |||
local maxline = params.maxline or 20 | |||
local errthres = params.errthres or 1e-4 | |||
local doFistaUpdate = params.doFistaUpdate | |||
local verbose = params.verbose | |||
-- temporary allocations | |||
params.xkm = params.xkm or torch.Tensor() | |||
params.y = params.y or torch.Tensor() | |||
params.ply = params.ply or torch.Tensor() | |||
local xkm = params.xkm -- previous iteration | |||
local y = params.y -- fista iteration | |||
local ply = params.ply -- soft shrinked y | |||
-- we start from all zeros | |||
local xk = xinit | |||
xkm:resizeAs(xk):zero() | |||
ply:resizeAs(xk):zero() | |||
y:resizeAs(xk):zero() | |||
local history = {} -- keep track of stuff | |||
local niter = 0 -- number of iterations done | |||
local converged = false -- are we done? | |||
local tk = 1 -- momentum param for FISTA | |||
local tkp = 0 | |||
local gy = g(y) | |||
local fval = math.huge -- fval = f+g | |||
while not converged and niter < maxiter do | |||
-- run through smooth function (code is input, input is target) | |||
-- get derivatives from smooth function | |||
local fy,gfy = f(y,'dx') | |||
--local gfy = f(y) | |||
local fply = 0 | |||
local gply = 0 | |||
local Q = 0 | |||
---------------------------------------------- | |||
-- do line search to find new current location starting from fista loc | |||
local nline = 0 | |||
local linesearchdone = false | |||
while not linesearchdone do | |||
-- take a step in gradient direction of smooth function | |||
ply:copy(y) | |||
ply:add(-1/L,gfy) | |||
-- and solve for minimum of auxiliary problem | |||
pl(ply,L) | |||
-- this is candidate for new current iteration | |||
xk:copy(ply) | |||
-- evaluate this point F(ply) | |||
fply = f(ply) | |||
-- ply - y | |||
ply:add(-1, y) | |||
-- <ply-y , \Grad(f(y))> | |||
local Q2 = gfy:dot(ply) | |||
-- L/2 ||beta-y||^2 | |||
local Q3 = L/2 * ply:dot(ply) | |||
-- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta) | |||
Q = fy + Q2 + Q3 | |||
if verbose then | |||
print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3)) | |||
end | |||
-- check if F(beta) < Q(pl(y),\t) | |||
if fply <= Q then --and Fply + Gply <= F then | |||
-- now evaluate G here | |||
linesearchdone = true | |||
elseif nline >= maxline then | |||
linesearchdone = true | |||
xk:copy(xkm) -- if we can't find a better point, current iter = previous iter | |||
--print('oops') | |||
else | |||
L = L * Lstep | |||
end | |||
nline = nline + 1 | |||
end | |||
-- end line search | |||
--------------------------------------------- | |||
--------------------------------------------- | |||
-- FISTA | |||
--------------------------------------------- | |||
if doFistaUpdate then | |||
-- do the FISTA step | |||
tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 | |||
-- x(k-1) = x(k-1) - x(k) | |||
xkm:add(-1,xk) | |||
-- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) | |||
y:copy(xk) | |||
y:add( (1-tk)/tkp , xkm) | |||
-- store for next iterations | |||
-- x(k-1) = x(k) | |||
xkm:copy(xk) | |||
else | |||
y:copy(xk) | |||
end | |||
-- t(k) = t(k+1) | |||
tk = tkp | |||
fply = f(y) | |||
gply = g(y) | |||
if verbose then | |||
print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) | |||
end | |||
niter = niter + 1 | |||
-- bookkeeping
fval = fply + gply | |||
history[niter] = {} | |||
history[niter].nline = nline | |||
history[niter].L = L | |||
history[niter].F = fval | |||
history[niter].Fply = fply | |||
history[niter].Gply = gply | |||
history[niter].Q = Q | |||
params.L = L | |||
if verbose then | |||
history[niter].xk = xk:clone() | |||
history[niter].y = y:clone() | |||
end | |||
-- are we done? | |||
if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then | |||
converged = true | |||
xinit:copy(y) | |||
return y,history | |||
end | |||
if niter >= maxiter then | |||
xinit:copy(y) | |||
return y,history | |||
end | |||
--if niter > 1 and history[niter].F > history[niter-1].F then | |||
--print(niter, 'This was supposed to be a convex function, we are going up') | |||
--converged = true | |||
--return xk,history | |||
--end | |||
end | |||
error('not supposed to be here') | |||
end | |||
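--[[ Hypothetical usage sketch (not part of the original file): a tiny
lasso-style problem, assuming the package has been loaded with
require 'optim'. f(x) = 0.5*||Ax - b||^2 is the smooth part, g(x) =
lambda*||x||_1 the non-smooth part, and pl the soft-thresholding
proximal operator. All names and data below are made up for illustration.

   local A = torch.randn(20, 10)
   local b = torch.randn(20)
   local lambda = 0.1
   local function f(x, mode)
      local r = torch.mv(A, x) - b
      local fx = 0.5 * r:dot(r)
      if mode == 'dx' then
         return fx, torch.mv(A:t(), r)     -- value and gradient of the smooth part
      end
      return fx
   end
   local function g(x) return lambda * x:norm(1) end
   local function pl(x, L)                 -- in-place soft-threshold at lambda/L
      local th = lambda / L
      x:apply(function(v)
         if v > th then return v - th
         elseif v < -th then return v + th
         else return 0 end
      end)
   end
   local x0 = torch.zeros(10)
   local xsol, history = optim.FistaLS(f, g, pl, x0, {maxiter = 100, doFistaUpdate = true})
]]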
@@ -0,0 +1,33 @@ | |||
require 'torch' | |||
optim = {} | |||
-- optimization algorithms
require('optim.sgd') | |||
require('optim.cg') | |||
require('optim.asgd') | |||
require('optim.nag') | |||
require('optim.fista') | |||
require('optim.lbfgs') | |||
require('optim.adagrad') | |||
require('optim.rprop') | |||
require('optim.adam') | |||
require('optim.adamax') | |||
require('optim.rmsprop') | |||
require('optim.adadelta') | |||
require('optim.cmaes') | |||
require('optim.de') | |||
-- line search functions | |||
require('optim.lswolfe') | |||
-- helpers | |||
require('optim.polyinterp') | |||
require('optim.checkgrad') | |||
-- tools | |||
require('optim.ConfusionMatrix') | |||
require('optim.Logger') | |||
return optim |
@@ -0,0 +1,268 @@ | |||
--[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) | |||
This implementation of L-BFGS relies on a user-provided line | |||
search function (state.lineSearch). If this function is not | |||
provided, then a simple learningRate is used to produce fixed | |||
size steps. Fixed size steps are much less costly than line | |||
searches, and can be useful for stochastic problems. | |||
The learning rate is used even when a line search is provided. | |||
This is also useful for large-scale stochastic problems, where | |||
opfunc is a noisy approximation of f(x). In that case, the learning | |||
rate allows a reduction of confidence in the step size. | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.maxIter` : Maximum number of iterations allowed | |||
- `state.maxEval` : Maximum number of function evaluations | |||
- `state.tolFun` : Termination tolerance on the first-order optimality | |||
- `state.tolX` : Termination tol on progress in terms of func/param changes | |||
- `state.lineSearch` : A line search function | |||
- `state.learningRate` : If no line search is provided, then a fixed step size is used
- `state.nCorrection` : number of correction pairs kept in the L-BFGS memory (default 100)
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
(Clement Farabet, 2012) | |||
]] | |||
function optim.lbfgs(opfunc, x, config, state) | |||
-- get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local maxIter = tonumber(config.maxIter) or 20 | |||
local maxEval = tonumber(config.maxEval) or maxIter*1.25 | |||
local tolFun = config.tolFun or 1e-5 | |||
local tolX = config.tolX or 1e-9 | |||
local nCorrection = config.nCorrection or 100 | |||
local lineSearch = config.lineSearch | |||
local lineSearchOpts = config.lineSearchOptions | |||
local learningRate = config.learningRate or 1 | |||
local isverbose = config.verbose or false | |||
state.funcEval = state.funcEval or 0 | |||
state.nIter = state.nIter or 0 | |||
-- verbose function | |||
local verbose | |||
if isverbose then | |||
verbose = function(...) print('<optim.lbfgs> ', ...) end | |||
else | |||
verbose = function() end | |||
end | |||
-- import some functions | |||
local abs = math.abs | |||
local min = math.min | |||
-- evaluate initial f(x) and df/dx | |||
local f,g = opfunc(x) | |||
local f_hist = {f} | |||
local currentFuncEval = 1 | |||
state.funcEval = state.funcEval + 1 | |||
local p = g:size(1) | |||
-- check optimality of initial point | |||
state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 | |||
tmp1:copy(g):abs() | |||
if tmp1:sum() <= tolFun then | |||
-- optimality condition below tolFun | |||
verbose('optimality condition below tolFun') | |||
return x,f_hist | |||
end | |||
if not state.dir_bufs then | |||
-- reusable buffers for y's and s's, and their histories | |||
verbose('creating recyclable direction/step/history buffers') | |||
state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) | |||
state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) | |||
for i=1,#state.dir_bufs do | |||
state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) | |||
state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) | |||
end | |||
end | |||
-- variables cached in state (for tracing) | |||
local d = state.d | |||
local t = state.t | |||
local old_dirs = state.old_dirs | |||
local old_stps = state.old_stps | |||
local Hdiag = state.Hdiag | |||
local g_old = state.g_old | |||
local f_old = state.f_old | |||
-- optimize for a max of maxIter iterations | |||
local nIter = 0 | |||
while nIter < maxIter do | |||
-- keep track of nb of iterations | |||
nIter = nIter + 1 | |||
state.nIter = state.nIter + 1 | |||
------------------------------------------------------------ | |||
-- compute gradient descent direction | |||
------------------------------------------------------------ | |||
if state.nIter == 1 then | |||
d = g:clone():mul(-1) -- -g | |||
old_dirs = {} | |||
old_stps = {} | |||
Hdiag = 1 | |||
else | |||
-- do lbfgs update (update memory) | |||
local y = table.remove(state.dir_bufs) -- pop | |||
local s = table.remove(state.stp_bufs) | |||
y:add(g, -1, g_old) -- g - g_old | |||
s:mul(d, t) -- d*t | |||
local ys = y:dot(s) -- y*s | |||
if ys > 1e-10 then | |||
-- updating memory | |||
if #old_dirs == nCorrection then | |||
-- shift history by one (limited-memory) | |||
local removed1 = table.remove(old_dirs, 1) | |||
local removed2 = table.remove(old_stps, 1) | |||
table.insert(state.dir_bufs, removed1) | |||
table.insert(state.stp_bufs, removed2) | |||
end | |||
-- store new direction/step | |||
table.insert(old_dirs, s) | |||
table.insert(old_stps, y) | |||
-- update scale of initial Hessian approximation | |||
Hdiag = ys / y:dot(y) -- (y*y) | |||
else | |||
-- put y and s back into the buffer pool | |||
table.insert(state.dir_bufs, y) | |||
table.insert(state.stp_bufs, s) | |||
end | |||
-- compute the approximate (L-BFGS) inverse Hessian | |||
-- multiplied by the gradient | |||
local k = #old_dirs | |||
-- need to be accessed element-by-element, so don't re-type tensor: | |||
state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro | |||
for i = 1,k do | |||
ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) | |||
end | |||
-- iteration in L-BFGS loop collapsed to use just one buffer | |||
local q = tmp1 -- reuse tmp1 for the q buffer | |||
-- need to be accessed element-by-element, so don't re-type tensor: | |||
state.al = state.al or torch.zeros(nCorrection); local al = state.al
q:mul(g, -1) -- -g | |||
for i = k,1,-1 do | |||
al[i] = old_dirs[i]:dot(q) * ro[i] | |||
q:add(-al[i], old_stps[i]) | |||
end | |||
-- multiply by initial Hessian | |||
local r = d -- share the same buffer, since we don't need the old d
r:mul(q, Hdiag) -- q[1] * Hdiag | |||
for i = 1,k do | |||
local be_i = old_stps[i]:dot(r) * ro[i] | |||
r:add(al[i]-be_i, old_dirs[i]) | |||
end | |||
-- final direction is in r/d (same object) | |||
end | |||
g_old = g_old or g:clone() | |||
g_old:copy(g) | |||
f_old = f | |||
------------------------------------------------------------ | |||
-- compute step length | |||
------------------------------------------------------------ | |||
-- directional derivative | |||
local gtd = g:dot(d) -- g * d | |||
-- check that progress can be made along that direction | |||
if gtd > -tolX then | |||
break | |||
end | |||
-- reset initial guess for step size | |||
if state.nIter == 1 then | |||
tmp1:copy(g):abs() | |||
t = min(1,1/tmp1:sum()) * learningRate | |||
else | |||
t = learningRate | |||
end | |||
-- optional line search: user function | |||
local lsFuncEval = 0 | |||
if lineSearch and type(lineSearch) == 'function' then | |||
-- perform line search, using user function | |||
f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) | |||
table.insert(f_hist, f) | |||
else | |||
-- no line search, simply move with fixed-step | |||
x:add(t,d) | |||
if nIter ~= maxIter then | |||
-- re-evaluate function only if not in last iteration
-- the reason: in a stochastic setting, there is no point
-- in re-evaluating the function on the last iteration
f,g = opfunc(x) | |||
lsFuncEval = 1 | |||
table.insert(f_hist, f) | |||
end | |||
end | |||
-- update func eval | |||
currentFuncEval = currentFuncEval + lsFuncEval | |||
state.funcEval = state.funcEval + lsFuncEval | |||
------------------------------------------------------------ | |||
-- check conditions | |||
------------------------------------------------------------ | |||
if nIter == maxIter then | |||
-- no use to run tests | |||
verbose('reached max number of iterations') | |||
break | |||
end | |||
if currentFuncEval >= maxEval then | |||
-- max nb of function evals | |||
verbose('max nb of function evals') | |||
break | |||
end | |||
tmp1:copy(g):abs() | |||
if tmp1:sum() <= tolFun then | |||
-- check optimality | |||
verbose('optimality condition below tolFun') | |||
break | |||
end | |||
tmp1:copy(d):mul(t):abs() | |||
if tmp1:sum() <= tolX then | |||
-- step size below tolX | |||
verbose('step size below tolX') | |||
break | |||
end | |||
if abs(f-f_old) < tolX then | |||
-- function value changing less than tolX | |||
verbose('function value changing less than tolX') | |||
break | |||
end | |||
end | |||
-- save state | |||
state.old_dirs = old_dirs | |||
state.old_stps = old_stps | |||
state.Hdiag = Hdiag | |||
state.g_old = g_old | |||
state.f_old = f_old | |||
state.t = t | |||
state.d = d | |||
-- return optimal x, and history of f(x) | |||
return x,f_hist,currentFuncEval | |||
end |
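--[[ Hypothetical usage sketch (not part of the original file): deterministic
minimization of a strictly convex quadratic with the bundled Wolfe line
search, assuming the package has been loaded with require 'optim'. The
problem data and names below are made up for illustration.

   local n = 10
   local M = torch.randn(n, n)
   local A = M:t() * M + torch.eye(n)      -- symmetric positive definite
   local b = torch.randn(n)
   local function feval(x)
      local Ax = torch.mv(A, x)
      return 0.5 * x:dot(Ax) - b:dot(x), Ax - b   -- f(x), df/dx
   end
   local x = torch.zeros(n)
   local state = {maxIter = 100, lineSearch = optim.lswolfe}
   local xstar, fhist = optim.lbfgs(feval, x, state)
]]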
@@ -0,0 +1,192 @@ | |||
--[[ A Line Search satisfying the Wolfe conditions | |||
ARGS: | |||
- `opfunc` : a function (the objective) that takes a single input (X), | |||
the point of evaluation, and returns f(X) and df/dX | |||
- `x` : initial point / starting location | |||
- `t` : initial step size | |||
- `d` : descent direction | |||
- `f` : initial function value | |||
- `g` : gradient at initial location | |||
- `gtd` : directional derivative at starting location | |||
- `options.c1` : sufficient decrease parameter | |||
- `options.c2` : curvature parameter | |||
- `options.tolX` : minimum allowable step length | |||
- `options.maxIter` : maximum nb of iterations | |||
RETURN: | |||
- `f` : function value at x+t*d | |||
- `g` : gradient value at x+t*d | |||
- `x` : the next x (=x+t*d) | |||
- `t` : the step length | |||
- `lsFuncEval` : the number of function evaluations | |||
]] | |||
function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) | |||
-- options | |||
options = options or {} | |||
local c1 = options.c1 or 1e-4 | |||
local c2 = options.c2 or 0.9 | |||
local tolX = options.tolX or 1e-9 | |||
local maxIter = options.maxIter or 20 | |||
local isverbose = options.verbose or false | |||
-- some shortcuts | |||
local abs = torch.abs | |||
local min = math.min | |||
local max = math.max | |||
-- verbose function | |||
local function verbose(...) | |||
if isverbose then print('<optim.lswolfe> ', ...) end | |||
end | |||
-- evaluate objective and gradient using initial step | |||
local x_init = x:clone() | |||
x:add(t,d) | |||
local f_new,g_new = opfunc(x) | |||
local lsFuncEval = 1 | |||
local gtd_new = g_new * d | |||
-- bracket an interval containing a point satisfying the Wolfe | |||
-- criteria | |||
local LSiter,t_prev,done = 0,0,false | |||
local f_prev,g_prev,gtd_prev = f,g:clone(),gtd | |||
local bracket,bracketFval,bracketGval | |||
while LSiter < maxIter do | |||
-- check conditions: | |||
if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then | |||
bracket = x.new{t_prev,t} | |||
bracketFval = x.new{f_prev,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g_prev | |||
bracketGval[2] = g_new | |||
break | |||
elseif abs(gtd_new) <= -c2*gtd then | |||
bracket = x.new{t} | |||
bracketFval = x.new{f_new} | |||
bracketGval = x.new(1,g_new:size(1)) | |||
bracketGval[1] = g_new | |||
done = true | |||
break | |||
elseif gtd_new >= 0 then | |||
bracket = x.new{t_prev,t} | |||
bracketFval = x.new{f_prev,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g_prev | |||
bracketGval[2] = g_new | |||
break | |||
end | |||
-- interpolate: | |||
local tmp = t_prev | |||
t_prev = t | |||
local minStep = t + 0.01*(t-tmp) | |||
local maxStep = t*10 | |||
t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev}, | |||
{t,f_new,gtd_new}}, | |||
minStep, maxStep) | |||
-- next step: | |||
f_prev = f_new | |||
g_prev = g_new:clone() | |||
gtd_prev = gtd_new | |||
x[{}] = x_init | |||
x:add(t,d) | |||
f_new,g_new = opfunc(x) | |||
lsFuncEval = lsFuncEval + 1 | |||
gtd_new = g_new * d | |||
LSiter = LSiter + 1 | |||
end | |||
-- reached max nb of iterations? | |||
if LSiter == maxIter then | |||
bracket = x.new{0,t} | |||
bracketFval = x.new{f,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g | |||
bracketGval[2] = g_new | |||
end | |||
-- zoom phase: we now have a point satisfying the criteria, or | |||
-- a bracket around it. We refine the bracket until we find the | |||
-- exact point satisfying the criteria | |||
local insufProgress = false | |||
local LOposRemoved = 0 | |||
while not done and LSiter < maxIter do | |||
-- find high and low points in bracket | |||
local f_LO,LOpos = bracketFval:min(1) | |||
LOpos = LOpos[1] f_LO = f_LO[1] | |||
local HIpos = -LOpos+3 | |||
-- compute new trial value | |||
t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d}, | |||
{bracket[2],bracketFval[2],bracketGval[2]*d}}) | |||
-- test whether we are making sufficient progress
if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then | |||
if insufProgress or t>=bracket:max() or t <= bracket:min() then | |||
if abs(t-bracket:max()) < abs(t-bracket:min()) then | |||
t = bracket:max()-0.1*(bracket:max()-bracket:min()) | |||
else | |||
t = bracket:min()+0.1*(bracket:max()-bracket:min()) | |||
end | |||
insufProgress = false | |||
else | |||
insufProgress = true | |||
end | |||
else | |||
insufProgress = false | |||
end | |||
-- Evaluate new point | |||
x[{}] = x_init | |||
x:add(t,d) | |||
f_new,g_new = opfunc(x) | |||
lsFuncEval = lsFuncEval + 1 | |||
gtd_new = g_new * d | |||
LSiter = LSiter + 1 | |||
if f_new > f + c1*t*gtd or f_new >= f_LO then | |||
-- Armijo condition not satisfied or not lower than lowest point | |||
bracket[HIpos] = t | |||
bracketFval[HIpos] = f_new | |||
bracketGval[HIpos] = g_new | |||
else | |||
if abs(gtd_new) <= - c2*gtd then | |||
-- Wolfe conditions satisfied | |||
done = true | |||
elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then | |||
-- Old HI becomes new LO | |||
bracket[HIpos] = bracket[LOpos] | |||
bracketFval[HIpos] = bracketFval[LOpos] | |||
bracketGval[HIpos] = bracketGval[LOpos] | |||
end | |||
-- New point becomes new LO | |||
bracket[LOpos] = t | |||
bracketFval[LOpos] = f_new | |||
bracketGval[LOpos] = g_new | |||
end | |||
-- done? | |||
if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then | |||
break | |||
end | |||
end | |||
-- be verbose | |||
if LSiter == maxIter then | |||
verbose('reached max number of iterations') | |||
end | |||
-- return stuff | |||
local _,LOpos = bracketFval:min(1) | |||
LOpos = LOpos[1] | |||
t = bracket[LOpos] | |||
f_new = bracketFval[LOpos] | |||
g_new = bracketGval[LOpos] | |||
x[{}] = x_init | |||
x:add(t,d) | |||
return f_new,g_new,x,t,lsFuncEval | |||
end |
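--[[ Hypothetical usage sketch (not part of the original file): lswolfe is
normally not called directly; it is plugged into optim.lbfgs through the
config table, which forwards (opfunc, x, t, d, f, g, gtd, options) to it.
The quadratic objective and all names below are made up for illustration.

   local b = torch.randn(10)
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   local x = torch.zeros(10)
   local state = {
      maxIter = 50,
      lineSearch = optim.lswolfe,
      lineSearchOptions = {c1 = 1e-4, c2 = 0.9},
   }
   optim.lbfgs(feval, x, state)
]]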
@@ -0,0 +1,86 @@ | |||
---------------------------------------------------------------------- | |||
-- An implementation of SGD adapted with features of Nesterov's | |||
-- Accelerated Gradient method, based on the paper | |||
-- On the Importance of Initialization and Momentum in Deep Learning | |||
-- Sutskever et al., ICML 2013
-- | |||
-- ARGS: | |||
-- opfunc : a function that takes a single input (X), the point of | |||
-- evaluation, and returns f(X) and df/dX | |||
-- x : the initial point | |||
-- state : a table describing the state of the optimizer; after each | |||
-- call the state is modified | |||
-- state.learningRate : learning rate | |||
-- state.learningRateDecay : learning rate decay | |||
-- state.weightDecay : weight decay | |||
-- state.momentum : momentum | |||
-- state.learningRates : vector of individual learning rates | |||
-- | |||
-- RETURN: | |||
-- x : the new x vector | |||
-- f(x) : the function, evaluated before the update | |||
-- | |||
-- (Dilip Krishnan, 2013) | |||
-- | |||
function optim.nag(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
local mom = config.momentum or 0.9 | |||
local damp = config.dampening or mom | |||
local lrs = config.learningRates | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
if mom <= 0 then | |||
error('Momentum must be positive for Nesterov Accelerated Gradient') | |||
end | |||
-- (1) evaluate f(x) and df/dx | |||
-- first step in the direction of the momentum vector | |||
if state.dfdx then | |||
x:add(mom, state.dfdx) | |||
end | |||
-- then compute gradient at that point | |||
-- comment out the momentum step above to recover plain SGD
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (4) apply momentum | |||
if not state.dfdx then | |||
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) | |||
else | |||
state.dfdx:mul(mom) | |||
end | |||
-- (5) parameter update with single or individual learning rates | |||
if lrs then | |||
if not state.deltaParameters then | |||
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.deltaParameters:copy(lrs):cmul(dfdx) | |||
x:add(-clr, state.deltaParameters) | |||
state.dfdx:add(-clr, state.deltaParameters) | |||
else | |||
x:add(-clr, dfdx) | |||
state.dfdx:add(-clr, dfdx) | |||
end | |||
-- (6) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x, f(x) before optimization | |||
return x,{fx} | |||
end |
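--[[ Hypothetical usage sketch (not part of the original file): the call
pattern mirrors optim.sgd. The quadratic objective and all names below
(feval, b, state) are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 0.1, momentum = 0.95}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 200 do
      optim.nag(feval, x, state)           -- x is updated in place
   end
]]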
@@ -0,0 +1,212 @@ | |||
local function isreal(x) | |||
return x == x | |||
end | |||
local function isnan(x) | |||
return x ~= x
end | |||
local function roots(c) | |||
local tol=1e-12 | |||
c[torch.lt(torch.abs(c),tol)]=0 | |||
local nonzero = torch.ne(c,0) | |||
if nonzero:max() == 0 then | |||
return 0 | |||
end | |||
-- first non-zero | |||
local _,pos = torch.max(nonzero,1) | |||
pos = pos[1] | |||
c=c[{ {pos,-1} }] | |||
local nz = 0 | |||
for i=c:size(1),1,-1 do | |||
if c[i] ~= 0 then | |||
break | |||
else | |||
nz = nz + 1 | |||
end | |||
end | |||
c=c[{ {1,c:size(1)-nz} }] | |||
local n = c:size(1)-1 | |||
if n == 1 then | |||
local e = c.new({{-c[2]/c[1], 0}}) | |||
if nz > 0 then | |||
return torch.cat(e, c.new(nz, 2):zero(), 1) | |||
else | |||
return e | |||
end | |||
elseif n > 1 then | |||
local A = torch.diag(c.new(n-1):fill(1),-1) | |||
A[1] = -c[{ {2,n+1} }]/c[1]; | |||
local e = torch.eig(A,'N') | |||
if nz > 0 then | |||
return torch.cat(e, c.new(nz,2):zero(), 1) | |||
else | |||
return e | |||
end | |||
else | |||
return c.new(nz,2):zero() | |||
end | |||
end | |||
local function real(x) | |||
if type(x) == 'number' then return x end
return x[{ {} , 1}] | |||
end | |||
local function imag(x) | |||
if type(x) == 'number' then return 0 end | |||
if x:nDimension() == 1 then | |||
return x.new(x:size(1)):zero() | |||
else | |||
return x[{ {}, 2}] | |||
end | |||
end | |||
local function polyval(p,x) | |||
local pwr = p:size(1) | |||
if type(x) == 'number' then | |||
local val = 0 | |||
p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) | |||
return val | |||
else | |||
local val = x.new(x:size(1)):zero()
p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) | |||
return val | |||
end | |||
end | |||
---------------------------------------------------------------------- | |||
-- Minimum of interpolating polynomial based on function and | |||
-- derivative values | |||
-- | |||
-- ARGS: | |||
-- points : N triplets (x,f,g), must be a Tensor | |||
-- xminBound : min value that brackets the minimum (default: min of points)
-- xmaxBound : max value that brackets the minimum (default: max of points)
-- | |||
-- RETURN: | |||
-- minPos : position of minimum | |||
-- | |||
function optim.polyinterp(points,xminBound,xmaxBound) | |||
-- locals | |||
local sqrt = torch.sqrt | |||
local mean = torch.mean | |||
local max = math.max | |||
local min = math.min | |||
-- nb of points / order of polynomial | |||
local nPoints = points:size(1) | |||
local order = nPoints*2-1 | |||
-- returned values | |||
local minPos | |||
-- Code for most common case: | |||
-- + cubic interpolation of 2 points w/ function and derivative values for both | |||
-- + no xminBound/xmaxBound | |||
if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then | |||
-- Solution in this case (where x2 is the farthest point): | |||
-- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); | |||
-- d2 = sqrt(d1^2 - g1*g2); | |||
-- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); | |||
-- t_new = min(max(minPos,x1),x2); | |||
local minVal,minPos = points[{ {},1 }]:min(1) | |||
minVal = minVal[1] minPos = minPos[1] | |||
local notMinPos = -minPos+3; | |||
local d1 = points[{minPos,3}] + points[{notMinPos,3}] | |||
- 3*(points[{minPos,2}]-points[{notMinPos,2}]) | |||
/ (points[{minPos,1}]-points[{notMinPos,1}]); | |||
local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); | |||
if isreal(d2) then -- isreal() | |||
local t = points[{notMinPos,1}] - (points[{notMinPos,1}] | |||
- points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) | |||
/ (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) | |||
minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) | |||
else | |||
minPos = mean(points[{{},1}]) | |||
end | |||
return minPos | |||
end | |||
-- TODO: get the code below to work! | |||
--error('<optim.polyinterp> extrapolation not implemented yet...') | |||
-- Compute Bounds of Interpolation Area | |||
local xmin = points[{{},1}]:min() | |||
local xmax = points[{{},1}]:max() | |||
xminBound = xminBound or xmin | |||
xmaxBound = xmaxBound or xmax | |||
-- Add constraints on function values | |||
local A = points.new(nPoints*2,order+1):zero() | |||
local b = points.new(nPoints*2,1):zero() | |||
for i = 1,nPoints do | |||
local constraint = points.new(order+1):zero() | |||
for j = order,0,-1 do | |||
constraint[order-j+1] = points[{i,1}]^j | |||
end | |||
A[i] = constraint | |||
b[i] = points[{i,2}] | |||
end | |||
-- Add constraints based on derivatives | |||
for i = 1,nPoints do | |||
local constraint = points.new(order+1):zero() | |||
for j = 1,order do | |||
constraint[j] = (order-j+1)*points[{i,1}]^(order-j) | |||
end | |||
A[nPoints+i] = constraint | |||
b[nPoints+i] = points[{i,3}] | |||
end | |||
-- Find interpolating polynomial | |||
local res = torch.gels(b,A) | |||
local params = res[{ {1,nPoints*2} }]:squeeze() | |||
params[torch.le(torch.abs(params),1e-12)]=0 | |||
-- Compute Critical Points | |||
local dParams = points.new(order):zero(); | |||
for i = 1,params:size(1)-1 do | |||
dParams[i] = params[i]*(order-i+1) | |||
end | |||
-- nan/inf? | |||
local nans = false | |||
if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then | |||
nans = true | |||
end | |||
local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}]) | |||
if not nans then | |||
local cproots = roots(dParams) | |||
local cpi = points.new(cp:size(1),2):zero() | |||
cpi[{ {1,cp:size(1)} , 1 }] = cp | |||
cp = torch.cat(cpi,cproots,1) | |||
end | |||
-- Test Critical Points | |||
local fmin = math.huge | |||
-- Default to Bisection if no critical points valid: | |||
minPos = (xminBound+xmaxBound)/2 | |||
for i = 1,cp:size(1) do | |||
local xCP = cp[{ {i,i} , {} }] | |||
local ixCP = imag(xCP)[1] | |||
local rxCP = real(xCP)[1] | |||
if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then | |||
local fCP = polyval(params,rxCP) | |||
if fCP < fmin then | |||
minPos = rxCP | |||
fmin = fCP | |||
end | |||
end | |||
end | |||
return minPos,fmin | |||
end |
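--[[ Hypothetical usage sketch (not part of the original file): cubic
interpolation between two (position, value, derivative) triplets, the
common case used by the Wolfe line search. The numbers are made up.

   local pts = torch.Tensor{{0, 1.0, -1.0},   -- x=0, f=1.0, f'=-1.0
                            {1, 0.8,  0.5}}   -- x=1, f=0.8, f'= 0.5
   local minPos = optim.polyinterp(pts)       -- minimizer clamped to [0, 1]
]]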
@@ -0,0 +1,58 @@ | |||
--[[ An implementation of RMSprop | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `config` : a table with configuration parameters for the optimizer
- `config.learningRate` : learning rate
- `config.alpha` : smoothing constant
- `config.epsilon` : value added to the denominator for numerical stability
- `config.weightDecay` : weight decay
- `config.initialMean` : initial value for the leaky mean of squared gradients (default 0)
- `state` : a table describing the state of the optimizer;
after each call the state is modified
- `state.m` : leaky mean of the squared parameter gradients
- `state.tmp` : square root of `state.m`, with epsilon smoothing
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
function optim.rmsprop(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-2 | |||
local alpha = config.alpha or 0.99 | |||
local epsilon = config.epsilon or 1e-8 | |||
local wd = config.weightDecay or 0 | |||
local mfill = config.initialMean or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) initialize mean square values and square gradient storage | |||
if not state.m then | |||
state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill) | |||
state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
-- (4) calculate new (leaky) mean squared values | |||
state.m:mul(alpha) | |||
state.m:addcmul(1.0-alpha, dfdx, dfdx) | |||
-- (5) perform update | |||
state.tmp:sqrt(state.m):add(epsilon) | |||
x:addcdiv(-lr, dfdx, state.tmp) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
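--[[ Hypothetical usage sketch (not part of the original file): same feval
convention as the other first-order routines. The quadratic objective and
all names below are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 1e-2, alpha = 0.9}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 500 do
      optim.rmsprop(feval, x, state)
   end
]]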
@@ -0,0 +1,103 @@ | |||
--[[ A plain implementation of RPROP | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.stepsize` : initial step size, common to all components | |||
- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) | |||
- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) | |||
- `state.stepsizemax` : maximum stepsize allowed (default 50) | |||
- `state.stepsizemin` : minimum stepsize allowed (default 1e-6) | |||
- `state.niter` : number of iterations (default 1) | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
(Martin Riedmiller, Koray Kavukcuoglu 2013) | |||
--]] | |||
function optim.rprop(opfunc, x, config, state) | |||
if config == nil and state == nil then | |||
print('no state table, RPROP initializing')
end | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local stepsize = config.stepsize or 0.1 | |||
local etaplus = config.etaplus or 1.2 | |||
local etaminus = config.etaminus or 0.5 | |||
local stepsizemax = config.stepsizemax or 50.0 | |||
local stepsizemin = config.stepsizemin or 1E-06 | |||
local niter = config.niter or 1 | |||
local hfx = {} | |||
for i=1,niter do | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- init temp storage | |||
if not state.delta then | |||
state.delta = dfdx.new(dfdx:size()):zero() | |||
state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) | |||
state.sign = dfdx.new(dfdx:size()) | |||
state.psign = torch.ByteTensor(dfdx:size()) | |||
state.nsign = torch.ByteTensor(dfdx:size()) | |||
state.zsign = torch.ByteTensor(dfdx:size()) | |||
state.dminmax = torch.ByteTensor(dfdx:size()) | |||
if torch.type(x)=='torch.CudaTensor' then | |||
-- Push to GPU | |||
state.psign = state.psign:cuda() | |||
state.nsign = state.nsign:cuda() | |||
state.zsign = state.zsign:cuda() | |||
state.dminmax = state.dminmax:cuda() | |||
end | |||
end | |||
-- sign of derivative from last step to this one | |||
torch.cmul(state.sign, dfdx, state.delta) | |||
torch.sign(state.sign, state.sign) | |||
-- get indices of >0, <0 and ==0 entries | |||
state.sign.gt(state.psign, state.sign, 0) | |||
state.sign.lt(state.nsign, state.sign, 0) | |||
state.sign.eq(state.zsign, state.sign, 0) | |||
-- get step size updates | |||
state.sign[state.psign] = etaplus | |||
state.sign[state.nsign] = etaminus | |||
state.sign[state.zsign] = 1 | |||
-- update stepsizes with step size updates | |||
state.stepsize:cmul(state.sign) | |||
-- threshold step sizes | |||
-- >50 => 50 | |||
state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) | |||
state.stepsize[state.dminmax] = stepsizemax | |||
-- <1e-6 ==> 1e-6 | |||
state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) | |||
state.stepsize[state.dminmax] = stepsizemin | |||
-- for dir<0, dfdx=0 | |||
-- for dir>=0 dfdx=dfdx | |||
dfdx[state.nsign] = 0 | |||
-- state.sign = sign(dfdx) | |||
torch.sign(state.sign,dfdx) | |||
-- update weights | |||
x:addcmul(-1,state.sign,state.stepsize) | |||
-- update state.dfdx with current dfdx | |||
state.delta:copy(dfdx) | |||
table.insert(hfx,fx) | |||
end | |||
-- return x*, f(x) before optimization | |||
return x,hfx | |||
end |
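--[[ Hypothetical usage sketch (not part of the original file): rprop can run
several inner iterations per call via config.niter, returning one function
value per inner iteration. The quadratic objective and all names below are
made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {stepsize = 0.1, niter = 10}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   local _, fhist = optim.rprop(feval, x, state)
]]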
@@ -0,0 +1,90 @@ | |||
--[[ A plain implementation of SGD | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- `x` : the initial point | |||
- `config` : a table with configuration parameters for the optimizer | |||
- `config.learningRate` : learning rate | |||
- `config.learningRateDecay` : learning rate decay | |||
- `config.weightDecay` : weight decay | |||
- `config.weightDecays` : vector of individual weight decays | |||
- `config.momentum` : momentum | |||
- `config.dampening` : dampening for momentum | |||
- `config.nesterov` : enables Nesterov momentum | |||
- `config.learningRates` : vector of individual learning rates | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.evalCounter` : evaluation counter (optional: 0, by default) | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
(Clement Farabet, 2012) | |||
]] | |||
function optim.sgd(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
local mom = config.momentum or 0 | |||
local damp = config.dampening or mom | |||
local nesterov = config.nesterov or false | |||
local lrs = config.learningRates | |||
local wds = config.weightDecays | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay with single or individual parameters | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
elseif wds then | |||
if not state.decayParameters then | |||
state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.decayParameters:copy(wds):cmul(x) | |||
dfdx:add(state.decayParameters) | |||
end | |||
-- (3) apply momentum | |||
if mom ~= 0 then | |||
if not state.dfdx then | |||
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) | |||
else | |||
state.dfdx:mul(mom):add(1-damp, dfdx) | |||
end | |||
if nesterov then | |||
dfdx:add(mom, state.dfdx) | |||
else | |||
dfdx = state.dfdx | |||
end | |||
end | |||
-- (4) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (5) parameter update with single or individual learning rates | |||
if lrs then | |||
if not state.deltaParameters then | |||
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.deltaParameters:copy(lrs):cmul(dfdx) | |||
x:add(-clr, state.deltaParameters) | |||
else | |||
x:add(-clr, dfdx) | |||
end | |||
-- (6) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
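--[[ Hypothetical usage sketch (not part of the original file): minimize the
quadratic f(x) = ||x - b||^2 with SGD plus momentum, assuming the package
has been loaded with require 'optim'. All names below (feval, b, state)
are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 0.1, momentum = 0.9}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 100 do
      optim.sgd(feval, x, state)           -- x is updated in place
   end
]]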