From 478687cd467a1c342b9ec1cefc3c5cd0528d9d14 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 6 Mar 2018 10:20:22 +0000 Subject: [PATCH] [Feature] Add torch-optim contrib package --- CMakeLists.txt | 1 + contrib/torch/optim/CMakeLists.txt | 5 + contrib/torch/optim/COPYRIGHT.txt | 35 +++ contrib/torch/optim/ConfusionMatrix.lua | 361 ++++++++++++++++++++++++ contrib/torch/optim/Logger.lua | 190 +++++++++++++ contrib/torch/optim/adadelta.lua | 55 ++++ contrib/torch/optim/adagrad.lua | 55 ++++ contrib/torch/optim/adam.lua | 72 +++++ contrib/torch/optim/adamax.lua | 66 +++++ contrib/torch/optim/asgd.lua | 73 +++++ contrib/torch/optim/cg.lua | 208 ++++++++++++++ contrib/torch/optim/checkgrad.lua | 52 ++++ contrib/torch/optim/cmaes.lua | 270 ++++++++++++++++++ contrib/torch/optim/de.lua | 109 +++++++ contrib/torch/optim/fista.lua | 192 +++++++++++++ contrib/torch/optim/init.lua | 33 +++ contrib/torch/optim/lbfgs.lua | 268 ++++++++++++++++++ contrib/torch/optim/lswolfe.lua | 192 +++++++++++++ contrib/torch/optim/nag.lua | 86 ++++++ contrib/torch/optim/polyinterp.lua | 212 ++++++++++++++ contrib/torch/optim/rmsprop.lua | 58 ++++ contrib/torch/optim/rprop.lua | 103 +++++++ contrib/torch/optim/sgd.lua | 90 ++++++ 23 files changed, 2786 insertions(+) create mode 100644 contrib/torch/optim/CMakeLists.txt create mode 100644 contrib/torch/optim/COPYRIGHT.txt create mode 100644 contrib/torch/optim/ConfusionMatrix.lua create mode 100644 contrib/torch/optim/Logger.lua create mode 100644 contrib/torch/optim/adadelta.lua create mode 100644 contrib/torch/optim/adagrad.lua create mode 100644 contrib/torch/optim/adam.lua create mode 100644 contrib/torch/optim/adamax.lua create mode 100644 contrib/torch/optim/asgd.lua create mode 100644 contrib/torch/optim/cg.lua create mode 100644 contrib/torch/optim/checkgrad.lua create mode 100644 contrib/torch/optim/cmaes.lua create mode 100644 contrib/torch/optim/de.lua create mode 100644 contrib/torch/optim/fista.lua create mode 100644 contrib/torch/optim/init.lua create mode 100644 contrib/torch/optim/lbfgs.lua create mode 100644 contrib/torch/optim/lswolfe.lua create mode 100644 contrib/torch/optim/nag.lua create mode 100644 contrib/torch/optim/polyinterp.lua create mode 100644 contrib/torch/optim/rmsprop.lua create mode 100644 contrib/torch/optim/rprop.lua create mode 100644 contrib/torch/optim/sgd.lua diff --git a/CMakeLists.txt b/CMakeLists.txt index 490e42215..331674723 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1259,6 +1259,7 @@ IF(ENABLE_TORCH MATCHES "ON") ADD_SUBDIRECTORY(contrib/torch/paths) ADD_SUBDIRECTORY(contrib/torch/torch7) ADD_SUBDIRECTORY(contrib/torch/nn) + ADD_SUBDIRECTORY(contrib/torch/optim) ADD_SUBDIRECTORY(contrib/torch/decisiontree) SET(WITH_TORCH 1) ELSE() diff --git a/contrib/torch/optim/CMakeLists.txt b/contrib/torch/optim/CMakeLists.txt new file mode 100644 index 000000000..b1c13e701 --- /dev/null +++ b/contrib/torch/optim/CMakeLists.txt @@ -0,0 +1,5 @@ + +CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) +SET(src) +FILE(GLOB luasrc *.lua) +ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") diff --git a/contrib/torch/optim/COPYRIGHT.txt b/contrib/torch/optim/COPYRIGHT.txt new file mode 100644 index 000000000..2e4118c0f --- /dev/null +++ b/contrib/torch/optim/COPYRIGHT.txt @@ -0,0 +1,35 @@ +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan 
Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of NEC Laboratories American and IDIAP Research + Institute nor the names of its contributors may be used to endorse or + promote products derived from this software without specific prior + written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/torch/optim/ConfusionMatrix.lua b/contrib/torch/optim/ConfusionMatrix.lua new file mode 100644 index 000000000..ec5302c24 --- /dev/null +++ b/contrib/torch/optim/ConfusionMatrix.lua @@ -0,0 +1,361 @@ +--[[ A Confusion Matrix class + +Example: + + conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix + conf:zero() -- reset matrix + for i = 1,N do + conf:add( neuralnet:forward(sample), label ) -- accumulate errors + end + print(conf) -- print matrix + image.display(conf:render()) -- render matrix +]] +local ConfusionMatrix = torch.class('optim.ConfusionMatrix') + +function ConfusionMatrix:__init(nclasses, classes) + if type(nclasses) == 'table' then + classes = nclasses + nclasses = #classes + end + self.mat = torch.LongTensor(nclasses,nclasses):zero() + self.valids = torch.FloatTensor(nclasses):zero() + self.unionvalids = torch.FloatTensor(nclasses):zero() + self.nclasses = nclasses + self.totalValid = 0 + self.averageValid = 0 + self.classes = classes or {} + -- buffers + self._mat_flat = self.mat:view(-1) + self._target = torch.FloatTensor() + self._prediction = torch.FloatTensor() + self._max = torch.FloatTensor() + self._pred_idx = torch.LongTensor() + self._targ_idx = torch.LongTensor() +end + +-- takes scalar prediction and target as input +function ConfusionMatrix:_add(p, t) + assert(p and type(p) == 'number') + assert(t and type(t) == 'number') + -- non-positive values are considered missing + -- and therefore ignored + if t > 0 then + self.mat[t][p] = self.mat[t][p] + 1 + end +end + +function ConfusionMatrix:add(prediction, target) + if type(prediction) == 'number' then + -- comparing numbers + self:_add(prediction, target) + else + self._prediction:resize(prediction:size()):copy(prediction) + assert(prediction:dim() == 1) + if type(target) == 
'number' then + -- prediction is a vector, then target assumed to be an index + self._max:max(self._pred_idx, self._prediction, 1) + self:_add(self._pred_idx[1], target) + else + -- both prediction and target are vectors + assert(target:dim() == 1) + self._target:resize(target:size()):copy(target) + self._max:max(self._targ_idx, self._target, 1) + self._max:max(self._pred_idx, self._prediction, 1) + self:_add(self._pred_idx[1], self._targ_idx[1]) + end + end +end + +function ConfusionMatrix:batchAdd(predictions, targets) + local preds, targs, __ + self._prediction:resize(predictions:size()):copy(predictions) + if predictions:dim() == 1 then + -- predictions is a vector of classes + preds = self._prediction + elseif predictions:dim() == 2 then + -- prediction is a matrix of class likelihoods + if predictions:size(2) == 1 then + -- or prediction just needs flattening + preds = self._prediction:select(2,1) + else + self._max:max(self._pred_idx, self._prediction, 2) + preds = self._pred_idx:select(2,1) + end + else + error("predictions has invalid number of dimensions") + end + + self._target:resize(targets:size()):copy(targets) + if targets:dim() == 1 then + -- targets is a vector of classes + targs = self._target + elseif targets:dim() == 2 then + -- targets is a matrix of one-hot rows + if targets:size(2) == 1 then + -- or targets just needs flattening + targs = self._target:select(2,1) + else + self._max:max(self._targ_idx, self._target, 2) + targs = self._targ_idx:select(2,1) + end + else + error("targets has invalid number of dimensions") + end + + -- non-positive values are considered missing and therefore ignored + local mask = targs:ge(1) + targs = targs[mask] + preds = preds[mask] + + self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility + + preds = preds:typeAs(targs) + + assert(self.mat:isContiguous() and self.mat:stride(2) == 1) + local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) + local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) + self._mat_flat:indexAdd(1, indices, ones) +end + +function ConfusionMatrix:zero() + self.mat:zero() + self.valids:zero() + self.unionvalids:zero() + self.totalValid = 0 + self.averageValid = 0 +end + +local function isNaN(number) + return number ~= number +end + +function ConfusionMatrix:updateValids() + local total = 0 + for t = 1,self.nclasses do + self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() + self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) + total = total + self.mat[t][t] + end + self.totalValid = total / self.mat:sum() + self.averageValid = 0 + self.averageUnionValid = 0 + local nvalids = 0 + local nunionvalids = 0 + for t = 1,self.nclasses do + if not isNaN(self.valids[t]) then + self.averageValid = self.averageValid + self.valids[t] + nvalids = nvalids + 1 + end + if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then + self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] + nunionvalids = nunionvalids + 1 + end + end + self.averageValid = self.averageValid / nvalids + self.averageUnionValid = self.averageUnionValid / nunionvalids +end + +-- Calculating FAR/FRR associated with the confusion matrix + +function ConfusionMatrix:farFrr() + local cmat = self.mat + local noOfClasses = cmat:size()[1] + self._frrs = self._frrs or torch.zeros(noOfClasses) + self._frrs:zero() + self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) + self._classFrrs:zero() + 
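+   -- entries left at -1 mark classes for which no samples were seen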
self._classFrrs:add(-1) + self._fars = self._fars or torch.zeros(noOfClasses) + self._fars:zero() + self._classFars = self._classFars or torch.zeros(noOfClasses) + self._classFars:zero() + self._classFars:add(-1) + local classSamplesCount = cmat:sum(2) + local indx = 1 + for i=1,noOfClasses do + if classSamplesCount[i][1] ~= 0 then + self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] + self._classFrrs[i] = self._frrs[indx] + -- Calculating FARs + local farNumerator = 0 + local farDenominator = 0 + for j=1, noOfClasses do + if i ~= j then + if classSamplesCount[j][1] ~= 0 then + farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] + farDenominator = farDenominator + 1 + end + end + end + self._fars[indx] = farNumerator/farDenominator + self._classFars[i] = self._fars[indx] + indx = indx + 1 + end + end + indx = indx - 1 + local returnFrrs = self._frrs[{{1, indx}}] + local returnFars = self._fars[{{1, indx}}] + return self._classFrrs, self._classFars, returnFrrs, returnFars +end + +local function log10(n) + if math.log10 then + return math.log10(n) + else + return math.log(n) / math.log(10) + end +end + +function ConfusionMatrix:__tostring__() + self:updateValids() + local str = {'ConfusionMatrix:\n'} + local nclasses = self.nclasses + table.insert(str, '[') + local maxCnt = self.mat:max() + local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) + for t = 1,nclasses do + local pclass = self.valids[t] * 100 + pclass = string.format('%2.3f', pclass) + if t == 1 then + table.insert(str, '[') + else + table.insert(str, ' [') + end + for p = 1,nclasses do + table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p])) + end + if self.classes and self.classes[1] then + if t == nclasses then + table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') + else + table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') + end + else + if t == nclasses then + table.insert(str, ']] ' .. pclass .. '% \n') + else + table.insert(str, '] ' .. pclass .. '% \n') + end + end + end + table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') + table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') + table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. 
'%') + return table.concat(str) +end + +function ConfusionMatrix:render(sortmode, display, block, legendwidth) + -- args + local confusion = self.mat:double() + local classes = self.classes + local sortmode = sortmode or 'score' -- 'score' or 'occurrence' + local block = block or 25 + local legendwidth = legendwidth or 200 + local display = display or false + + -- legends + local legend = { + ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', + ['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' + } + + -- parse matrix / normalize / count scores + local diag = torch.FloatTensor(#classes) + local freqs = torch.FloatTensor(#classes) + local unconf = confusion + local confusion = confusion:clone() + local corrects = 0 + local total = 0 + for target = 1,#classes do + freqs[target] = confusion[target]:sum() + corrects = corrects + confusion[target][target] + total = total + freqs[target] + confusion[target]:div( math.max(confusion[target]:sum(),1) ) + diag[target] = confusion[target][target] + end + + -- accuracies + local accuracy = corrects / total * 100 + local perclass = 0 + local total = 0 + for target = 1,#classes do + if confusion[target]:sum() > 0 then + perclass = perclass + diag[target] + total = total + 1 + end + end + perclass = perclass / total * 100 + freqs:div(unconf:sum()) + + -- sort matrix + if sortmode == 'score' then + _,order = torch.sort(diag,1,true) + elseif sortmode == 'occurrence' then + _,order = torch.sort(freqs,1,true) + else + error('sort mode must be one of: score | occurrence') + end + + -- render matrix + local render = torch.zeros(#classes*block, #classes*block) + for target = 1,#classes do + for prediction = 1,#classes do + render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] + end + end + + -- add grid + for target = 1,#classes do + render[{ {target*block},{} }] = 0.1 + render[{ {},{target*block} }] = 0.1 + end + + -- create rendering + require 'image' + require 'qtwidget' + require 'qttorch' + local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) + image.display{image=render, win=win1} + + -- add legend + for i in ipairs(classes) do + -- background cell + win1:setcolor{r=0,g=0,b=0} + win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) + win1:fill() + + -- % + win1:setfont(qt.QFont{serif=false, size=fontsize}) + local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 + win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} + win1:moveto((#render)[2]+10,i*block-block/3) + win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) + + -- legend + win1:setfont(qt.QFont{serif=false, size=fontsize}) + local gscale = diag[order[i]]*0.8+0.2 + win1:setcolor{r=gscale,g=gscale,b=gscale} + win1:moveto(120+(#render)[2]+10,i*block-block/3) + win1:show(classes[order[i]]) + + for j in ipairs(classes) do + -- scores + local score = confusion[order[j]][order[i]] + local gscale = (1-score)*(score*0.8+0.2) + win1:setcolor{r=gscale,g=gscale,b=gscale} + win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) + win1:show(string.format('%02.0f',math.floor(score*100+0.5))) + end + end + + -- generate tensor + local t = win1:image():toTensor() + + -- display + if display then + image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} + end + + -- return rendering + return t +end diff --git 
a/contrib/torch/optim/Logger.lua b/contrib/torch/optim/Logger.lua new file mode 100644 index 000000000..31928ecdf --- /dev/null +++ b/contrib/torch/optim/Logger.lua @@ -0,0 +1,190 @@ +--[[ Logger: a simple class to log symbols during training, + and automate plot generation + +Example: + logger = optim.Logger('somefile.log') -- file to save stuff + + for i = 1,N do -- log some symbols during + train_error = ... -- training/testing + test_error = ... + logger:add{['training error'] = train_error, + ['test error'] = test_error} + end + + logger:style{['training error'] = '-', -- define styles for plots + ['test error'] = '-'} + logger:plot() -- and plot + +---- OR --- + + logger = optim.Logger('somefile.log') -- file to save stuff + logger:setNames{'training error', 'test error'} + + for i = 1,N do -- log some symbols during + train_error = ... -- training/testing + test_error = ... + logger:add{train_error, test_error} + end + + logger:style{'-', '-'} -- define styles for plots + logger:plot() -- and plot + +----------- + + logger:setlogscale(true) -- enable logscale on Y-axis + logger:plot() -- and plot +]] +require 'xlua' +local Logger = torch.class('optim.Logger') + +function Logger:__init(filename, timestamp) + if filename then + self.name = filename + os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') + if timestamp then + -- append timestamp to create unique log file + filename = filename .. '-'..os.date("%Y_%m_%d_%X") + end + self.file = io.open(filename,'w') + self.epsfile = self.name .. '.eps' + else + self.file = io.stdout + self.name = 'stdout' + print(' warning: no path provided, logging to std out') + end + self.empty = true + self.symbols = {} + self.styles = {} + self.names = {} + self.idx = {} + self.figure = nil + self.showPlot = true + self.plotRawCmd = nil + self.defaultStyle = '+' + self.logscale = false +end + +function Logger:setNames(names) + self.names = names + self.empty = false + self.nsymbols = #names + for k,key in pairs(names) do + self.file:write(key .. '\t') + self.symbols[k] = {} + self.styles[k] = {self.defaultStyle} + self.idx[key] = k + end + self.file:write('\n') + self.file:flush() + return self +end + +function Logger:add(symbols) + -- (1) first time ? print symbols' names on first row + if self.empty then + self.empty = false + self.nsymbols = #symbols + for k,val in pairs(symbols) do + self.file:write(k .. '\t') + self.symbols[k] = {} + self.styles[k] = {self.defaultStyle} + self.names[k] = k + end + self.idx = self.names + self.file:write('\n') + end + -- (2) print all symbols on one row + for k,val in pairs(symbols) do + if type(val) == 'number' then + self.file:write(string.format('%11.4e',val) .. '\t') + elseif type(val) == 'string' then + self.file:write(val .. '\t') + else + xlua.error('can only log numbers and strings', 'Logger') + end + end + self.file:write('\n') + self.file:flush() + -- (3) save symbols in internal table + for k,val in pairs(symbols) do + table.insert(self.symbols[k], val) + end +end + +function Logger:style(symbols) + for name,style in pairs(symbols) do + if type(style) == 'string' then + self.styles[name] = {style} + elseif type(style) == 'table' then + self.styles[name] = style + else + xlua.error('style should be a string or a table of strings','Logger') + end + end + return self +end + +function Logger:setlogscale(state) + self.logscale = state +end + +function Logger:display(state) + self.showPlot = state +end + +function Logger:plot(...) 
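+   -- Plot all logged symbols when called with no arguments, or only the
+   -- named symbols, e.g. logger:plot('training error', 'test error').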
+ if not xlua.require('gnuplot') then + if not self.warned then + print(' warning: cannot plot with this version of Torch') + self.warned = true + end + return + end + local plotit = false + local plots = {} + local plotsymbol = + function(name,list) + if #list > 1 then + local nelts = #list + local plot_y = torch.Tensor(nelts) + for i = 1,nelts do + plot_y[i] = list[i] + end + for _,style in ipairs(self.styles[name]) do + table.insert(plots, {self.names[name], plot_y, style}) + end + plotit = true + end + end + local args = {...} + if not args[1] then -- plot all symbols + for name,list in pairs(self.symbols) do + plotsymbol(name,list) + end + else -- plot given symbols + for _,name in ipairs(args) do + plotsymbol(self.idx[name], self.symbols[self.idx[name]]) + end + end + if plotit then + if self.showPlot then + self.figure = gnuplot.figure(self.figure) + if self.logscale then gnuplot.logscale('on') end + gnuplot.plot(plots) + if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end + gnuplot.grid('on') + gnuplot.title('') + end + if self.epsfile then + os.execute('rm -f "' .. self.epsfile .. '"') + local epsfig = gnuplot.epsfigure(self.epsfile) + if self.logscale then gnuplot.logscale('on') end + gnuplot.plot(plots) + if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end + gnuplot.grid('on') + gnuplot.title('') + gnuplot.plotflush() + gnuplot.close(epsfig) + end + end +end diff --git a/contrib/torch/optim/adadelta.lua b/contrib/torch/optim/adadelta.lua new file mode 100644 index 000000000..7cc058d29 --- /dev/null +++ b/contrib/torch/optim/adadelta.lua @@ -0,0 +1,55 @@ +--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 + +ARGS: +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `config` : a table of hyper-parameters +- `config.rho` : interpolation parameter +- `config.eps` : for numerical stability +- `config.weightDecay` : weight decay +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.paramVariance` : vector of temporal variances of parameters +- `state.accDelta` : vector of accummulated delta of gradients +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update +]] +function optim.adadelta(opfunc, x, config, state) + -- (0) get/update state + if config == nil and state == nil then + print('no state table, ADADELTA initializing') + end + local config = config or {} + local state = state or config + local rho = config.rho or 0.9 + local eps = config.eps or 1e-6 + local wd = config.weightDecay or 0 + state.evalCounter = state.evalCounter or 0 + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- (3) parameter update + if not state.paramVariance then + state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() + state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() + state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() + state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() + end + state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) + state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() + state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) + x:add(-1, state.delta) + state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) + -- (4) update evaluation counter + 
state.evalCounter = state.evalCounter + 1 + + -- return x*, f(x) before optimization + return x,{fx} +end diff --git a/contrib/torch/optim/adagrad.lua b/contrib/torch/optim/adagrad.lua new file mode 100644 index 000000000..6860c4317 --- /dev/null +++ b/contrib/torch/optim/adagrad.lua @@ -0,0 +1,55 @@ +--[[ ADAGRAD implementation for SGD + +ARGS: +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.learningRate` : learning rate +- `state.paramVariance` : vector of temporal variances of parameters +- `state.weightDecay` : scalar that controls weight decay +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update + +]] +function optim.adagrad(opfunc, x, config, state) + -- (0) get/update state + if config == nil and state == nil then + print('no state table, ADAGRAD initializing') + end + local config = config or {} + local state = state or config + local lr = config.learningRate or 1e-3 + local lrd = config.learningRateDecay or 0 + local wd = config.weightDecay or 0 + state.evalCounter = state.evalCounter or 0 + local nevals = state.evalCounter + + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- (2) weight decay with a single parameter + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- (3) learning rate decay (annealing) + local clr = lr / (1 + nevals*lrd) + + -- (4) parameter update with single or individual learning rates + if not state.paramVariance then + state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() + state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) + end + state.paramVariance:addcmul(1,dfdx,dfdx) + state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() + x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) + + -- (5) update evaluation counter + state.evalCounter = state.evalCounter + 1 + + -- return x*, f(x) before optimization + return x,{fx} +end diff --git a/contrib/torch/optim/adam.lua b/contrib/torch/optim/adam.lua new file mode 100644 index 000000000..2e127e96a --- /dev/null +++ b/contrib/torch/optim/adam.lua @@ -0,0 +1,72 @@ +--[[ An implementation of Adam https://arxiv.org/abs/1412.6980 + +ARGS: + +- 'opfunc' : a function that takes a single input (X), the point + of a evaluation, and returns f(X) and df/dX +- 'x' : the initial point +- 'config` : a table with configuration parameters for the optimizer +- 'config.learningRate' : learning rate +- `config.learningRateDecay` : learning rate decay +- 'config.beta1' : first moment coefficient +- 'config.beta2' : second moment coefficient +- 'config.epsilon' : for numerical stability +- 'config.weightDecay' : weight decay +- 'state' : a table describing the state of the optimizer; after each + call the state is modified + +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update + +]] + +function optim.adam(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 0.001 + local lrd = config.learningRateDecay or 0 + + local beta1 = config.beta1 or 0.9 + local beta2 = config.beta2 or 0.999 + local epsilon = config.epsilon or 1e-8 + local wd = config.weightDecay or 0 + + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- Initialization + state.t = 
state.t or 0 + -- Exponential moving average of gradient values + state.m = state.m or x.new(dfdx:size()):zero() + -- Exponential moving average of squared gradient values + state.v = state.v or x.new(dfdx:size()):zero() + -- A tmp tensor to hold the sqrt(v) + epsilon + state.denom = state.denom or x.new(dfdx:size()):zero() + + -- (3) learning rate decay (annealing) + local clr = lr / (1 + state.t*lrd) + + state.t = state.t + 1 + + -- Decay the first and second moment running average coefficient + state.m:mul(beta1):add(1-beta1, dfdx) + state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) + + state.denom:copy(state.v):sqrt():add(epsilon) + + local biasCorrection1 = 1 - beta1^state.t + local biasCorrection2 = 1 - beta2^state.t + local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1 + -- (4) update x + x:addcdiv(-stepSize, state.m, state.denom) + + -- return x*, f(x) before optimization + return x, {fx} +end diff --git a/contrib/torch/optim/adamax.lua b/contrib/torch/optim/adamax.lua new file mode 100644 index 000000000..2b6487720 --- /dev/null +++ b/contrib/torch/optim/adamax.lua @@ -0,0 +1,66 @@ +--[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf + +ARGS: + +- 'opfunc' : a function that takes a single input (X), the point + of a evaluation, and returns f(X) and df/dX +- 'x' : the initial point +- 'config` : a table with configuration parameters for the optimizer +- 'config.learningRate' : learning rate +- 'config.beta1' : first moment coefficient +- 'config.beta2' : second moment coefficient +- 'config.epsilon' : for numerical stability +- 'state' : a table describing the state of the optimizer; + after each call the state is modified. + +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update + +]] + +function optim.adamax(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 0.002 + + local beta1 = config.beta1 or 0.9 + local beta2 = config.beta2 or 0.999 + local epsilon = config.epsilon or 1e-38 + local wd = config.weightDecay or 0 + + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- Initialization + state.t = state.t or 0 + -- Exponential moving average of gradient values + state.m = state.m or x.new(dfdx:size()):zero() + -- Exponential moving average of the infinity norm + state.u = state.u or x.new(dfdx:size()):zero() + -- A tmp tensor to hold the input to max() + state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() + + state.t = state.t + 1 + + -- Update biased first moment estimate. + state.m:mul(beta1):add(1-beta1, dfdx) + -- Update the exponentially weighted infinity norm. 
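+   -- i.e. u_t = max(beta2 * u_{t-1}, |dfdx| + epsilon), computed elementwise
+   -- by stacking both terms into the 2-row buffer and taking the max over dim 1.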
+ state.max[1]:copy(state.u):mul(beta2) + state.max[2]:copy(dfdx):abs():add(epsilon) + state.u:max(state.max, 1) + + local biasCorrection1 = 1 - beta1^state.t + local stepSize = lr/biasCorrection1 + -- (2) update x + x:addcdiv(-stepSize, state.m, state.u) + + -- return x*, f(x) before optimization + return x, {fx} +end diff --git a/contrib/torch/optim/asgd.lua b/contrib/torch/optim/asgd.lua new file mode 100644 index 000000000..cc1c459f3 --- /dev/null +++ b/contrib/torch/optim/asgd.lua @@ -0,0 +1,73 @@ +--[[ An implementation of ASGD + +ASGD: + + x := (1 - lambda eta_t) x - eta_t df/dx(z,x) + a := a + mu_t [ x - a ] + + eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 + mu_t = 1/max(1,t-t0) + +implements ASGD algoritm as in L.Bottou's sgd-2.0 + +ARGS: + +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.eta0` : learning rate +- `state.lambda` : decay term +- `state.alpha` : power for eta update +- `state.t0` : point at which to start averaging + +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update +- `ax` : the averaged x vector + +(Clement Farabet, 2012) +--]] +function optim.asgd(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + config.eta0 = config.eta0 or 1e-4 + config.lambda = config.lambda or 1e-4 + config.alpha = config.alpha or 0.75 + config.t0 = config.t0 or 1e6 + + -- (hidden state) + state.eta_t = state.eta_t or config.eta0 + state.mu_t = state.mu_t or 1 + state.t = state.t or 0 + + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- (2) decay term + x:mul(1 - config.lambda*state.eta_t) + + -- (3) update x + x:add(-state.eta_t, dfdx) + + -- (4) averaging + state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() + state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) + if state.mu_t ~= 1 then + state.tmp:copy(x) + state.tmp:add(-1,state.ax):mul(state.mu_t) + state.ax:add(state.tmp) + else + state.ax:copy(x) + end + + -- (5) update eta_t and mu_t + state.t = state.t + 1 + state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) + state.mu_t = 1 / math.max(1, state.t - config.t0) + + -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) + return x,{fx},state.ax +end diff --git a/contrib/torch/optim/cg.lua b/contrib/torch/optim/cg.lua new file mode 100644 index 000000000..842a7d569 --- /dev/null +++ b/contrib/torch/optim/cg.lua @@ -0,0 +1,208 @@ +--[[ + +This cg implementation is a rewrite of minimize.m written by Carl +E. Rasmussen. It is supposed to produce exactly same results (give +or take numerical accuracy due to some changed order of +operations). You can compare the result on rosenbrock with minimize.m. +http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html + + [x fx c] = minimize([0 0]', 'rosenbrock', -25) + +Note that we limit the number of function evaluations only, it seems much +more important in practical use. + +ARGS: + +- `opfunc` : a function that takes a single input, the point of evaluation. +- `x` : the initial point +- `state` : a table of parameters and temporary allocations. 
+- `state.maxEval` : max number of function evaluations +- `state.maxIter` : max number of iterations +- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage +- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage + +RETURN: + +- `x*` : the new x vector, at the optimal point +- `f` : a table of all function values where + `f[1]` is the value of the function before any optimization and + `f[#f]` is the final fully optimized value, at x* + +(Koray Kavukcuoglu, 2012) +--]] +function optim.cg(opfunc, x, config, state) + -- parameters + local config = config or {} + local state = state or config + local rho = config.rho or 0.01 + local sig = config.sig or 0.5 + local int = config.int or 0.1 + local ext = config.ext or 3.0 + local maxIter = config.maxIter or 20 + local ratio = config.ratio or 100 + local maxEval = config.maxEval or maxIter*1.25 + local red = 1 + + local verbose = config.verbose or 0 + + local i = 0 + local ls_failed = 0 + local fx = {} + + -- we need three points for the interpolation/extrapolation stuff + local z1,z2,z3 = 0,0,0 + local d1,d2,d3 = 0,0,0 + local f1,f2,f3 = 0,0,0 + + local df1 = state.df1 or x.new() + local df2 = state.df2 or x.new() + local df3 = state.df3 or x.new() + local tdf + + df1:resizeAs(x) + df2:resizeAs(x) + df3:resizeAs(x) + + -- search direction + local s = state.s or x.new() + s:resizeAs(x) + + -- we need a temp storage for X + local x0 = state.x0 or x.new() + local f0 = 0 + local df0 = state.df0 or x.new() + x0:resizeAs(x) + df0:resizeAs(x) + + -- evaluate at initial point + f1,tdf = opfunc(x) + fx[#fx+1] = f1 + df1:copy(tdf) + i=i+1 + + -- initial search direction + s:copy(df1):mul(-1) + + d1 = -s:dot(s ) -- slope + z1 = red/(1-d1) -- initial step + + while i < math.abs(maxEval) do + + x0:copy(x) + f0 = f1 + df0:copy(df1) + + x:add(z1,s) + f2,tdf = opfunc(x) + df2:copy(tdf) + i=i+1 + d2 = df2:dot(s) + f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 + local m = math.min(maxIter,maxEval-i) + local success = 0 + local limit = -1 + + while true do + while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do + limit = z1 + if f2 > f1 then + z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) + else + local A = 6*(f2-f3)/z3+3*(d2+d3) + local B = 3*(f3-f2)-z3*(d3+2*d2) + z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A + end + if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then + z2 = z3/2; + end + z2 = math.max(math.min(z2, int*z3),(1-int)*z3); + z1 = z1 + z2; + x:add(z2,s) + f2,tdf = opfunc(x) + df2:copy(tdf) + i=i+1 + m = m - 1 + d2 = df2:dot(s) + z3 = z3-z2; + end + if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then + break + elseif d2 > sig*d1 then + success = 1; + break; + elseif m == 0 then + break; + end + local A = 6*(f2-f3)/z3+3*(d2+d3); + local B = 3*(f3-f2)-z3*(d3+2*d2); + z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) + + if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then + if limit < -0.5 then + z2 = z1 * (ext -1) + else + z2 = (limit-z1)/2 + end + elseif (limit > -0.5) and (z2+z1) > limit then + z2 = (limit-z1)/2 + elseif limit < -0.5 and (z2+z1) > z1*ext then + z2 = z1*(ext-1) + elseif z2 < -z3*int then + z2 = -z3*int + elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then + z2 = (limit-z1)*(1-int) + end + f3=f2; d3=d2; z3=-z2; + z1 = z1+z2; + x:add(z2,s) + + f2,tdf = opfunc(x) + df2:copy(tdf) + i=i+1 + m = m - 1 + d2 = df2:dot(s) + end + if success == 1 then + f1 = f2 + fx[#fx+1] = f1; + local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) + s:mul(ss) + s:add(-1,df2) + local tmp = df1:clone() + 
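+      -- swap df1 and df2 so that df1 holds the most recent gradient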
df1:copy(df2) + df2:copy(tmp) + d2 = df1:dot(s) + if d2> 0 then + s:copy(df1) + s:mul(-1) + d2 = -s:dot(s) + end + + z1 = z1 * math.min(ratio, d1/(d2-1e-320)) + d1 = d2 + ls_failed = 0 + else + x:copy(x0) + f1 = f0 + df1:copy(df0) + if ls_failed or i>maxEval then + break + end + local tmp = df1:clone() + df1:copy(df2) + df2:copy(tmp) + s:copy(df1) + s:mul(-1) + d1 = -s:dot(s) + z1 = 1/(1-d1) + ls_failed = 1 + end + end + state.df0 = df0 + state.df1 = df1 + state.df2 = df2 + state.df3 = df3 + state.x0 = x0 + state.s = s + return x,fx,i +end diff --git a/contrib/torch/optim/checkgrad.lua b/contrib/torch/optim/checkgrad.lua new file mode 100644 index 000000000..0382b2132 --- /dev/null +++ b/contrib/torch/optim/checkgrad.lua @@ -0,0 +1,52 @@ +--[[ An implementation of a simple numerical gradient checker. + +ARGS: + +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `eps` : the epsilon to use for the numerical check (default is 1e-7) + +RETURN: + +- `diff` : error in the gradient, should be near tol +- `dC` : exact gradient at point +- `dC_est` : numerically estimates gradient at point + +]]-- + + +-- function that numerically checks gradient of NCA loss: +function optim.checkgrad(opfunc, x, eps) + + -- compute true gradient: + local Corg,dC = opfunc(x) + dC:resize(x:size()) + + local Ctmp -- temporary value + local isTensor = torch.isTensor(Corg) + if isTensor then + Ctmp = Corg.new(Corg:size()) + end + + -- compute numeric approximations to gradient: + local eps = eps or 1e-7 + local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) + for i = 1,dC:size(1) do + local tmp = x[i] + x[i] = x[i] + eps + local C1 = opfunc(x) + if isTensor then + Ctmp:copy(C1) + C1 = Ctmp + end + x[i] = x[i] - 2 * eps + local C2 = opfunc(x) + x[i] = tmp + dC_est[i] = (C1 - C2) / (2 * eps) + end + + -- estimate error of gradient: + local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) + return diff,dC,dC_est +end diff --git a/contrib/torch/optim/cmaes.lua b/contrib/torch/optim/cmaes.lua new file mode 100644 index 000000000..74cd58a0c --- /dev/null +++ b/contrib/torch/optim/cmaes.lua @@ -0,0 +1,270 @@ +require 'torch' +require 'math' + +local BestSolution = {} +--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), +ported from https://www.lri.fr/~hansen/barecmaes2.html. + +Parameters +---------- +ARGS: + +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX. Note that df/dX is not used +- `x` : the initial point +- `state.sigma` + float, initial step-size (standard deviation in each + coordinate) +- `state.maxEval` + int, maximal number of function evaluations +- `state.ftarget` + float, target function value +- `state.popsize` + population size. If this is left empty, + 4 + int(3 * log(|x|)) will be used +- `state.ftarget` + stop if fitness < ftarget +- `state.verb_disp` + int, display on console every verb_disp iteration, 0 for never + +RETURN: +- `x*` : the new `x` vector, at the optimal point +- `f` : a table of all function values: + `f[1]` is the value of the function before any optimization and + `f[#f]` is the final fully optimized value, at `x*` +--]] +function optim.cmaes(opfunc, x, config, state) + if (x.triu == nil or x.diag == nil) then + error('Unsupported Tensor ' .. x:type() .. 
" please use Float- or DoubleTensor for x") + end + -- process input parameters + local config = config or {} + local state = state or config + local xmean = x:clone():view(-1) -- distribution mean, a flattened copy + local N = xmean:size(1) -- number of objective variables/problem dimension + local sigma = state.sigma -- coordinate wise standard deviation (step size) + local ftarget = state.ftarget -- stop if fitness < ftarget + local maxEval = tonumber(state.maxEval) or 1e3*N^2 + local objfunc = opfunc + local verb_disp = state.verb_disp -- display step size + local min_iterations = state.min_iterations or 1 + + local lambda = state.popsize -- population size, offspring number + -- Strategy parameter setting: Selection + if state.popsize == nil then + lambda = 4 + math.floor(3 * math.log(N)) + end + + local mu = lambda / 2 -- number of parents/points for recombination + local weights = torch.range(0,mu-1):apply(function(i) + return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights + weights:div(weights:sum()) -- normalize recombination weights array + local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i + weights = weights:typeAs(x) + + -- Strategy parameter setting: Adaptation + local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C + local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control + local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C + local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update + local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 + + -- Initialize dynamic (internal) state variables + local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C + local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma + local B = torch.eye(N):typeAs(x) -- B defines the coordinate system + local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling + local C = torch.eye(N):typeAs(x) -- covariance matrix + if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig + error('torch.symeig not available for ' .. x:type() .. 
+ " please use Float- or DoubleTensor for x") + end + local candidates = torch.Tensor(lambda,N):typeAs(x) + local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 + local eigeneval = 0 -- tracking the update of B and D + local counteval = 0 + local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination + local fitvals = torch.Tensor(lambda)-- fitness values + local best = BestSolution.new(nil,nil,counteval) + local iteration = 0 -- iteration of the optimize loop + + + local function ask() + --[[return a list of lambda candidate solutions according to + m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) + --]] + -- Eigendecomposition: first update B, D and invsqrtC from C + -- postpone in case to achieve O(N^2) + if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then + eigeneval = counteval + C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry + D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) + D = torch.sqrt(D) -- D contains standard deviations now + invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) + end + for k=1,lambda do --repeat lambda times + local z = D:clone():normal(0,1):cmul(D) + candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) + end + + return candidates + end + + + local function tell(arx) + --[[update the evolution paths and the distribution parameters m, + sigma, and C within CMA-ES. + + Parameters + ---------- + `arx` + a list of solutions, presumably from `ask()` + `fitvals` + the corresponding objective function values --]] + -- bookkeeping, preparation + counteval = counteval + lambda -- slightly artificial to do here + local xold = xmean:clone() + + -- Sort by fitness and compute weighted mean into xmean + local arindex = nil --sorted indices + fitvals, arindex = torch.sort(fitvals) + arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions + + table.insert(f_hist, fitvals[1]) --append best fitness to history + best:update(arx[1], fitvals[1], counteval) + + xmean:zero() + xmean:addmv(arx:t(), weights) --dot product + + -- Cumulation: update evolution paths + local y = xmean - xold + local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) + + local c = (cs * (2-cs) * mueff)^0.5 / sigma + ps = ps - ps * cs + z * c -- exponential decay on ps + local hsig = (torch.sum(torch.pow(ps,2)) / + (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) + hsig = hsig and 1.0 or 0.0 --use binary numbers + + c = (cc * (2-cc) * mueff)^0.5 / sigma + pc = pc - pc * cc + y * c * hsig -- exponential decay on pc + + -- Adapt covariance matrix C + local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) + -- for a minor adjustment to the variance loss by hsig + for i=1,N do + for j=1,N do + local r = torch.range(1,mu) + r:apply(function(k) + return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) + local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update + C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + + c1 * pc[i]*pc[j] + cmu * Cmuij) + end + end + + -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 + sigma = sigma * math.exp(math.min(0.6, + (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) + end + + local function stop() + --[[return satisfied termination conditions in a table like + {'termination reason':value, ...}, for example {'tolfun':1e-12}, + or the empty table {}--]] + local res = {} + if counteval > 0 then + if counteval >= maxEval then + res['evals'] = maxEval + end + if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then + res['ftarget'] = ftarget + end + if torch.max(D) 
> 1e7 * torch.min(D) then + res['condition'] = 1e7 + end + if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then + res['tolfun'] = 1e-12 + end + if sigma * torch.max(D) < 1e-11 then + -- remark: max(D) >= max(diag(C))^0.5 + res['tolx'] = 1e-11 + end + end + return res + end + + local function disp(verb_modulo) + --[[display some iteration info--]] + if verb_disp == 0 then + return nil + end + local iteration = counteval / lambda + + if iteration == 1 or iteration % (10*verb_modulo) == 0 then + print('evals:\t ax-ratio max(std) f-value') + end + if iteration <= 2 or iteration % verb_modulo == 0 then + local max_std = math.sqrt(torch.max(torch.diag(C))) + print(tostring(counteval).. ': ' .. + string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) + .. tostring(fitvals[1])) + end + + return nil + end + + while next(stop()) == nil or iteration < min_iterations do + iteration = iteration + 1 + + local X = ask() -- deliver candidate solutions + for i=1, lambda do + -- put candidate tensor back in input shape and evaluate in opfunc + local candidate = X[i]:viewAs(x) + fitvals[i] = objfunc(candidate) + end + + tell(X) + disp(verb_disp) + end + + local bestmu, f, c = best:get() + if verb_disp > 0 then + for k, v in pairs(stop()) do + print('termination by', k, '=', v) + end + print('best f-value =', f) + print('solution = ') + print(bestmu) + print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) + end + table.insert(f_hist, f) + + return bestmu, f_hist, counteval +end + + + +BestSolution.__index = BestSolution +function BestSolution.new(x, f, evals) + local self = setmetatable({}, BestSolution) + self.x = x + self.f = f + self.evals = evals + return self +end + +function BestSolution.update(self, arx, arf, evals) + --[[initialize the best solution with `x`, `f`, and `evals`. + Better solutions have smaller `f`-values.--]] + if self.f == nil or arf < self.f then + self.x = arx:clone() + self.f = arf + self.evals = evals + end + return self +end + +function BestSolution.get(self) + return self.x, self.f, self.evals +end diff --git a/contrib/torch/optim/de.lua b/contrib/torch/optim/de.lua new file mode 100644 index 000000000..1e8e8001d --- /dev/null +++ b/contrib/torch/optim/de.lua @@ -0,0 +1,109 @@ +--[[ An implementation of `DE` (Differential Evolution), + +ARGS: + + -`opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX. Note that df/dX is not used + -`x` : the initial point + -`state.popsize`: population size. 
If this is left empty, 10*d will be used + -`state.scaleFactor`: float, usually between 0.4 and 1 + -`state.crossoverRate`: float, usually between 0.1 and 0.9 + -`state.maxEval`: int, maximal number of function evaluations + +RETURN: + - `x*` : the new `x` vector, at the optimal point + - `f` : a table of all function values: + `f[1]` is the value of the function before any optimization and + `f[#f]` is the final fully optimized value, at `x*` +]] + +require 'torch' + +function optim.de(opfunc, x, config, state) + -- process input parameters + local config = config or {} + local state = state + local popsize = config.popsize -- population size + local scaleFactor = config.scaleFactor -- scale factor + local crossoverRate = config.crossoverRate -- crossover rate + local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations + local maxRegion = config.maxRegion -- upper bound of search region + local minRegion = config.minRegion -- lower bound of search region + local xmean = x:clone():view(-1) -- distribution mean, a flattened copy + local D = xmean:size(1) -- number of objective variables/problem dimension + + if config.popsize == nil then + popsize = 10 * D + end + if config.maxRegion == nil then + maxRegion = 30 + end + if config.minRegion == nil then + minRegion = -30 + end + + -- Initialize population + local fx = x.new(maxFEs) + local pop = x.new(popsize, D) + local children = x.new(popsize, D) + local fitness = x.new(popsize) + local children_fitness = x.new(popsize) + local fes = 1 -- number of function evaluations + local best_fitness + local best_solution = x.new(D) + + -- Initialize population and evaluate the its fitness value + local gen = torch.Generator() + torch.manualSeed(gen, 1) + + pop:uniform(gen, minRegion, maxRegion) + for i = 1, popsize do + fitness[i] = opfunc(pop[i]) + fx[fes] = fitness[i] + fes = fes + 1 + end + + -- Find the best solution + local index + best_fitness, index = fitness:max(1) + best_fitness = best_fitness[1] + index = index[1] + best_solution:copy(pop[index]) + + -- Main loop + while fes < maxFEs do + local r1, r2 + for i = 1, popsize do + repeat + r1 = torch.random(gen, 1, popsize) + until(r1 ~= i) + repeat + r2 = torch.random(gen, 1, popsize) + until(r2 ~= r1 and r2 ~= i) + + local jrand = torch.random(gen, 1, D) + for j = 1, D do + if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then + children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j]) + else + children[i][j] = pop[i][j] + end + end + children_fitness[i] = opfunc(children[i]) + fx[fes] = children_fitness[i] + fes = fes + 1 + end + + for i = 1, popsize do + if children_fitness[i] <= fitness[i] then + pop[i]:copy(children[i]) + fitness[i] = children_fitness[i] + if fitness[i] < best_fitness then + best_fitness = fitness[i] + best_solution:copy(children[i]) + end + end + end + end + return best_solution, fx +end diff --git a/contrib/torch/optim/fista.lua b/contrib/torch/optim/fista.lua new file mode 100644 index 000000000..c8c6f5e43 --- /dev/null +++ b/contrib/torch/optim/fista.lua @@ -0,0 +1,192 @@ +--[[ FISTA with backtracking line search + +- `f` : smooth function +- `g` : non-smooth function +- `pl` : minimizer of intermediate problem Q(x,y) +- `xinit` : initial point +- `params` : table of parameters (**optional**) +- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) +- `params.Lstep` : step size multiplier at each iteration (1.5) +- `params.maxiter` : max number of iterations (50) +- `params.maxline` : max number of line search 
iterations per iteration (20) +- `params.errthres`: Error thershold for convergence check (1e-4) +- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) +- `params.verbose` : store each iteration solution and print detailed info (false) + +On output, `params` will contain these additional fields that can be reused. + +- `params.L` : last used L value will be written. + +These are temporary storages needed by the algo and if the same params object is +passed a second time, these same storages will be used without new allocation. + +- `params.xkm` : previous iterarion point +- `params.y` : fista iteration +- `params.ply` : ply = pl(y - 1/L grad(f)) + +Returns the solution x and history of {function evals, number of line search ,...} + +Algorithm is published in + + @article{beck-fista-09, + Author = {Beck, Amir and Teboulle, Marc}, + Journal = {SIAM J. Img. Sci.}, + Number = {1}, + Pages = {183--202}, + Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems}, + Volume = {2}, + Year = {2009}} +]] +function optim.FistaLS(f, g, pl, xinit, params) + + local params = params or {} + local L = params.L or 0.1 + local Lstep = params.Lstep or 1.5 + local maxiter = params.maxiter or 50 + local maxline = params.maxline or 20 + local errthres = params.errthres or 1e-4 + local doFistaUpdate = params.doFistaUpdate + local verbose = params.verbose + + -- temporary allocations + params.xkm = params.xkm or torch.Tensor() + params.y = params.y or torch.Tensor() + params.ply = params.ply or torch.Tensor() + local xkm = params.xkm -- previous iteration + local y = params.y -- fista iteration + local ply = params.ply -- soft shrinked y + + -- we start from all zeros + local xk = xinit + xkm:resizeAs(xk):zero() + ply:resizeAs(xk):zero() + y:resizeAs(xk):zero() + + local history = {} -- keep track of stuff + local niter = 0 -- number of iterations done + local converged = false -- are we done? 
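+   -- FISTA momentum sequence: t(k+1) = (1 + sqrt(1 + 4*t(k)^2)) / 2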
+ local tk = 1 -- momentum param for FISTA + local tkp = 0 + + + local gy = g(y) + local fval = math.huge -- fval = f+g + while not converged and niter < maxiter do + + -- run through smooth function (code is input, input is target) + -- get derivatives from smooth function + local fy,gfy = f(y,'dx') + --local gfy = f(y) + + local fply = 0 + local gply = 0 + local Q = 0 + + ---------------------------------------------- + -- do line search to find new current location starting from fista loc + local nline = 0 + local linesearchdone = false + while not linesearchdone do + -- take a step in gradient direction of smooth function + ply:copy(y) + ply:add(-1/L,gfy) + + -- and solve for minimum of auxiliary problem + pl(ply,L) + -- this is candidate for new current iteration + xk:copy(ply) + + -- evaluate this point F(ply) + fply = f(ply) + + -- ply - y + ply:add(-1, y) + -- + local Q2 = gfy:dot(ply) + -- L/2 ||beta-y||^2 + local Q3 = L/2 * ply:dot(ply) + -- Q(beta,y) = F(y) + + L/2||beta-y||^2 + G(beta) + Q = fy + Q2 + Q3 + + if verbose then + print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3)) + end + -- check if F(beta) < Q(pl(y),\t) + if fply <= Q then --and Fply + Gply <= F then + -- now evaluate G here + linesearchdone = true + elseif nline >= maxline then + linesearchdone = true + xk:copy(xkm) -- if we can't find a better point, current iter = previous iter + --print('oops') + else + L = L * Lstep + end + nline = nline + 1 + end + -- end line search + --------------------------------------------- + + --------------------------------------------- + -- FISTA + --------------------------------------------- + if doFistaUpdate then + -- do the FISTA step + tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 + -- x(k-1) = x(k-1) - x(k) + xkm:add(-1,xk) + -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) + y:copy(xk) + y:add( (1-tk)/tkp , xkm) + -- store for next iterations + -- x(k-1) = x(k) + xkm:copy(xk) + else + y:copy(xk) + end + -- t(k) = t(k+1) + tk = tkp + fply = f(y) + gply = g(y) + if verbose then + print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) + end + + niter = niter + 1 + + -- bookeeping + fval = fply + gply + history[niter] = {} + history[niter].nline = nline + history[niter].L = L + history[niter].F = fval + history[niter].Fply = fply + history[niter].Gply = gply + history[niter].Q = Q + params.L = L + if verbose then + history[niter].xk = xk:clone() + history[niter].y = y:clone() + end + + -- are we done? 
+ if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then + converged = true + xinit:copy(y) + return y,history + end + + if niter >= maxiter then + xinit:copy(y) + return y,history + end + + --if niter > 1 and history[niter].F > history[niter-1].F then + --print(niter, 'This was supposed to be a convex function, we are going up') + --converged = true + --return xk,history + --end + end + error('not supposed to be here') +end + diff --git a/contrib/torch/optim/init.lua b/contrib/torch/optim/init.lua new file mode 100644 index 000000000..a045bd8a2 --- /dev/null +++ b/contrib/torch/optim/init.lua @@ -0,0 +1,33 @@ + +require 'torch' + +optim = {} + +-- optimizations +require('optim.sgd') +require('optim.cg') +require('optim.asgd') +require('optim.nag') +require('optim.fista') +require('optim.lbfgs') +require('optim.adagrad') +require('optim.rprop') +require('optim.adam') +require('optim.adamax') +require('optim.rmsprop') +require('optim.adadelta') +require('optim.cmaes') +require('optim.de') + +-- line search functions +require('optim.lswolfe') + +-- helpers +require('optim.polyinterp') +require('optim.checkgrad') + +-- tools +require('optim.ConfusionMatrix') +require('optim.Logger') + +return optim diff --git a/contrib/torch/optim/lbfgs.lua b/contrib/torch/optim/lbfgs.lua new file mode 100644 index 000000000..d850fcbb3 --- /dev/null +++ b/contrib/torch/optim/lbfgs.lua @@ -0,0 +1,268 @@ +--[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) + +This implementation of L-BFGS relies on a user-provided line +search function (state.lineSearch). If this function is not +provided, then a simple learningRate is used to produce fixed +size steps. Fixed size steps are much less costly than line +searches, and can be useful for stochastic problems. + +The learning rate is used even when a line search is provided. +This is also useful for large-scale stochastic problems, where +opfunc is a noisy approximation of f(x). In that case, the learning +rate allows a reduction of confidence in the step size. 
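+
+A minimal usage sketch (assuming a caller-provided `feval(x)` that returns the
+loss and the gradient at `x`, and `params`, the flattened parameter vector):
+
+   local state = {maxIter = 100, learningRate = 1e-1}
+   local newx, fhist = optim.lbfgs(feval, params, state)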
+ +ARGS: + +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.maxIter` : Maximum number of iterations allowed +- `state.maxEval` : Maximum number of function evaluations +- `state.tolFun` : Termination tolerance on the first-order optimality +- `state.tolX` : Termination tol on progress in terms of func/param changes +- `state.lineSearch` : A line search function +- `state.learningRate` : If no line search provided, then a fixed step size is used + +RETURN: +- `x*` : the new `x` vector, at the optimal point +- `f` : a table of all function values: + `f[1]` is the value of the function before any optimization and + `f[#f]` is the final fully optimized value, at `x*` + +(Clement Farabet, 2012) +]] +function optim.lbfgs(opfunc, x, config, state) + -- get/update state + local config = config or {} + local state = state or config + local maxIter = tonumber(config.maxIter) or 20 + local maxEval = tonumber(config.maxEval) or maxIter*1.25 + local tolFun = config.tolFun or 1e-5 + local tolX = config.tolX or 1e-9 + local nCorrection = config.nCorrection or 100 + local lineSearch = config.lineSearch + local lineSearchOpts = config.lineSearchOptions + local learningRate = config.learningRate or 1 + local isverbose = config.verbose or false + + state.funcEval = state.funcEval or 0 + state.nIter = state.nIter or 0 + + -- verbose function + local verbose + if isverbose then + verbose = function(...) print(' ', ...) end + else + verbose = function() end + end + + -- import some functions + local abs = math.abs + local min = math.min + + -- evaluate initial f(x) and df/dx + local f,g = opfunc(x) + local f_hist = {f} + local currentFuncEval = 1 + state.funcEval = state.funcEval + 1 + local p = g:size(1) + + -- check optimality of initial point + state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 + tmp1:copy(g):abs() + if tmp1:sum() <= tolFun then + -- optimality condition below tolFun + verbose('optimality condition below tolFun') + return x,f_hist + end + + if not state.dir_bufs then + -- reusable buffers for y's and s's, and their histories + verbose('creating recyclable direction/step/history buffers') + state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) + state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) + for i=1,#state.dir_bufs do + state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) + state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) + end + end + + -- variables cached in state (for tracing) + local d = state.d + local t = state.t + local old_dirs = state.old_dirs + local old_stps = state.old_stps + local Hdiag = state.Hdiag + local g_old = state.g_old + local f_old = state.f_old + + -- optimize for a max of maxIter iterations + local nIter = 0 + while nIter < maxIter do + -- keep track of nb of iterations + nIter = nIter + 1 + state.nIter = state.nIter + 1 + + ------------------------------------------------------------ + -- compute gradient descent direction + ------------------------------------------------------------ + if state.nIter == 1 then + d = g:clone():mul(-1) -- -g + old_dirs = {} + old_stps = {} + Hdiag = 1 + else + -- do lbfgs update (update memory) + local y = table.remove(state.dir_bufs) -- pop + local s = table.remove(state.stp_bufs) + y:add(g, -1, g_old) -- g - g_old + s:mul(d, t) -- d*t + local ys = y:dot(s) -- y*s + if ys > 1e-10 
then + -- updating memory + if #old_dirs == nCorrection then + -- shift history by one (limited-memory) + local removed1 = table.remove(old_dirs, 1) + local removed2 = table.remove(old_stps, 1) + table.insert(state.dir_bufs, removed1) + table.insert(state.stp_bufs, removed2) + end + + -- store new direction/step + table.insert(old_dirs, s) + table.insert(old_stps, y) + + -- update scale of initial Hessian approximation + Hdiag = ys / y:dot(y) -- (y*y) + else + -- put y and s back into the buffer pool + table.insert(state.dir_bufs, y) + table.insert(state.stp_bufs, s) + end + + -- compute the approximate (L-BFGS) inverse Hessian + -- multiplied by the gradient + local k = #old_dirs + + -- need to be accessed element-by-element, so don't re-type tensor: + state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro + for i = 1,k do + ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) + end + + -- iteration in L-BFGS loop collapsed to use just one buffer + local q = tmp1 -- reuse tmp1 for the q buffer + -- need to be accessed element-by-element, so don't re-type tensor: + state.al = state.al or torch.zeros(nCorrection) local al = state.al + + q:mul(g, -1) -- -g + for i = k,1,-1 do + al[i] = old_dirs[i]:dot(q) * ro[i] + q:add(-al[i], old_stps[i]) + end + + -- multiply by initial Hessian + r = d -- share the same buffer, since we don't need the old d + r:mul(q, Hdiag) -- q[1] * Hdiag + for i = 1,k do + local be_i = old_stps[i]:dot(r) * ro[i] + r:add(al[i]-be_i, old_dirs[i]) + end + -- final direction is in r/d (same object) + end + g_old = g_old or g:clone() + g_old:copy(g) + f_old = f + + ------------------------------------------------------------ + -- compute step length + ------------------------------------------------------------ + -- directional derivative + local gtd = g:dot(d) -- g * d + + -- check that progress can be made along that direction + if gtd > -tolX then + break + end + + -- reset initial guess for step size + if state.nIter == 1 then + tmp1:copy(g):abs() + t = min(1,1/tmp1:sum()) * learningRate + else + t = learningRate + end + + -- optional line search: user function + local lsFuncEval = 0 + if lineSearch and type(lineSearch) == 'function' then + -- perform line search, using user function + f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) + table.insert(f_hist, f) + else + -- no line search, simply move with fixed-step + x:add(t,d) + if nIter ~= maxIter then + -- re-evaluate function only if not in last iteration + -- the reason we do this: in a stochastic setting, + -- no use to re-evaluate that function here + f,g = opfunc(x) + lsFuncEval = 1 + table.insert(f_hist, f) + end + end + + -- update func eval + currentFuncEval = currentFuncEval + lsFuncEval + state.funcEval = state.funcEval + lsFuncEval + + ------------------------------------------------------------ + -- check conditions + ------------------------------------------------------------ + if nIter == maxIter then + -- no use to run tests + verbose('reached max number of iterations') + break + end + + if currentFuncEval >= maxEval then + -- max nb of function evals + verbose('max nb of function evals') + break + end + + tmp1:copy(g):abs() + if tmp1:sum() <= tolFun then + -- check optimality + verbose('optimality condition below tolFun') + break + end + + tmp1:copy(d):mul(t):abs() + if tmp1:sum() <= tolX then + -- step size below tolX + verbose('step size below tolX') + break + end + + if abs(f-f_old) < tolX then + -- function value changing less than tolX + verbose('function value changing 
less than tolX') + break + end + end + + -- save state + state.old_dirs = old_dirs + state.old_stps = old_stps + state.Hdiag = Hdiag + state.g_old = g_old + state.f_old = f_old + state.t = t + state.d = d + + -- return optimal x, and history of f(x) + return x,f_hist,currentFuncEval +end diff --git a/contrib/torch/optim/lswolfe.lua b/contrib/torch/optim/lswolfe.lua new file mode 100644 index 000000000..0afbdbe8b --- /dev/null +++ b/contrib/torch/optim/lswolfe.lua @@ -0,0 +1,192 @@ +--[[ A Line Search satisfying the Wolfe conditions + +ARGS: +- `opfunc` : a function (the objective) that takes a single input (X), + the point of evaluation, and returns f(X) and df/dX +- `x` : initial point / starting location +- `t` : initial step size +- `d` : descent direction +- `f` : initial function value +- `g` : gradient at initial location +- `gtd` : directional derivative at starting location +- `options.c1` : sufficient decrease parameter +- `options.c2` : curvature parameter +- `options.tolX` : minimum allowable step length +- `options.maxIter` : maximum nb of iterations + +RETURN: +- `f` : function value at x+t*d +- `g` : gradient value at x+t*d +- `x` : the next x (=x+t*d) +- `t` : the step length +- `lsFuncEval` : the number of function evaluations +]] +function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) + -- options + options = options or {} + local c1 = options.c1 or 1e-4 + local c2 = options.c2 or 0.9 + local tolX = options.tolX or 1e-9 + local maxIter = options.maxIter or 20 + local isverbose = options.verbose or false + + -- some shortcuts + local abs = torch.abs + local min = math.min + local max = math.max + + -- verbose function + local function verbose(...) + if isverbose then print(' ', ...) end + end + + -- evaluate objective and gradient using initial step + local x_init = x:clone() + x:add(t,d) + local f_new,g_new = opfunc(x) + local lsFuncEval = 1 + local gtd_new = g_new * d + + -- bracket an interval containing a point satisfying the Wolfe + -- criteria + local LSiter,t_prev,done = 0,0,false + local f_prev,g_prev,gtd_prev = f,g:clone(),gtd + local bracket,bracketFval,bracketGval + while LSiter < maxIter do + -- check conditions: + if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then + bracket = x.new{t_prev,t} + bracketFval = x.new{f_prev,f_new} + bracketGval = x.new(2,g_new:size(1)) + bracketGval[1] = g_prev + bracketGval[2] = g_new + break + + elseif abs(gtd_new) <= -c2*gtd then + bracket = x.new{t} + bracketFval = x.new{f_new} + bracketGval = x.new(1,g_new:size(1)) + bracketGval[1] = g_new + done = true + break + + elseif gtd_new >= 0 then + bracket = x.new{t_prev,t} + bracketFval = x.new{f_prev,f_new} + bracketGval = x.new(2,g_new:size(1)) + bracketGval[1] = g_prev + bracketGval[2] = g_new + break + + end + + -- interpolate: + local tmp = t_prev + t_prev = t + local minStep = t + 0.01*(t-tmp) + local maxStep = t*10 + t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev}, + {t,f_new,gtd_new}}, + minStep, maxStep) + + -- next step: + f_prev = f_new + g_prev = g_new:clone() + gtd_prev = gtd_new + x[{}] = x_init + x:add(t,d) + f_new,g_new = opfunc(x) + lsFuncEval = lsFuncEval + 1 + gtd_new = g_new * d + LSiter = LSiter + 1 + end + + -- reached max nb of iterations? + if LSiter == maxIter then + bracket = x.new{0,t} + bracketFval = x.new{f,f_new} + bracketGval = x.new(2,g_new:size(1)) + bracketGval[1] = g + bracketGval[2] = g_new + end + + -- zoom phase: we now have a point satisfying the criteria, or + -- a bracket around it. 
We refine the bracket until we find the + -- exact point satisfying the criteria + local insufProgress = false + local LOposRemoved = 0 + while not done and LSiter < maxIter do + -- find high and low points in bracket + local f_LO,LOpos = bracketFval:min(1) + LOpos = LOpos[1] f_LO = f_LO[1] + local HIpos = -LOpos+3 + + -- compute new trial value + t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d}, + {bracket[2],bracketFval[2],bracketGval[2]*d}}) + + -- test what we are making sufficient progress + if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then + if insufProgress or t>=bracket:max() or t <= bracket:min() then + if abs(t-bracket:max()) < abs(t-bracket:min()) then + t = bracket:max()-0.1*(bracket:max()-bracket:min()) + else + t = bracket:min()+0.1*(bracket:max()-bracket:min()) + end + insufProgress = false + else + insufProgress = true + end + else + insufProgress = false + end + + -- Evaluate new point + x[{}] = x_init + x:add(t,d) + f_new,g_new = opfunc(x) + lsFuncEval = lsFuncEval + 1 + gtd_new = g_new * d + LSiter = LSiter + 1 + if f_new > f + c1*t*gtd or f_new >= f_LO then + -- Armijo condition not satisfied or not lower than lowest point + bracket[HIpos] = t + bracketFval[HIpos] = f_new + bracketGval[HIpos] = g_new + else + if abs(gtd_new) <= - c2*gtd then + -- Wolfe conditions satisfied + done = true + elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then + -- Old HI becomes new LO + bracket[HIpos] = bracket[LOpos] + bracketFval[HIpos] = bracketFval[LOpos] + bracketGval[HIpos] = bracketGval[LOpos] + end + -- New point becomes new LO + bracket[LOpos] = t + bracketFval[LOpos] = f_new + bracketGval[LOpos] = g_new + end + + -- done? + if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then + break + end + end + + -- be verbose + if LSiter == maxIter then + verbose('reached max number of iterations') + end + + -- return stuff + local _,LOpos = bracketFval:min(1) + LOpos = LOpos[1] + t = bracket[LOpos] + f_new = bracketFval[LOpos] + g_new = bracketGval[LOpos] + x[{}] = x_init + x:add(t,d) + return f_new,g_new,x,t,lsFuncEval +end diff --git a/contrib/torch/optim/nag.lua b/contrib/torch/optim/nag.lua new file mode 100644 index 000000000..875d81e4c --- /dev/null +++ b/contrib/torch/optim/nag.lua @@ -0,0 +1,86 @@ +---------------------------------------------------------------------- +-- An implementation of SGD adapted with features of Nesterov's +-- Accelerated Gradient method, based on the paper +-- On the Importance of Initialization and Momentum in Deep Learning +-- Sutsveker et. 
al., ICML 2013 +-- +-- ARGS: +-- opfunc : a function that takes a single input (X), the point of +-- evaluation, and returns f(X) and df/dX +-- x : the initial point +-- state : a table describing the state of the optimizer; after each +-- call the state is modified +-- state.learningRate : learning rate +-- state.learningRateDecay : learning rate decay +-- state.weightDecay : weight decay +-- state.momentum : momentum +-- state.learningRates : vector of individual learning rates +-- +-- RETURN: +-- x : the new x vector +-- f(x) : the function, evaluated before the update +-- +-- (Dilip Krishnan, 2013) +-- + +function optim.nag(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 1e-3 + local lrd = config.learningRateDecay or 0 + local wd = config.weightDecay or 0 + local mom = config.momentum or 0.9 + local damp = config.dampening or mom + local lrs = config.learningRates + state.evalCounter = state.evalCounter or 0 + local nevals = state.evalCounter + + if mom <= 0 then + error('Momentum must be positive for Nesterov Accelerated Gradient') + end + + -- (1) evaluate f(x) and df/dx + -- first step in the direction of the momentum vector + + if state.dfdx then + x:add(mom, state.dfdx) + end + -- then compute gradient at that point + -- comment out the above line to get the original SGD + local fx,dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- (3) learning rate decay (annealing) + local clr = lr / (1 + nevals*lrd) + + -- (4) apply momentum + if not state.dfdx then + state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) + else + state.dfdx:mul(mom) + end + + -- (5) parameter update with single or individual learning rates + if lrs then + if not state.deltaParameters then + state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) + end + state.deltaParameters:copy(lrs):cmul(dfdx) + x:add(-clr, state.deltaParameters) + state.dfdx:add(-clr, state.deltaParameters) + else + x:add(-clr, dfdx) + state.dfdx:add(-clr, dfdx) + end + + -- (6) update evaluation counter + state.evalCounter = state.evalCounter + 1 + + -- return x, f(x) before optimization + return x,{fx} +end diff --git a/contrib/torch/optim/polyinterp.lua b/contrib/torch/optim/polyinterp.lua new file mode 100644 index 000000000..c5026bf04 --- /dev/null +++ b/contrib/torch/optim/polyinterp.lua @@ -0,0 +1,212 @@ +local function isreal(x) + return x == x +end + +local function isnan(x) + return not x == x +end + +local function roots(c) + local tol=1e-12 + c[torch.lt(torch.abs(c),tol)]=0 + + local nonzero = torch.ne(c,0) + if nonzero:max() == 0 then + return 0 + end + + -- first non-zero + local _,pos = torch.max(nonzero,1) + pos = pos[1] + c=c[{ {pos,-1} }] + + local nz = 0 + for i=c:size(1),1,-1 do + if c[i] ~= 0 then + break + else + nz = nz + 1 + end + end + c=c[{ {1,c:size(1)-nz} }] + + local n = c:size(1)-1 + if n == 1 then + local e = c.new({{-c[2]/c[1], 0}}) + if nz > 0 then + return torch.cat(e, c.new(nz, 2):zero(), 1) + else + return e + end + elseif n > 1 then + local A = torch.diag(c.new(n-1):fill(1),-1) + A[1] = -c[{ {2,n+1} }]/c[1]; + local e = torch.eig(A,'N') + if nz > 0 then + return torch.cat(e, c.new(nz,2):zero(), 1) + else + return e + end + else + return c.new(nz,2):zero() + end +end + +local function real(x) + if type(x) == number then return x end + return x[{ {} , 1}] +end + +local function imag(x) + if type(x) == 'number' then return 0 end + if x:nDimension() == 
1 then + return x.new(x:size(1)):zero() + else + return x[{ {}, 2}] + end +end + +local function polyval(p,x) + local pwr = p:size(1) + if type(x) == 'number' then + local val = 0 + p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) + return val + else + local val = x.new(x:size(1)) + p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) + return val + end +end + +---------------------------------------------------------------------- +-- Minimum of interpolating polynomial based on function and +-- derivative values +-- +-- ARGS: +-- points : N triplets (x,f,g), must be a Tensor +-- xmin : min value that brackets minimum (default: min of points) +-- xmax : max value that brackets maximum (default: max of points) +-- +-- RETURN: +-- minPos : position of minimum +-- +function optim.polyinterp(points,xminBound,xmaxBound) + -- locals + local sqrt = torch.sqrt + local mean = torch.mean + local max = math.max + local min = math.min + + -- nb of points / order of polynomial + local nPoints = points:size(1) + local order = nPoints*2-1 + + -- returned values + local minPos + + -- Code for most common case: + -- + cubic interpolation of 2 points w/ function and derivative values for both + -- + no xminBound/xmaxBound + if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then + -- Solution in this case (where x2 is the farthest point): + -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); + -- d2 = sqrt(d1^2 - g1*g2); + -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); + -- t_new = min(max(minPos,x1),x2); + local minVal,minPos = points[{ {},1 }]:min(1) + minVal = minVal[1] minPos = minPos[1] + local notMinPos = -minPos+3; + + local d1 = points[{minPos,3}] + points[{notMinPos,3}] + - 3*(points[{minPos,2}]-points[{notMinPos,2}]) + / (points[{minPos,1}]-points[{notMinPos,1}]); + local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); + + if isreal(d2) then -- isreal() + local t = points[{notMinPos,1}] - (points[{notMinPos,1}] + - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) + / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) + + minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) + else + minPos = mean(points[{{},1}]) + end + return minPos + end + + -- TODO: get the code below to work! + --error(' extrapolation not implemented yet...') + + -- Compute Bounds of Interpolation Area + local xmin = points[{{},1}]:min() + local xmax = points[{{},1}]:max() + xminBound = xminBound or xmin + xmaxBound = xmaxBound or xmax + + -- Add constraints on function values + local A = points.new(nPoints*2,order+1):zero() + local b = points.new(nPoints*2,1):zero() + for i = 1,nPoints do + local constraint = points.new(order+1):zero() + for j = order,0,-1 do + constraint[order-j+1] = points[{i,1}]^j + end + A[i] = constraint + b[i] = points[{i,2}] + end + + -- Add constraints based on derivatives + for i = 1,nPoints do + local constraint = points.new(order+1):zero() + for j = 1,order do + constraint[j] = (order-j+1)*points[{i,1}]^(order-j) + end + A[nPoints+i] = constraint + b[nPoints+i] = points[{i,3}] + end + + -- Find interpolating polynomial + local res = torch.gels(b,A) + local params = res[{ {1,nPoints*2} }]:squeeze() + + params[torch.le(torch.abs(params),1e-12)]=0 + + -- Compute Critical Points + local dParams = points.new(order):zero(); + for i = 1,params:size(1)-1 do + dParams[i] = params[i]*(order-i+1) + end + + -- nan/inf? 
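+   -- if any coefficient of the derivative polynomial is NaN (x ~= x) or +inf,
+   -- skip the root computation below and only test the bracketing bounds and
+   -- the original abscissae as critical-point candidates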
+ local nans = false + if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then + nans = true + end + + local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}]) + if not nans then + local cproots = roots(dParams) + local cpi = points.new(cp:size(1),2):zero() + cpi[{ {1,cp:size(1)} , 1 }] = cp + cp = torch.cat(cpi,cproots,1) + end + + -- Test Critical Points + local fmin = math.huge + -- Default to Bisection if no critical points valid: + minPos = (xminBound+xmaxBound)/2 + for i = 1,cp:size(1) do + local xCP = cp[{ {i,i} , {} }] + local ixCP = imag(xCP)[1] + local rxCP = real(xCP)[1] + if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then + local fCP = polyval(params,rxCP) + if fCP < fmin then + minPos = rxCP + fmin = fCP + end + end + end + return minPos,fmin +end diff --git a/contrib/torch/optim/rmsprop.lua b/contrib/torch/optim/rmsprop.lua new file mode 100644 index 000000000..aa562006a --- /dev/null +++ b/contrib/torch/optim/rmsprop.lua @@ -0,0 +1,58 @@ +--[[ An implementation of RMSprop + +ARGS: + +- 'opfunc' : a function that takes a single input (X), the point + of a evaluation, and returns f(X) and df/dX +- 'x' : the initial point +- 'config` : a table with configuration parameters for the optimizer +- 'config.learningRate' : learning rate +- 'config.alpha' : smoothing constant +- 'config.epsilon' : value with which to initialise m +- 'config.weightDecay' : weight decay +- 'state' : a table describing the state of the optimizer; + after each call the state is modified +- 'state.m' : leaky sum of squares of parameter gradients, +- 'state.tmp' : and the square root (with epsilon smoothing) + +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update + +]] + +function optim.rmsprop(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 1e-2 + local alpha = config.alpha or 0.99 + local epsilon = config.epsilon or 1e-8 + local wd = config.weightDecay or 0 + local mfill = config.initialMean or 0 + + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end + + -- (3) initialize mean square values and square gradient storage + if not state.m then + state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill) + state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) + end + + -- (4) calculate new (leaky) mean squared values + state.m:mul(alpha) + state.m:addcmul(1.0-alpha, dfdx, dfdx) + + -- (5) perform update + state.tmp:sqrt(state.m):add(epsilon) + x:addcdiv(-lr, dfdx, state.tmp) + + -- return x*, f(x) before optimization + return x, {fx} +end diff --git a/contrib/torch/optim/rprop.lua b/contrib/torch/optim/rprop.lua new file mode 100644 index 000000000..d7af16429 --- /dev/null +++ b/contrib/torch/optim/rprop.lua @@ -0,0 +1,103 @@ +--[[ A plain implementation of RPROP + +ARGS: +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.stepsize` : initial step size, common to all components +- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) +- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) +- `state.stepsizemax` : maximum stepsize allowed (default 50) +- `state.stepsizemin` : minimum stepsize allowed (default 1e-6) +- 
`state.niter` : number of iterations (default 1) + +RETURN: +- `x` : the new x vector +- `f(x)` : the function, evaluated before the update + +(Martin Riedmiller, Koray Kavukcuoglu 2013) +--]] +function optim.rprop(opfunc, x, config, state) + if config == nil and state == nil then + print('no state table RPROP initializing') + end + -- (0) get/update state + local config = config or {} + local state = state or config + local stepsize = config.stepsize or 0.1 + local etaplus = config.etaplus or 1.2 + local etaminus = config.etaminus or 0.5 + local stepsizemax = config.stepsizemax or 50.0 + local stepsizemin = config.stepsizemin or 1E-06 + local niter = config.niter or 1 + + local hfx = {} + + for i=1,niter do + + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- init temp storage + if not state.delta then + state.delta = dfdx.new(dfdx:size()):zero() + state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) + state.sign = dfdx.new(dfdx:size()) + state.psign = torch.ByteTensor(dfdx:size()) + state.nsign = torch.ByteTensor(dfdx:size()) + state.zsign = torch.ByteTensor(dfdx:size()) + state.dminmax = torch.ByteTensor(dfdx:size()) + if torch.type(x)=='torch.CudaTensor' then + -- Push to GPU + state.psign = state.psign:cuda() + state.nsign = state.nsign:cuda() + state.zsign = state.zsign:cuda() + state.dminmax = state.dminmax:cuda() + end + end + + -- sign of derivative from last step to this one + torch.cmul(state.sign, dfdx, state.delta) + torch.sign(state.sign, state.sign) + + -- get indices of >0, <0 and ==0 entries + state.sign.gt(state.psign, state.sign, 0) + state.sign.lt(state.nsign, state.sign, 0) + state.sign.eq(state.zsign, state.sign, 0) + + -- get step size updates + state.sign[state.psign] = etaplus + state.sign[state.nsign] = etaminus + state.sign[state.zsign] = 1 + + -- update stepsizes with step size updates + state.stepsize:cmul(state.sign) + + -- threshold step sizes + -- >50 => 50 + state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) + state.stepsize[state.dminmax] = stepsizemax + -- <1e-6 ==> 1e-6 + state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) + state.stepsize[state.dminmax] = stepsizemin + + -- for dir<0, dfdx=0 + -- for dir>=0 dfdx=dfdx + dfdx[state.nsign] = 0 + -- state.sign = sign(dfdx) + torch.sign(state.sign,dfdx) + + -- update weights + x:addcmul(-1,state.sign,state.stepsize) + + -- update state.dfdx with current dfdx + state.delta:copy(dfdx) + + table.insert(hfx,fx) + end + + -- return x*, f(x) before optimization + return x,hfx +end diff --git a/contrib/torch/optim/sgd.lua b/contrib/torch/optim/sgd.lua new file mode 100644 index 000000000..e21c696a6 --- /dev/null +++ b/contrib/torch/optim/sgd.lua @@ -0,0 +1,90 @@ +--[[ A plain implementation of SGD + +ARGS: + +- `opfunc` : a function that takes a single input (X), the point + of a evaluation, and returns f(X) and df/dX +- `x` : the initial point +- `config` : a table with configuration parameters for the optimizer +- `config.learningRate` : learning rate +- `config.learningRateDecay` : learning rate decay +- `config.weightDecay` : weight decay +- `config.weightDecays` : vector of individual weight decays +- `config.momentum` : momentum +- `config.dampening` : dampening for momentum +- `config.nesterov` : enables Nesterov momentum +- `config.learningRates` : vector of individual learning rates +- `state` : a table describing the state of the optimizer; after each + call the state is modified +- `state.evalCounter` : evaluation counter (optional: 0, by default) + +RETURN: +- `x` 
: the new x vector +- `f(x)` : the function, evaluated before the update + +(Clement Farabet, 2012) +]] +function optim.sgd(opfunc, x, config, state) + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 1e-3 + local lrd = config.learningRateDecay or 0 + local wd = config.weightDecay or 0 + local mom = config.momentum or 0 + local damp = config.dampening or mom + local nesterov = config.nesterov or false + local lrs = config.learningRates + local wds = config.weightDecays + state.evalCounter = state.evalCounter or 0 + local nevals = state.evalCounter + assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") + + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- (2) weight decay with single or individual parameters + if wd ~= 0 then + dfdx:add(wd, x) + elseif wds then + if not state.decayParameters then + state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) + end + state.decayParameters:copy(wds):cmul(x) + dfdx:add(state.decayParameters) + end + + -- (3) apply momentum + if mom ~= 0 then + if not state.dfdx then + state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) + else + state.dfdx:mul(mom):add(1-damp, dfdx) + end + if nesterov then + dfdx:add(mom, state.dfdx) + else + dfdx = state.dfdx + end + end + + -- (4) learning rate decay (annealing) + local clr = lr / (1 + nevals*lrd) + + -- (5) parameter update with single or individual learning rates + if lrs then + if not state.deltaParameters then + state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) + end + state.deltaParameters:copy(lrs):cmul(dfdx) + x:add(-clr, state.deltaParameters) + else + x:add(-clr, dfdx) + end + + -- (6) update evaluation counter + state.evalCounter = state.evalCounter + 1 + + -- return x*, f(x) before optimization + return x,{fx} +end -- 2.39.5