author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-05-23 18:14:15 +0100
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-05-23 18:14:15 +0100
commit | 714eb56e1760fdfb26afccde92664d3a2f1e8435 (patch)
tree | 84d1399acbb92f852b4bd64f9ea5412680b0c6ab /contrib/torch/optim
parent | 220a51ff68013dd668a45b78c60a7b8bfc10f074 (diff)
download | rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.tar.gz, rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.zip
[Minor] Move lua contrib libraries to lua- prefix
Diffstat (limited to 'contrib/torch/optim')
-rw-r--r-- | contrib/torch/optim/CMakeLists.txt | 5
-rw-r--r-- | contrib/torch/optim/COPYRIGHT.txt | 35
-rw-r--r-- | contrib/torch/optim/ConfusionMatrix.lua | 361
-rw-r--r-- | contrib/torch/optim/Logger.lua | 189
-rw-r--r-- | contrib/torch/optim/adadelta.lua | 55
-rw-r--r-- | contrib/torch/optim/adagrad.lua | 55
-rw-r--r-- | contrib/torch/optim/adam.lua | 72
-rw-r--r-- | contrib/torch/optim/adamax.lua | 66
-rw-r--r-- | contrib/torch/optim/asgd.lua | 73
-rw-r--r-- | contrib/torch/optim/cg.lua | 208
-rw-r--r-- | contrib/torch/optim/checkgrad.lua | 52
-rw-r--r-- | contrib/torch/optim/cmaes.lua | 270
-rw-r--r-- | contrib/torch/optim/de.lua | 109
-rw-r--r-- | contrib/torch/optim/fista.lua | 192
-rw-r--r-- | contrib/torch/optim/init.lua | 33
-rw-r--r-- | contrib/torch/optim/lbfgs.lua | 268
-rw-r--r-- | contrib/torch/optim/lswolfe.lua | 192
-rw-r--r-- | contrib/torch/optim/nag.lua | 86
-rw-r--r-- | contrib/torch/optim/polyinterp.lua | 212
-rw-r--r-- | contrib/torch/optim/rmsprop.lua | 58
-rw-r--r-- | contrib/torch/optim/rprop.lua | 103
-rw-r--r-- | contrib/torch/optim/sgd.lua | 90
22 files changed, 0 insertions, 2784 deletions
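For reference, every optimizer removed by this commit follows the calling convention documented in the file headers below: optim.<algo>(opfunc, x, config, state), where opfunc returns f(X) and df/dX, config carries hyper-parameters, and state (defaulting to config) holds the running buffers. A minimal sketch of that convention, not part of this diff, assuming torch and the optim package are built and on the Lua path, and using a toy quadratic as opfunc:

```lua
-- Sketch only: the shared optim API, shown with optim.adam on f(x) = 0.5*||x||^2.
require 'torch'
require 'optim'

local x = torch.Tensor{3, -4}           -- initial point
local config = {learningRate = 0.1}     -- hyper-parameters; optimizer state is kept here too

local function opfunc(x)
  local fx = 0.5 * x:dot(x)             -- f(X)
  local dfdx = x:clone()                -- df/dX
  return fx, dfdx
end

for i = 1, 1000 do
  optim.adam(opfunc, x, config)         -- updates x in place; state defaults to config
end
print(x)                                -- should end up near the minimum at (0, 0)
```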
diff --git a/contrib/torch/optim/CMakeLists.txt b/contrib/torch/optim/CMakeLists.txt deleted file mode 100644 index b1c13e701..000000000 --- a/contrib/torch/optim/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ - -CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) -SET(src) -FILE(GLOB luasrc *.lua) -ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") diff --git a/contrib/torch/optim/COPYRIGHT.txt b/contrib/torch/optim/COPYRIGHT.txt deleted file mode 100644 index 2e4118c0f..000000000 --- a/contrib/torch/optim/COPYRIGHT.txt +++ /dev/null @@ -1,35 +0,0 @@ -Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) -Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -Copyright (c) 2011-2013 NYU (Clement Farabet) -Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -Copyright (c) 2006 Idiap Research Institute (Samy Bengio) -Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -3. Neither the names of NEC Laboratories American and IDIAP Research - Institute nor the names of its contributors may be used to endorse or - promote products derived from this software without specific prior - written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
diff --git a/contrib/torch/optim/ConfusionMatrix.lua b/contrib/torch/optim/ConfusionMatrix.lua deleted file mode 100644 index ec5302c24..000000000 --- a/contrib/torch/optim/ConfusionMatrix.lua +++ /dev/null @@ -1,361 +0,0 @@ ---[[ A Confusion Matrix class - -Example: - - conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix - conf:zero() -- reset matrix - for i = 1,N do - conf:add( neuralnet:forward(sample), label ) -- accumulate errors - end - print(conf) -- print matrix - image.display(conf:render()) -- render matrix -]] -local ConfusionMatrix = torch.class('optim.ConfusionMatrix') - -function ConfusionMatrix:__init(nclasses, classes) - if type(nclasses) == 'table' then - classes = nclasses - nclasses = #classes - end - self.mat = torch.LongTensor(nclasses,nclasses):zero() - self.valids = torch.FloatTensor(nclasses):zero() - self.unionvalids = torch.FloatTensor(nclasses):zero() - self.nclasses = nclasses - self.totalValid = 0 - self.averageValid = 0 - self.classes = classes or {} - -- buffers - self._mat_flat = self.mat:view(-1) - self._target = torch.FloatTensor() - self._prediction = torch.FloatTensor() - self._max = torch.FloatTensor() - self._pred_idx = torch.LongTensor() - self._targ_idx = torch.LongTensor() -end - --- takes scalar prediction and target as input -function ConfusionMatrix:_add(p, t) - assert(p and type(p) == 'number') - assert(t and type(t) == 'number') - -- non-positive values are considered missing - -- and therefore ignored - if t > 0 then - self.mat[t][p] = self.mat[t][p] + 1 - end -end - -function ConfusionMatrix:add(prediction, target) - if type(prediction) == 'number' then - -- comparing numbers - self:_add(prediction, target) - else - self._prediction:resize(prediction:size()):copy(prediction) - assert(prediction:dim() == 1) - if type(target) == 'number' then - -- prediction is a vector, then target assumed to be an index - self._max:max(self._pred_idx, self._prediction, 1) - self:_add(self._pred_idx[1], target) - else - -- both prediction and target are vectors - assert(target:dim() == 1) - self._target:resize(target:size()):copy(target) - self._max:max(self._targ_idx, self._target, 1) - self._max:max(self._pred_idx, self._prediction, 1) - self:_add(self._pred_idx[1], self._targ_idx[1]) - end - end -end - -function ConfusionMatrix:batchAdd(predictions, targets) - local preds, targs, __ - self._prediction:resize(predictions:size()):copy(predictions) - if predictions:dim() == 1 then - -- predictions is a vector of classes - preds = self._prediction - elseif predictions:dim() == 2 then - -- prediction is a matrix of class likelihoods - if predictions:size(2) == 1 then - -- or prediction just needs flattening - preds = self._prediction:select(2,1) - else - self._max:max(self._pred_idx, self._prediction, 2) - preds = self._pred_idx:select(2,1) - end - else - error("predictions has invalid number of dimensions") - end - - self._target:resize(targets:size()):copy(targets) - if targets:dim() == 1 then - -- targets is a vector of classes - targs = self._target - elseif targets:dim() == 2 then - -- targets is a matrix of one-hot rows - if targets:size(2) == 1 then - -- or targets just needs flattening - targs = self._target:select(2,1) - else - self._max:max(self._targ_idx, self._target, 2) - targs = self._targ_idx:select(2,1) - end - else - error("targets has invalid number of dimensions") - end - - -- non-positive values are considered missing and therefore ignored - local mask = targs:ge(1) - targs = targs[mask] - preds = preds[mask] - - 
self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility - - preds = preds:typeAs(targs) - - assert(self.mat:isContiguous() and self.mat:stride(2) == 1) - local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) - local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) - self._mat_flat:indexAdd(1, indices, ones) -end - -function ConfusionMatrix:zero() - self.mat:zero() - self.valids:zero() - self.unionvalids:zero() - self.totalValid = 0 - self.averageValid = 0 -end - -local function isNaN(number) - return number ~= number -end - -function ConfusionMatrix:updateValids() - local total = 0 - for t = 1,self.nclasses do - self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() - self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) - total = total + self.mat[t][t] - end - self.totalValid = total / self.mat:sum() - self.averageValid = 0 - self.averageUnionValid = 0 - local nvalids = 0 - local nunionvalids = 0 - for t = 1,self.nclasses do - if not isNaN(self.valids[t]) then - self.averageValid = self.averageValid + self.valids[t] - nvalids = nvalids + 1 - end - if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then - self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] - nunionvalids = nunionvalids + 1 - end - end - self.averageValid = self.averageValid / nvalids - self.averageUnionValid = self.averageUnionValid / nunionvalids -end - --- Calculating FAR/FRR associated with the confusion matrix - -function ConfusionMatrix:farFrr() - local cmat = self.mat - local noOfClasses = cmat:size()[1] - self._frrs = self._frrs or torch.zeros(noOfClasses) - self._frrs:zero() - self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) - self._classFrrs:zero() - self._classFrrs:add(-1) - self._fars = self._fars or torch.zeros(noOfClasses) - self._fars:zero() - self._classFars = self._classFars or torch.zeros(noOfClasses) - self._classFars:zero() - self._classFars:add(-1) - local classSamplesCount = cmat:sum(2) - local indx = 1 - for i=1,noOfClasses do - if classSamplesCount[i][1] ~= 0 then - self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] - self._classFrrs[i] = self._frrs[indx] - -- Calculating FARs - local farNumerator = 0 - local farDenominator = 0 - for j=1, noOfClasses do - if i ~= j then - if classSamplesCount[j][1] ~= 0 then - farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] - farDenominator = farDenominator + 1 - end - end - end - self._fars[indx] = farNumerator/farDenominator - self._classFars[i] = self._fars[indx] - indx = indx + 1 - end - end - indx = indx - 1 - local returnFrrs = self._frrs[{{1, indx}}] - local returnFars = self._fars[{{1, indx}}] - return self._classFrrs, self._classFars, returnFrrs, returnFars -end - -local function log10(n) - if math.log10 then - return math.log10(n) - else - return math.log(n) / math.log(10) - end -end - -function ConfusionMatrix:__tostring__() - self:updateValids() - local str = {'ConfusionMatrix:\n'} - local nclasses = self.nclasses - table.insert(str, '[') - local maxCnt = self.mat:max() - local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) - for t = 1,nclasses do - local pclass = self.valids[t] * 100 - pclass = string.format('%2.3f', pclass) - if t == 1 then - table.insert(str, '[') - else - table.insert(str, ' [') - end - for p = 1,nclasses do - table.insert(str, string.format('%' .. nDigits .. 
'd', self.mat[t][p])) - end - if self.classes and self.classes[1] then - if t == nclasses then - table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') - else - table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') - end - else - if t == nclasses then - table.insert(str, ']] ' .. pclass .. '% \n') - else - table.insert(str, '] ' .. pclass .. '% \n') - end - end - end - table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') - table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') - table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%') - return table.concat(str) -end - -function ConfusionMatrix:render(sortmode, display, block, legendwidth) - -- args - local confusion = self.mat:double() - local classes = self.classes - local sortmode = sortmode or 'score' -- 'score' or 'occurrence' - local block = block or 25 - local legendwidth = legendwidth or 200 - local display = display or false - - -- legends - local legend = { - ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', - ['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' - } - - -- parse matrix / normalize / count scores - local diag = torch.FloatTensor(#classes) - local freqs = torch.FloatTensor(#classes) - local unconf = confusion - local confusion = confusion:clone() - local corrects = 0 - local total = 0 - for target = 1,#classes do - freqs[target] = confusion[target]:sum() - corrects = corrects + confusion[target][target] - total = total + freqs[target] - confusion[target]:div( math.max(confusion[target]:sum(),1) ) - diag[target] = confusion[target][target] - end - - -- accuracies - local accuracy = corrects / total * 100 - local perclass = 0 - local total = 0 - for target = 1,#classes do - if confusion[target]:sum() > 0 then - perclass = perclass + diag[target] - total = total + 1 - end - end - perclass = perclass / total * 100 - freqs:div(unconf:sum()) - - -- sort matrix - if sortmode == 'score' then - _,order = torch.sort(diag,1,true) - elseif sortmode == 'occurrence' then - _,order = torch.sort(freqs,1,true) - else - error('sort mode must be one of: score | occurrence') - end - - -- render matrix - local render = torch.zeros(#classes*block, #classes*block) - for target = 1,#classes do - for prediction = 1,#classes do - render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] - end - end - - -- add grid - for target = 1,#classes do - render[{ {target*block},{} }] = 0.1 - render[{ {},{target*block} }] = 0.1 - end - - -- create rendering - require 'image' - require 'qtwidget' - require 'qttorch' - local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) - image.display{image=render, win=win1} - - -- add legend - for i in ipairs(classes) do - -- background cell - win1:setcolor{r=0,g=0,b=0} - win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) - win1:fill() - - -- % - win1:setfont(qt.QFont{serif=false, size=fontsize}) - local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 - win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} - win1:moveto((#render)[2]+10,i*block-block/3) - win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) - - -- legend - win1:setfont(qt.QFont{serif=false, size=fontsize}) - local gscale = 
diag[order[i]]*0.8+0.2 - win1:setcolor{r=gscale,g=gscale,b=gscale} - win1:moveto(120+(#render)[2]+10,i*block-block/3) - win1:show(classes[order[i]]) - - for j in ipairs(classes) do - -- scores - local score = confusion[order[j]][order[i]] - local gscale = (1-score)*(score*0.8+0.2) - win1:setcolor{r=gscale,g=gscale,b=gscale} - win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) - win1:show(string.format('%02.0f',math.floor(score*100+0.5))) - end - end - - -- generate tensor - local t = win1:image():toTensor() - - -- display - if display then - image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} - end - - -- return rendering - return t -end diff --git a/contrib/torch/optim/Logger.lua b/contrib/torch/optim/Logger.lua deleted file mode 100644 index 3c1da54d3..000000000 --- a/contrib/torch/optim/Logger.lua +++ /dev/null @@ -1,189 +0,0 @@ ---[[ Logger: a simple class to log symbols during training, - and automate plot generation - -Example: - logger = optim.Logger('somefile.log') -- file to save stuff - - for i = 1,N do -- log some symbols during - train_error = ... -- training/testing - test_error = ... - logger:add{['training error'] = train_error, - ['test error'] = test_error} - end - - logger:style{['training error'] = '-', -- define styles for plots - ['test error'] = '-'} - logger:plot() -- and plot - ----- OR --- - - logger = optim.Logger('somefile.log') -- file to save stuff - logger:setNames{'training error', 'test error'} - - for i = 1,N do -- log some symbols during - train_error = ... -- training/testing - test_error = ... - logger:add{train_error, test_error} - end - - logger:style{'-', '-'} -- define styles for plots - logger:plot() -- and plot - ------------ - - logger:setlogscale(true) -- enable logscale on Y-axis - logger:plot() -- and plot -]] -local Logger = torch.class('optim.Logger') - -function Logger:__init(filename, timestamp) - if filename then - self.name = filename - os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') - if timestamp then - -- append timestamp to create unique log file - filename = filename .. '-'..os.date("%Y_%m_%d_%X") - end - self.file = io.open(filename,'w') - self.epsfile = self.name .. '.eps' - else - self.file = io.stdout - self.name = 'stdout' - print('<Logger> warning: no path provided, logging to std out') - end - self.empty = true - self.symbols = {} - self.styles = {} - self.names = {} - self.idx = {} - self.figure = nil - self.showPlot = true - self.plotRawCmd = nil - self.defaultStyle = '+' - self.logscale = false -end - -function Logger:setNames(names) - self.names = names - self.empty = false - self.nsymbols = #names - for k,key in pairs(names) do - self.file:write(key .. '\t') - self.symbols[k] = {} - self.styles[k] = {self.defaultStyle} - self.idx[key] = k - end - self.file:write('\n') - self.file:flush() - return self -end - -function Logger:add(symbols) - -- (1) first time ? print symbols' names on first row - if self.empty then - self.empty = false - self.nsymbols = #symbols - for k,val in pairs(symbols) do - self.file:write(k .. '\t') - self.symbols[k] = {} - self.styles[k] = {self.defaultStyle} - self.names[k] = k - end - self.idx = self.names - self.file:write('\n') - end - -- (2) print all symbols on one row - for k,val in pairs(symbols) do - if type(val) == 'number' then - self.file:write(string.format('%11.4e',val) .. '\t') - elseif type(val) == 'string' then - self.file:write(val .. 
'\t') - else - xlua.error('can only log numbers and strings', 'Logger') - end - end - self.file:write('\n') - self.file:flush() - -- (3) save symbols in internal table - for k,val in pairs(symbols) do - table.insert(self.symbols[k], val) - end -end - -function Logger:style(symbols) - for name,style in pairs(symbols) do - if type(style) == 'string' then - self.styles[name] = {style} - elseif type(style) == 'table' then - self.styles[name] = style - else - xlua.error('style should be a string or a table of strings','Logger') - end - end - return self -end - -function Logger:setlogscale(state) - self.logscale = state -end - -function Logger:display(state) - self.showPlot = state -end - -function Logger:plot(...) - if not xlua.require('gnuplot') then - if not self.warned then - print('<Logger> warning: cannot plot with this version of Torch') - self.warned = true - end - return - end - local plotit = false - local plots = {} - local plotsymbol = - function(name,list) - if #list > 1 then - local nelts = #list - local plot_y = torch.Tensor(nelts) - for i = 1,nelts do - plot_y[i] = list[i] - end - for _,style in ipairs(self.styles[name]) do - table.insert(plots, {self.names[name], plot_y, style}) - end - plotit = true - end - end - local args = {...} - if not args[1] then -- plot all symbols - for name,list in pairs(self.symbols) do - plotsymbol(name,list) - end - else -- plot given symbols - for _,name in ipairs(args) do - plotsymbol(self.idx[name], self.symbols[self.idx[name]]) - end - end - if plotit then - if self.showPlot then - self.figure = gnuplot.figure(self.figure) - if self.logscale then gnuplot.logscale('on') end - gnuplot.plot(plots) - if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end - gnuplot.grid('on') - gnuplot.title('<Logger::' .. self.name .. '>') - end - if self.epsfile then - os.execute('rm -f "' .. self.epsfile .. '"') - local epsfig = gnuplot.epsfigure(self.epsfile) - if self.logscale then gnuplot.logscale('on') end - gnuplot.plot(plots) - if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end - gnuplot.grid('on') - gnuplot.title('<Logger::' .. self.name .. 
'>') - gnuplot.plotflush() - gnuplot.close(epsfig) - end - end -end diff --git a/contrib/torch/optim/adadelta.lua b/contrib/torch/optim/adadelta.lua deleted file mode 100644 index 7cc058d29..000000000 --- a/contrib/torch/optim/adadelta.lua +++ /dev/null @@ -1,55 +0,0 @@ ---[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 - -ARGS: -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `config` : a table of hyper-parameters -- `config.rho` : interpolation parameter -- `config.eps` : for numerical stability -- `config.weightDecay` : weight decay -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.paramVariance` : vector of temporal variances of parameters -- `state.accDelta` : vector of accummulated delta of gradients -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update -]] -function optim.adadelta(opfunc, x, config, state) - -- (0) get/update state - if config == nil and state == nil then - print('no state table, ADADELTA initializing') - end - local config = config or {} - local state = state or config - local rho = config.rho or 0.9 - local eps = config.eps or 1e-6 - local wd = config.weightDecay or 0 - state.evalCounter = state.evalCounter or 0 - -- (1) evaluate f(x) and df/dx - local fx,dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- (3) parameter update - if not state.paramVariance then - state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() - state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() - state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() - state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() - end - state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) - state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() - state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) - x:add(-1, state.delta) - state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) - -- (4) update evaluation counter - state.evalCounter = state.evalCounter + 1 - - -- return x*, f(x) before optimization - return x,{fx} -end diff --git a/contrib/torch/optim/adagrad.lua b/contrib/torch/optim/adagrad.lua deleted file mode 100644 index 6860c4317..000000000 --- a/contrib/torch/optim/adagrad.lua +++ /dev/null @@ -1,55 +0,0 @@ ---[[ ADAGRAD implementation for SGD - -ARGS: -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.learningRate` : learning rate -- `state.paramVariance` : vector of temporal variances of parameters -- `state.weightDecay` : scalar that controls weight decay -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update - -]] -function optim.adagrad(opfunc, x, config, state) - -- (0) get/update state - if config == nil and state == nil then - print('no state table, ADAGRAD initializing') - end - local config = config or {} - local state = state or config - local lr = config.learningRate or 1e-3 - local lrd = config.learningRateDecay or 0 - local wd = config.weightDecay or 0 - state.evalCounter = state.evalCounter or 0 - local nevals = state.evalCounter - - -- (1) evaluate f(x) and df/dx - local fx,dfdx = 
opfunc(x) - - -- (2) weight decay with a single parameter - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- (3) learning rate decay (annealing) - local clr = lr / (1 + nevals*lrd) - - -- (4) parameter update with single or individual learning rates - if not state.paramVariance then - state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() - state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) - end - state.paramVariance:addcmul(1,dfdx,dfdx) - state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() - x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) - - -- (5) update evaluation counter - state.evalCounter = state.evalCounter + 1 - - -- return x*, f(x) before optimization - return x,{fx} -end diff --git a/contrib/torch/optim/adam.lua b/contrib/torch/optim/adam.lua deleted file mode 100644 index 2e127e96a..000000000 --- a/contrib/torch/optim/adam.lua +++ /dev/null @@ -1,72 +0,0 @@ ---[[ An implementation of Adam https://arxiv.org/abs/1412.6980 - -ARGS: - -- 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX -- 'x' : the initial point -- 'config` : a table with configuration parameters for the optimizer -- 'config.learningRate' : learning rate -- `config.learningRateDecay` : learning rate decay -- 'config.beta1' : first moment coefficient -- 'config.beta2' : second moment coefficient -- 'config.epsilon' : for numerical stability -- 'config.weightDecay' : weight decay -- 'state' : a table describing the state of the optimizer; after each - call the state is modified - -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update - -]] - -function optim.adam(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 0.001 - local lrd = config.learningRateDecay or 0 - - local beta1 = config.beta1 or 0.9 - local beta2 = config.beta2 or 0.999 - local epsilon = config.epsilon or 1e-8 - local wd = config.weightDecay or 0 - - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- Initialization - state.t = state.t or 0 - -- Exponential moving average of gradient values - state.m = state.m or x.new(dfdx:size()):zero() - -- Exponential moving average of squared gradient values - state.v = state.v or x.new(dfdx:size()):zero() - -- A tmp tensor to hold the sqrt(v) + epsilon - state.denom = state.denom or x.new(dfdx:size()):zero() - - -- (3) learning rate decay (annealing) - local clr = lr / (1 + state.t*lrd) - - state.t = state.t + 1 - - -- Decay the first and second moment running average coefficient - state.m:mul(beta1):add(1-beta1, dfdx) - state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) - - state.denom:copy(state.v):sqrt():add(epsilon) - - local biasCorrection1 = 1 - beta1^state.t - local biasCorrection2 = 1 - beta2^state.t - local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1 - -- (4) update x - x:addcdiv(-stepSize, state.m, state.denom) - - -- return x*, f(x) before optimization - return x, {fx} -end diff --git a/contrib/torch/optim/adamax.lua b/contrib/torch/optim/adamax.lua deleted file mode 100644 index 2b6487720..000000000 --- a/contrib/torch/optim/adamax.lua +++ /dev/null @@ -1,66 +0,0 @@ ---[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf - -ARGS: - -- 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX -- 'x' : the initial 
point -- 'config` : a table with configuration parameters for the optimizer -- 'config.learningRate' : learning rate -- 'config.beta1' : first moment coefficient -- 'config.beta2' : second moment coefficient -- 'config.epsilon' : for numerical stability -- 'state' : a table describing the state of the optimizer; - after each call the state is modified. - -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update - -]] - -function optim.adamax(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 0.002 - - local beta1 = config.beta1 or 0.9 - local beta2 = config.beta2 or 0.999 - local epsilon = config.epsilon or 1e-38 - local wd = config.weightDecay or 0 - - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- Initialization - state.t = state.t or 0 - -- Exponential moving average of gradient values - state.m = state.m or x.new(dfdx:size()):zero() - -- Exponential moving average of the infinity norm - state.u = state.u or x.new(dfdx:size()):zero() - -- A tmp tensor to hold the input to max() - state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() - - state.t = state.t + 1 - - -- Update biased first moment estimate. - state.m:mul(beta1):add(1-beta1, dfdx) - -- Update the exponentially weighted infinity norm. - state.max[1]:copy(state.u):mul(beta2) - state.max[2]:copy(dfdx):abs():add(epsilon) - state.u:max(state.max, 1) - - local biasCorrection1 = 1 - beta1^state.t - local stepSize = lr/biasCorrection1 - -- (2) update x - x:addcdiv(-stepSize, state.m, state.u) - - -- return x*, f(x) before optimization - return x, {fx} -end diff --git a/contrib/torch/optim/asgd.lua b/contrib/torch/optim/asgd.lua deleted file mode 100644 index cc1c459f3..000000000 --- a/contrib/torch/optim/asgd.lua +++ /dev/null @@ -1,73 +0,0 @@ ---[[ An implementation of ASGD - -ASGD: - - x := (1 - lambda eta_t) x - eta_t df/dx(z,x) - a := a + mu_t [ x - a ] - - eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 - mu_t = 1/max(1,t-t0) - -implements ASGD algoritm as in L.Bottou's sgd-2.0 - -ARGS: - -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.eta0` : learning rate -- `state.lambda` : decay term -- `state.alpha` : power for eta update -- `state.t0` : point at which to start averaging - -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update -- `ax` : the averaged x vector - -(Clement Farabet, 2012) ---]] -function optim.asgd(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - config.eta0 = config.eta0 or 1e-4 - config.lambda = config.lambda or 1e-4 - config.alpha = config.alpha or 0.75 - config.t0 = config.t0 or 1e6 - - -- (hidden state) - state.eta_t = state.eta_t or config.eta0 - state.mu_t = state.mu_t or 1 - state.t = state.t or 0 - - -- (1) evaluate f(x) and df/dx - local fx,dfdx = opfunc(x) - - -- (2) decay term - x:mul(1 - config.lambda*state.eta_t) - - -- (3) update x - x:add(-state.eta_t, dfdx) - - -- (4) averaging - state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() - state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) - if state.mu_t ~= 1 then - state.tmp:copy(x) - 
state.tmp:add(-1,state.ax):mul(state.mu_t) - state.ax:add(state.tmp) - else - state.ax:copy(x) - end - - -- (5) update eta_t and mu_t - state.t = state.t + 1 - state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) - state.mu_t = 1 / math.max(1, state.t - config.t0) - - -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) - return x,{fx},state.ax -end diff --git a/contrib/torch/optim/cg.lua b/contrib/torch/optim/cg.lua deleted file mode 100644 index 842a7d569..000000000 --- a/contrib/torch/optim/cg.lua +++ /dev/null @@ -1,208 +0,0 @@ ---[[ - -This cg implementation is a rewrite of minimize.m written by Carl -E. Rasmussen. It is supposed to produce exactly same results (give -or take numerical accuracy due to some changed order of -operations). You can compare the result on rosenbrock with minimize.m. -http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html - - [x fx c] = minimize([0 0]', 'rosenbrock', -25) - -Note that we limit the number of function evaluations only, it seems much -more important in practical use. - -ARGS: - -- `opfunc` : a function that takes a single input, the point of evaluation. -- `x` : the initial point -- `state` : a table of parameters and temporary allocations. -- `state.maxEval` : max number of function evaluations -- `state.maxIter` : max number of iterations -- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage -- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage - -RETURN: - -- `x*` : the new x vector, at the optimal point -- `f` : a table of all function values where - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at x* - -(Koray Kavukcuoglu, 2012) ---]] -function optim.cg(opfunc, x, config, state) - -- parameters - local config = config or {} - local state = state or config - local rho = config.rho or 0.01 - local sig = config.sig or 0.5 - local int = config.int or 0.1 - local ext = config.ext or 3.0 - local maxIter = config.maxIter or 20 - local ratio = config.ratio or 100 - local maxEval = config.maxEval or maxIter*1.25 - local red = 1 - - local verbose = config.verbose or 0 - - local i = 0 - local ls_failed = 0 - local fx = {} - - -- we need three points for the interpolation/extrapolation stuff - local z1,z2,z3 = 0,0,0 - local d1,d2,d3 = 0,0,0 - local f1,f2,f3 = 0,0,0 - - local df1 = state.df1 or x.new() - local df2 = state.df2 or x.new() - local df3 = state.df3 or x.new() - local tdf - - df1:resizeAs(x) - df2:resizeAs(x) - df3:resizeAs(x) - - -- search direction - local s = state.s or x.new() - s:resizeAs(x) - - -- we need a temp storage for X - local x0 = state.x0 or x.new() - local f0 = 0 - local df0 = state.df0 or x.new() - x0:resizeAs(x) - df0:resizeAs(x) - - -- evaluate at initial point - f1,tdf = opfunc(x) - fx[#fx+1] = f1 - df1:copy(tdf) - i=i+1 - - -- initial search direction - s:copy(df1):mul(-1) - - d1 = -s:dot(s ) -- slope - z1 = red/(1-d1) -- initial step - - while i < math.abs(maxEval) do - - x0:copy(x) - f0 = f1 - df0:copy(df1) - - x:add(z1,s) - f2,tdf = opfunc(x) - df2:copy(tdf) - i=i+1 - d2 = df2:dot(s) - f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 - local m = math.min(maxIter,maxEval-i) - local success = 0 - local limit = -1 - - while true do - while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do - limit = z1 - if f2 > f1 then - z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) - else - local A = 6*(f2-f3)/z3+3*(d2+d3) - local B = 
3*(f3-f2)-z3*(d3+2*d2) - z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A - end - if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then - z2 = z3/2; - end - z2 = math.max(math.min(z2, int*z3),(1-int)*z3); - z1 = z1 + z2; - x:add(z2,s) - f2,tdf = opfunc(x) - df2:copy(tdf) - i=i+1 - m = m - 1 - d2 = df2:dot(s) - z3 = z3-z2; - end - if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then - break - elseif d2 > sig*d1 then - success = 1; - break; - elseif m == 0 then - break; - end - local A = 6*(f2-f3)/z3+3*(d2+d3); - local B = 3*(f3-f2)-z3*(d3+2*d2); - z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) - - if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then - if limit < -0.5 then - z2 = z1 * (ext -1) - else - z2 = (limit-z1)/2 - end - elseif (limit > -0.5) and (z2+z1) > limit then - z2 = (limit-z1)/2 - elseif limit < -0.5 and (z2+z1) > z1*ext then - z2 = z1*(ext-1) - elseif z2 < -z3*int then - z2 = -z3*int - elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then - z2 = (limit-z1)*(1-int) - end - f3=f2; d3=d2; z3=-z2; - z1 = z1+z2; - x:add(z2,s) - - f2,tdf = opfunc(x) - df2:copy(tdf) - i=i+1 - m = m - 1 - d2 = df2:dot(s) - end - if success == 1 then - f1 = f2 - fx[#fx+1] = f1; - local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) - s:mul(ss) - s:add(-1,df2) - local tmp = df1:clone() - df1:copy(df2) - df2:copy(tmp) - d2 = df1:dot(s) - if d2> 0 then - s:copy(df1) - s:mul(-1) - d2 = -s:dot(s) - end - - z1 = z1 * math.min(ratio, d1/(d2-1e-320)) - d1 = d2 - ls_failed = 0 - else - x:copy(x0) - f1 = f0 - df1:copy(df0) - if ls_failed or i>maxEval then - break - end - local tmp = df1:clone() - df1:copy(df2) - df2:copy(tmp) - s:copy(df1) - s:mul(-1) - d1 = -s:dot(s) - z1 = 1/(1-d1) - ls_failed = 1 - end - end - state.df0 = df0 - state.df1 = df1 - state.df2 = df2 - state.df3 = df3 - state.x0 = x0 - state.s = s - return x,fx,i -end diff --git a/contrib/torch/optim/checkgrad.lua b/contrib/torch/optim/checkgrad.lua deleted file mode 100644 index 0382b2132..000000000 --- a/contrib/torch/optim/checkgrad.lua +++ /dev/null @@ -1,52 +0,0 @@ ---[[ An implementation of a simple numerical gradient checker. 
- -ARGS: - -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `eps` : the epsilon to use for the numerical check (default is 1e-7) - -RETURN: - -- `diff` : error in the gradient, should be near tol -- `dC` : exact gradient at point -- `dC_est` : numerically estimates gradient at point - -]]-- - - --- function that numerically checks gradient of NCA loss: -function optim.checkgrad(opfunc, x, eps) - - -- compute true gradient: - local Corg,dC = opfunc(x) - dC:resize(x:size()) - - local Ctmp -- temporary value - local isTensor = torch.isTensor(Corg) - if isTensor then - Ctmp = Corg.new(Corg:size()) - end - - -- compute numeric approximations to gradient: - local eps = eps or 1e-7 - local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) - for i = 1,dC:size(1) do - local tmp = x[i] - x[i] = x[i] + eps - local C1 = opfunc(x) - if isTensor then - Ctmp:copy(C1) - C1 = Ctmp - end - x[i] = x[i] - 2 * eps - local C2 = opfunc(x) - x[i] = tmp - dC_est[i] = (C1 - C2) / (2 * eps) - end - - -- estimate error of gradient: - local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) - return diff,dC,dC_est -end diff --git a/contrib/torch/optim/cmaes.lua b/contrib/torch/optim/cmaes.lua deleted file mode 100644 index 74cd58a0c..000000000 --- a/contrib/torch/optim/cmaes.lua +++ /dev/null @@ -1,270 +0,0 @@ -require 'torch' -require 'math' - -local BestSolution = {} ---[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), -ported from https://www.lri.fr/~hansen/barecmaes2.html. - -Parameters ----------- -ARGS: - -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX. Note that df/dX is not used -- `x` : the initial point -- `state.sigma` - float, initial step-size (standard deviation in each - coordinate) -- `state.maxEval` - int, maximal number of function evaluations -- `state.ftarget` - float, target function value -- `state.popsize` - population size. If this is left empty, - 4 + int(3 * log(|x|)) will be used -- `state.ftarget` - stop if fitness < ftarget -- `state.verb_disp` - int, display on console every verb_disp iteration, 0 for never - -RETURN: -- `x*` : the new `x` vector, at the optimal point -- `f` : a table of all function values: - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at `x*` ---]] -function optim.cmaes(opfunc, x, config, state) - if (x.triu == nil or x.diag == nil) then - error('Unsupported Tensor ' .. x:type() .. 
" please use Float- or DoubleTensor for x") - end - -- process input parameters - local config = config or {} - local state = state or config - local xmean = x:clone():view(-1) -- distribution mean, a flattened copy - local N = xmean:size(1) -- number of objective variables/problem dimension - local sigma = state.sigma -- coordinate wise standard deviation (step size) - local ftarget = state.ftarget -- stop if fitness < ftarget - local maxEval = tonumber(state.maxEval) or 1e3*N^2 - local objfunc = opfunc - local verb_disp = state.verb_disp -- display step size - local min_iterations = state.min_iterations or 1 - - local lambda = state.popsize -- population size, offspring number - -- Strategy parameter setting: Selection - if state.popsize == nil then - lambda = 4 + math.floor(3 * math.log(N)) - end - - local mu = lambda / 2 -- number of parents/points for recombination - local weights = torch.range(0,mu-1):apply(function(i) - return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights - weights:div(weights:sum()) -- normalize recombination weights array - local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i - weights = weights:typeAs(x) - - -- Strategy parameter setting: Adaptation - local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C - local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control - local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C - local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update - local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 - - -- Initialize dynamic (internal) state variables - local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C - local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma - local B = torch.eye(N):typeAs(x) -- B defines the coordinate system - local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling - local C = torch.eye(N):typeAs(x) -- covariance matrix - if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig - error('torch.symeig not available for ' .. x:type() .. 
- " please use Float- or DoubleTensor for x") - end - local candidates = torch.Tensor(lambda,N):typeAs(x) - local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 - local eigeneval = 0 -- tracking the update of B and D - local counteval = 0 - local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination - local fitvals = torch.Tensor(lambda)-- fitness values - local best = BestSolution.new(nil,nil,counteval) - local iteration = 0 -- iteration of the optimize loop - - - local function ask() - --[[return a list of lambda candidate solutions according to - m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) - --]] - -- Eigendecomposition: first update B, D and invsqrtC from C - -- postpone in case to achieve O(N^2) - if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then - eigeneval = counteval - C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry - D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) - D = torch.sqrt(D) -- D contains standard deviations now - invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) - end - for k=1,lambda do --repeat lambda times - local z = D:clone():normal(0,1):cmul(D) - candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) - end - - return candidates - end - - - local function tell(arx) - --[[update the evolution paths and the distribution parameters m, - sigma, and C within CMA-ES. - - Parameters - ---------- - `arx` - a list of solutions, presumably from `ask()` - `fitvals` - the corresponding objective function values --]] - -- bookkeeping, preparation - counteval = counteval + lambda -- slightly artificial to do here - local xold = xmean:clone() - - -- Sort by fitness and compute weighted mean into xmean - local arindex = nil --sorted indices - fitvals, arindex = torch.sort(fitvals) - arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions - - table.insert(f_hist, fitvals[1]) --append best fitness to history - best:update(arx[1], fitvals[1], counteval) - - xmean:zero() - xmean:addmv(arx:t(), weights) --dot product - - -- Cumulation: update evolution paths - local y = xmean - xold - local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) - - local c = (cs * (2-cs) * mueff)^0.5 / sigma - ps = ps - ps * cs + z * c -- exponential decay on ps - local hsig = (torch.sum(torch.pow(ps,2)) / - (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) - hsig = hsig and 1.0 or 0.0 --use binary numbers - - c = (cc * (2-cc) * mueff)^0.5 / sigma - pc = pc - pc * cc + y * c * hsig -- exponential decay on pc - - -- Adapt covariance matrix C - local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) - -- for a minor adjustment to the variance loss by hsig - for i=1,N do - for j=1,N do - local r = torch.range(1,mu) - r:apply(function(k) - return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) - local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update - C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + - c1 * pc[i]*pc[j] + cmu * Cmuij) - end - end - - -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 - sigma = sigma * math.exp(math.min(0.6, - (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) - end - - local function stop() - --[[return satisfied termination conditions in a table like - {'termination reason':value, ...}, for example {'tolfun':1e-12}, - or the empty table {}--]] - local res = {} - if counteval > 0 then - if counteval >= maxEval then - res['evals'] = maxEval - end - if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then - res['ftarget'] = ftarget - end - if torch.max(D) 
> 1e7 * torch.min(D) then - res['condition'] = 1e7 - end - if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then - res['tolfun'] = 1e-12 - end - if sigma * torch.max(D) < 1e-11 then - -- remark: max(D) >= max(diag(C))^0.5 - res['tolx'] = 1e-11 - end - end - return res - end - - local function disp(verb_modulo) - --[[display some iteration info--]] - if verb_disp == 0 then - return nil - end - local iteration = counteval / lambda - - if iteration == 1 or iteration % (10*verb_modulo) == 0 then - print('evals:\t ax-ratio max(std) f-value') - end - if iteration <= 2 or iteration % verb_modulo == 0 then - local max_std = math.sqrt(torch.max(torch.diag(C))) - print(tostring(counteval).. ': ' .. - string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) - .. tostring(fitvals[1])) - end - - return nil - end - - while next(stop()) == nil or iteration < min_iterations do - iteration = iteration + 1 - - local X = ask() -- deliver candidate solutions - for i=1, lambda do - -- put candidate tensor back in input shape and evaluate in opfunc - local candidate = X[i]:viewAs(x) - fitvals[i] = objfunc(candidate) - end - - tell(X) - disp(verb_disp) - end - - local bestmu, f, c = best:get() - if verb_disp > 0 then - for k, v in pairs(stop()) do - print('termination by', k, '=', v) - end - print('best f-value =', f) - print('solution = ') - print(bestmu) - print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) - end - table.insert(f_hist, f) - - return bestmu, f_hist, counteval -end - - - -BestSolution.__index = BestSolution -function BestSolution.new(x, f, evals) - local self = setmetatable({}, BestSolution) - self.x = x - self.f = f - self.evals = evals - return self -end - -function BestSolution.update(self, arx, arf, evals) - --[[initialize the best solution with `x`, `f`, and `evals`. - Better solutions have smaller `f`-values.--]] - if self.f == nil or arf < self.f then - self.x = arx:clone() - self.f = arf - self.evals = evals - end - return self -end - -function BestSolution.get(self) - return self.x, self.f, self.evals -end diff --git a/contrib/torch/optim/de.lua b/contrib/torch/optim/de.lua deleted file mode 100644 index 1e8e8001d..000000000 --- a/contrib/torch/optim/de.lua +++ /dev/null @@ -1,109 +0,0 @@ ---[[ An implementation of `DE` (Differential Evolution), - -ARGS: - - -`opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX. Note that df/dX is not used - -`x` : the initial point - -`state.popsize`: population size. 
If this is left empty, 10*d will be used - -`state.scaleFactor`: float, usually between 0.4 and 1 - -`state.crossoverRate`: float, usually between 0.1 and 0.9 - -`state.maxEval`: int, maximal number of function evaluations - -RETURN: - - `x*` : the new `x` vector, at the optimal point - - `f` : a table of all function values: - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at `x*` -]] - -require 'torch' - -function optim.de(opfunc, x, config, state) - -- process input parameters - local config = config or {} - local state = state - local popsize = config.popsize -- population size - local scaleFactor = config.scaleFactor -- scale factor - local crossoverRate = config.crossoverRate -- crossover rate - local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations - local maxRegion = config.maxRegion -- upper bound of search region - local minRegion = config.minRegion -- lower bound of search region - local xmean = x:clone():view(-1) -- distribution mean, a flattened copy - local D = xmean:size(1) -- number of objective variables/problem dimension - - if config.popsize == nil then - popsize = 10 * D - end - if config.maxRegion == nil then - maxRegion = 30 - end - if config.minRegion == nil then - minRegion = -30 - end - - -- Initialize population - local fx = x.new(maxFEs) - local pop = x.new(popsize, D) - local children = x.new(popsize, D) - local fitness = x.new(popsize) - local children_fitness = x.new(popsize) - local fes = 1 -- number of function evaluations - local best_fitness - local best_solution = x.new(D) - - -- Initialize population and evaluate the its fitness value - local gen = torch.Generator() - torch.manualSeed(gen, 1) - - pop:uniform(gen, minRegion, maxRegion) - for i = 1, popsize do - fitness[i] = opfunc(pop[i]) - fx[fes] = fitness[i] - fes = fes + 1 - end - - -- Find the best solution - local index - best_fitness, index = fitness:max(1) - best_fitness = best_fitness[1] - index = index[1] - best_solution:copy(pop[index]) - - -- Main loop - while fes < maxFEs do - local r1, r2 - for i = 1, popsize do - repeat - r1 = torch.random(gen, 1, popsize) - until(r1 ~= i) - repeat - r2 = torch.random(gen, 1, popsize) - until(r2 ~= r1 and r2 ~= i) - - local jrand = torch.random(gen, 1, D) - for j = 1, D do - if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then - children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j]) - else - children[i][j] = pop[i][j] - end - end - children_fitness[i] = opfunc(children[i]) - fx[fes] = children_fitness[i] - fes = fes + 1 - end - - for i = 1, popsize do - if children_fitness[i] <= fitness[i] then - pop[i]:copy(children[i]) - fitness[i] = children_fitness[i] - if fitness[i] < best_fitness then - best_fitness = fitness[i] - best_solution:copy(children[i]) - end - end - end - end - return best_solution, fx -end diff --git a/contrib/torch/optim/fista.lua b/contrib/torch/optim/fista.lua deleted file mode 100644 index c8c6f5e43..000000000 --- a/contrib/torch/optim/fista.lua +++ /dev/null @@ -1,192 +0,0 @@ ---[[ FISTA with backtracking line search - -- `f` : smooth function -- `g` : non-smooth function -- `pl` : minimizer of intermediate problem Q(x,y) -- `xinit` : initial point -- `params` : table of parameters (**optional**) -- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) -- `params.Lstep` : step size multiplier at each iteration (1.5) -- `params.maxiter` : max number of iterations (50) -- `params.maxline` : max number of line search 
iterations per iteration (20) -- `params.errthres`: Error thershold for convergence check (1e-4) -- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) -- `params.verbose` : store each iteration solution and print detailed info (false) - -On output, `params` will contain these additional fields that can be reused. - -- `params.L` : last used L value will be written. - -These are temporary storages needed by the algo and if the same params object is -passed a second time, these same storages will be used without new allocation. - -- `params.xkm` : previous iterarion point -- `params.y` : fista iteration -- `params.ply` : ply = pl(y - 1/L grad(f)) - -Returns the solution x and history of {function evals, number of line search ,...} - -Algorithm is published in - - @article{beck-fista-09, - Author = {Beck, Amir and Teboulle, Marc}, - Journal = {SIAM J. Img. Sci.}, - Number = {1}, - Pages = {183--202}, - Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems}, - Volume = {2}, - Year = {2009}} -]] -function optim.FistaLS(f, g, pl, xinit, params) - - local params = params or {} - local L = params.L or 0.1 - local Lstep = params.Lstep or 1.5 - local maxiter = params.maxiter or 50 - local maxline = params.maxline or 20 - local errthres = params.errthres or 1e-4 - local doFistaUpdate = params.doFistaUpdate - local verbose = params.verbose - - -- temporary allocations - params.xkm = params.xkm or torch.Tensor() - params.y = params.y or torch.Tensor() - params.ply = params.ply or torch.Tensor() - local xkm = params.xkm -- previous iteration - local y = params.y -- fista iteration - local ply = params.ply -- soft shrinked y - - -- we start from all zeros - local xk = xinit - xkm:resizeAs(xk):zero() - ply:resizeAs(xk):zero() - y:resizeAs(xk):zero() - - local history = {} -- keep track of stuff - local niter = 0 -- number of iterations done - local converged = false -- are we done? 
- local tk = 1 -- momentum param for FISTA - local tkp = 0 - - - local gy = g(y) - local fval = math.huge -- fval = f+g - while not converged and niter < maxiter do - - -- run through smooth function (code is input, input is target) - -- get derivatives from smooth function - local fy,gfy = f(y,'dx') - --local gfy = f(y) - - local fply = 0 - local gply = 0 - local Q = 0 - - ---------------------------------------------- - -- do line search to find new current location starting from fista loc - local nline = 0 - local linesearchdone = false - while not linesearchdone do - -- take a step in gradient direction of smooth function - ply:copy(y) - ply:add(-1/L,gfy) - - -- and solve for minimum of auxiliary problem - pl(ply,L) - -- this is candidate for new current iteration - xk:copy(ply) - - -- evaluate this point F(ply) - fply = f(ply) - - -- ply - y - ply:add(-1, y) - -- <ply-y , \Grad(f(y))> - local Q2 = gfy:dot(ply) - -- L/2 ||beta-y||^2 - local Q3 = L/2 * ply:dot(ply) - -- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta) - Q = fy + Q2 + Q3 - - if verbose then - print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3)) - end - -- check if F(beta) < Q(pl(y),\t) - if fply <= Q then --and Fply + Gply <= F then - -- now evaluate G here - linesearchdone = true - elseif nline >= maxline then - linesearchdone = true - xk:copy(xkm) -- if we can't find a better point, current iter = previous iter - --print('oops') - else - L = L * Lstep - end - nline = nline + 1 - end - -- end line search - --------------------------------------------- - - --------------------------------------------- - -- FISTA - --------------------------------------------- - if doFistaUpdate then - -- do the FISTA step - tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 - -- x(k-1) = x(k-1) - x(k) - xkm:add(-1,xk) - -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) - y:copy(xk) - y:add( (1-tk)/tkp , xkm) - -- store for next iterations - -- x(k-1) = x(k) - xkm:copy(xk) - else - y:copy(xk) - end - -- t(k) = t(k+1) - tk = tkp - fply = f(y) - gply = g(y) - if verbose then - print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) - end - - niter = niter + 1 - - -- bookeeping - fval = fply + gply - history[niter] = {} - history[niter].nline = nline - history[niter].L = L - history[niter].F = fval - history[niter].Fply = fply - history[niter].Gply = gply - history[niter].Q = Q - params.L = L - if verbose then - history[niter].xk = xk:clone() - history[niter].y = y:clone() - end - - -- are we done? 
- if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then - converged = true - xinit:copy(y) - return y,history - end - - if niter >= maxiter then - xinit:copy(y) - return y,history - end - - --if niter > 1 and history[niter].F > history[niter-1].F then - --print(niter, 'This was supposed to be a convex function, we are going up') - --converged = true - --return xk,history - --end - end - error('not supposed to be here') -end - diff --git a/contrib/torch/optim/init.lua b/contrib/torch/optim/init.lua deleted file mode 100644 index a045bd8a2..000000000 --- a/contrib/torch/optim/init.lua +++ /dev/null @@ -1,33 +0,0 @@ - -require 'torch' - -optim = {} - --- optimizations -require('optim.sgd') -require('optim.cg') -require('optim.asgd') -require('optim.nag') -require('optim.fista') -require('optim.lbfgs') -require('optim.adagrad') -require('optim.rprop') -require('optim.adam') -require('optim.adamax') -require('optim.rmsprop') -require('optim.adadelta') -require('optim.cmaes') -require('optim.de') - --- line search functions -require('optim.lswolfe') - --- helpers -require('optim.polyinterp') -require('optim.checkgrad') - --- tools -require('optim.ConfusionMatrix') -require('optim.Logger') - -return optim diff --git a/contrib/torch/optim/lbfgs.lua b/contrib/torch/optim/lbfgs.lua deleted file mode 100644 index d850fcbb3..000000000 --- a/contrib/torch/optim/lbfgs.lua +++ /dev/null @@ -1,268 +0,0 @@ ---[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) - -This implementation of L-BFGS relies on a user-provided line -search function (state.lineSearch). If this function is not -provided, then a simple learningRate is used to produce fixed -size steps. Fixed size steps are much less costly than line -searches, and can be useful for stochastic problems. - -The learning rate is used even when a line search is provided. -This is also useful for large-scale stochastic problems, where -opfunc is a noisy approximation of f(x). In that case, the learning -rate allows a reduction of confidence in the step size. 
- -ARGS: - -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.maxIter` : Maximum number of iterations allowed -- `state.maxEval` : Maximum number of function evaluations -- `state.tolFun` : Termination tolerance on the first-order optimality -- `state.tolX` : Termination tol on progress in terms of func/param changes -- `state.lineSearch` : A line search function -- `state.learningRate` : If no line search provided, then a fixed step size is used - -RETURN: -- `x*` : the new `x` vector, at the optimal point -- `f` : a table of all function values: - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at `x*` - -(Clement Farabet, 2012) -]] -function optim.lbfgs(opfunc, x, config, state) - -- get/update state - local config = config or {} - local state = state or config - local maxIter = tonumber(config.maxIter) or 20 - local maxEval = tonumber(config.maxEval) or maxIter*1.25 - local tolFun = config.tolFun or 1e-5 - local tolX = config.tolX or 1e-9 - local nCorrection = config.nCorrection or 100 - local lineSearch = config.lineSearch - local lineSearchOpts = config.lineSearchOptions - local learningRate = config.learningRate or 1 - local isverbose = config.verbose or false - - state.funcEval = state.funcEval or 0 - state.nIter = state.nIter or 0 - - -- verbose function - local verbose - if isverbose then - verbose = function(...) print('<optim.lbfgs> ', ...) end - else - verbose = function() end - end - - -- import some functions - local abs = math.abs - local min = math.min - - -- evaluate initial f(x) and df/dx - local f,g = opfunc(x) - local f_hist = {f} - local currentFuncEval = 1 - state.funcEval = state.funcEval + 1 - local p = g:size(1) - - -- check optimality of initial point - state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 - tmp1:copy(g):abs() - if tmp1:sum() <= tolFun then - -- optimality condition below tolFun - verbose('optimality condition below tolFun') - return x,f_hist - end - - if not state.dir_bufs then - -- reusable buffers for y's and s's, and their histories - verbose('creating recyclable direction/step/history buffers') - state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) - state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) - for i=1,#state.dir_bufs do - state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) - state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) - end - end - - -- variables cached in state (for tracing) - local d = state.d - local t = state.t - local old_dirs = state.old_dirs - local old_stps = state.old_stps - local Hdiag = state.Hdiag - local g_old = state.g_old - local f_old = state.f_old - - -- optimize for a max of maxIter iterations - local nIter = 0 - while nIter < maxIter do - -- keep track of nb of iterations - nIter = nIter + 1 - state.nIter = state.nIter + 1 - - ------------------------------------------------------------ - -- compute gradient descent direction - ------------------------------------------------------------ - if state.nIter == 1 then - d = g:clone():mul(-1) -- -g - old_dirs = {} - old_stps = {} - Hdiag = 1 - else - -- do lbfgs update (update memory) - local y = table.remove(state.dir_bufs) -- pop - local s = table.remove(state.stp_bufs) - y:add(g, -1, g_old) -- g - g_old - s:mul(d, t) -- d*t - local ys = y:dot(s) -- y*s - if 
ys > 1e-10 then - -- updating memory - if #old_dirs == nCorrection then - -- shift history by one (limited-memory) - local removed1 = table.remove(old_dirs, 1) - local removed2 = table.remove(old_stps, 1) - table.insert(state.dir_bufs, removed1) - table.insert(state.stp_bufs, removed2) - end - - -- store new direction/step - table.insert(old_dirs, s) - table.insert(old_stps, y) - - -- update scale of initial Hessian approximation - Hdiag = ys / y:dot(y) -- (y*y) - else - -- put y and s back into the buffer pool - table.insert(state.dir_bufs, y) - table.insert(state.stp_bufs, s) - end - - -- compute the approximate (L-BFGS) inverse Hessian - -- multiplied by the gradient - local k = #old_dirs - - -- need to be accessed element-by-element, so don't re-type tensor: - state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro - for i = 1,k do - ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) - end - - -- iteration in L-BFGS loop collapsed to use just one buffer - local q = tmp1 -- reuse tmp1 for the q buffer - -- need to be accessed element-by-element, so don't re-type tensor: - state.al = state.al or torch.zeros(nCorrection) local al = state.al - - q:mul(g, -1) -- -g - for i = k,1,-1 do - al[i] = old_dirs[i]:dot(q) * ro[i] - q:add(-al[i], old_stps[i]) - end - - -- multiply by initial Hessian - r = d -- share the same buffer, since we don't need the old d - r:mul(q, Hdiag) -- q[1] * Hdiag - for i = 1,k do - local be_i = old_stps[i]:dot(r) * ro[i] - r:add(al[i]-be_i, old_dirs[i]) - end - -- final direction is in r/d (same object) - end - g_old = g_old or g:clone() - g_old:copy(g) - f_old = f - - ------------------------------------------------------------ - -- compute step length - ------------------------------------------------------------ - -- directional derivative - local gtd = g:dot(d) -- g * d - - -- check that progress can be made along that direction - if gtd > -tolX then - break - end - - -- reset initial guess for step size - if state.nIter == 1 then - tmp1:copy(g):abs() - t = min(1,1/tmp1:sum()) * learningRate - else - t = learningRate - end - - -- optional line search: user function - local lsFuncEval = 0 - if lineSearch and type(lineSearch) == 'function' then - -- perform line search, using user function - f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) - table.insert(f_hist, f) - else - -- no line search, simply move with fixed-step - x:add(t,d) - if nIter ~= maxIter then - -- re-evaluate function only if not in last iteration - -- the reason we do this: in a stochastic setting, - -- no use to re-evaluate that function here - f,g = opfunc(x) - lsFuncEval = 1 - table.insert(f_hist, f) - end - end - - -- update func eval - currentFuncEval = currentFuncEval + lsFuncEval - state.funcEval = state.funcEval + lsFuncEval - - ------------------------------------------------------------ - -- check conditions - ------------------------------------------------------------ - if nIter == maxIter then - -- no use to run tests - verbose('reached max number of iterations') - break - end - - if currentFuncEval >= maxEval then - -- max nb of function evals - verbose('max nb of function evals') - break - end - - tmp1:copy(g):abs() - if tmp1:sum() <= tolFun then - -- check optimality - verbose('optimality condition below tolFun') - break - end - - tmp1:copy(d):mul(t):abs() - if tmp1:sum() <= tolX then - -- step size below tolX - verbose('step size below tolX') - break - end - - if abs(f-f_old) < tolX then - -- function value changing less than tolX - verbose('function value 
changing less than tolX') - break - end - end - - -- save state - state.old_dirs = old_dirs - state.old_stps = old_stps - state.Hdiag = Hdiag - state.g_old = g_old - state.f_old = f_old - state.t = t - state.d = d - - -- return optimal x, and history of f(x) - return x,f_hist,currentFuncEval -end diff --git a/contrib/torch/optim/lswolfe.lua b/contrib/torch/optim/lswolfe.lua deleted file mode 100644 index 0afbdbe8b..000000000 --- a/contrib/torch/optim/lswolfe.lua +++ /dev/null @@ -1,192 +0,0 @@ ---[[ A Line Search satisfying the Wolfe conditions - -ARGS: -- `opfunc` : a function (the objective) that takes a single input (X), - the point of evaluation, and returns f(X) and df/dX -- `x` : initial point / starting location -- `t` : initial step size -- `d` : descent direction -- `f` : initial function value -- `g` : gradient at initial location -- `gtd` : directional derivative at starting location -- `options.c1` : sufficient decrease parameter -- `options.c2` : curvature parameter -- `options.tolX` : minimum allowable step length -- `options.maxIter` : maximum nb of iterations - -RETURN: -- `f` : function value at x+t*d -- `g` : gradient value at x+t*d -- `x` : the next x (=x+t*d) -- `t` : the step length -- `lsFuncEval` : the number of function evaluations -]] -function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) - -- options - options = options or {} - local c1 = options.c1 or 1e-4 - local c2 = options.c2 or 0.9 - local tolX = options.tolX or 1e-9 - local maxIter = options.maxIter or 20 - local isverbose = options.verbose or false - - -- some shortcuts - local abs = torch.abs - local min = math.min - local max = math.max - - -- verbose function - local function verbose(...) - if isverbose then print('<optim.lswolfe> ', ...) end - end - - -- evaluate objective and gradient using initial step - local x_init = x:clone() - x:add(t,d) - local f_new,g_new = opfunc(x) - local lsFuncEval = 1 - local gtd_new = g_new * d - - -- bracket an interval containing a point satisfying the Wolfe - -- criteria - local LSiter,t_prev,done = 0,0,false - local f_prev,g_prev,gtd_prev = f,g:clone(),gtd - local bracket,bracketFval,bracketGval - while LSiter < maxIter do - -- check conditions: - if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then - bracket = x.new{t_prev,t} - bracketFval = x.new{f_prev,f_new} - bracketGval = x.new(2,g_new:size(1)) - bracketGval[1] = g_prev - bracketGval[2] = g_new - break - - elseif abs(gtd_new) <= -c2*gtd then - bracket = x.new{t} - bracketFval = x.new{f_new} - bracketGval = x.new(1,g_new:size(1)) - bracketGval[1] = g_new - done = true - break - - elseif gtd_new >= 0 then - bracket = x.new{t_prev,t} - bracketFval = x.new{f_prev,f_new} - bracketGval = x.new(2,g_new:size(1)) - bracketGval[1] = g_prev - bracketGval[2] = g_new - break - - end - - -- interpolate: - local tmp = t_prev - t_prev = t - local minStep = t + 0.01*(t-tmp) - local maxStep = t*10 - t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev}, - {t,f_new,gtd_new}}, - minStep, maxStep) - - -- next step: - f_prev = f_new - g_prev = g_new:clone() - gtd_prev = gtd_new - x[{}] = x_init - x:add(t,d) - f_new,g_new = opfunc(x) - lsFuncEval = lsFuncEval + 1 - gtd_new = g_new * d - LSiter = LSiter + 1 - end - - -- reached max nb of iterations? - if LSiter == maxIter then - bracket = x.new{0,t} - bracketFval = x.new{f,f_new} - bracketGval = x.new(2,g_new:size(1)) - bracketGval[1] = g - bracketGval[2] = g_new - end - - -- zoom phase: we now have a point satisfying the criteria, or - -- a bracket around it. 
We refine the bracket until we find the - exact point satisfying the criteria - local insufProgress = false - local LOposRemoved = 0 - while not done and LSiter < maxIter do - -- find high and low points in bracket - local f_LO,LOpos = bracketFval:min(1) - LOpos = LOpos[1] f_LO = f_LO[1] - local HIpos = -LOpos+3 - - -- compute new trial value - t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d}, - {bracket[2],bracketFval[2],bracketGval[2]*d}}) - - -- test that we are making sufficient progress - if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then - if insufProgress or t>=bracket:max() or t <= bracket:min() then - if abs(t-bracket:max()) < abs(t-bracket:min()) then - t = bracket:max()-0.1*(bracket:max()-bracket:min()) - else - t = bracket:min()+0.1*(bracket:max()-bracket:min()) - end - insufProgress = false - else - insufProgress = true - end - else - insufProgress = false - end - - -- Evaluate new point - x[{}] = x_init - x:add(t,d) - f_new,g_new = opfunc(x) - lsFuncEval = lsFuncEval + 1 - gtd_new = g_new * d - LSiter = LSiter + 1 - if f_new > f + c1*t*gtd or f_new >= f_LO then - -- Armijo condition not satisfied or not lower than lowest point - bracket[HIpos] = t - bracketFval[HIpos] = f_new - bracketGval[HIpos] = g_new - else - if abs(gtd_new) <= - c2*gtd then - -- Wolfe conditions satisfied - done = true - elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then - -- Old HI becomes new LO - bracket[HIpos] = bracket[LOpos] - bracketFval[HIpos] = bracketFval[LOpos] - bracketGval[HIpos] = bracketGval[LOpos] - end - -- New point becomes new LO - bracket[LOpos] = t - bracketFval[LOpos] = f_new - bracketGval[LOpos] = g_new - end - - -- done? - if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then - break - end - end - - -- be verbose - if LSiter == maxIter then - verbose('reached max number of iterations') - end - - -- return stuff - local _,LOpos = bracketFval:min(1) - LOpos = LOpos[1] - t = bracket[LOpos] - f_new = bracketFval[LOpos] - g_new = bracketGval[LOpos] - x[{}] = x_init - x:add(t,d) - return f_new,g_new,x,t,lsFuncEval -end diff --git a/contrib/torch/optim/nag.lua b/contrib/torch/optim/nag.lua deleted file mode 100644 index 875d81e4c..000000000 --- a/contrib/torch/optim/nag.lua +++ /dev/null @@ -1,86 +0,0 @@ ----------------------------------------------------------------------- --- An implementation of SGD adapted with features of Nesterov's --- Accelerated Gradient method, based on the paper --- On the Importance of Initialization and Momentum in Deep Learning --- Sutskever et
al., ICML 2013 --- --- ARGS: --- opfunc : a function that takes a single input (X), the point of --- evaluation, and returns f(X) and df/dX --- x : the initial point --- state : a table describing the state of the optimizer; after each --- call the state is modified --- state.learningRate : learning rate --- state.learningRateDecay : learning rate decay --- state.weightDecay : weight decay --- state.momentum : momentum --- state.learningRates : vector of individual learning rates --- --- RETURN: --- x : the new x vector --- f(x) : the function, evaluated before the update --- --- (Dilip Krishnan, 2013) --- - -function optim.nag(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 1e-3 - local lrd = config.learningRateDecay or 0 - local wd = config.weightDecay or 0 - local mom = config.momentum or 0.9 - local damp = config.dampening or mom - local lrs = config.learningRates - state.evalCounter = state.evalCounter or 0 - local nevals = state.evalCounter - - if mom <= 0 then - error('Momentum must be positive for Nesterov Accelerated Gradient') - end - - -- (1) evaluate f(x) and df/dx - -- first step in the direction of the momentum vector - - if state.dfdx then - x:add(mom, state.dfdx) - end - -- then compute gradient at that point - -- comment out the above line to get the original SGD - local fx,dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- (3) learning rate decay (annealing) - local clr = lr / (1 + nevals*lrd) - - -- (4) apply momentum - if not state.dfdx then - state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) - else - state.dfdx:mul(mom) - end - - -- (5) parameter update with single or individual learning rates - if lrs then - if not state.deltaParameters then - state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) - end - state.deltaParameters:copy(lrs):cmul(dfdx) - x:add(-clr, state.deltaParameters) - state.dfdx:add(-clr, state.deltaParameters) - else - x:add(-clr, dfdx) - state.dfdx:add(-clr, dfdx) - end - - -- (6) update evaluation counter - state.evalCounter = state.evalCounter + 1 - - -- return x, f(x) before optimization - return x,{fx} -end diff --git a/contrib/torch/optim/polyinterp.lua b/contrib/torch/optim/polyinterp.lua deleted file mode 100644 index c5026bf04..000000000 --- a/contrib/torch/optim/polyinterp.lua +++ /dev/null @@ -1,212 +0,0 @@ -local function isreal(x) - return x == x -end - -local function isnan(x) - return not x == x -end - -local function roots(c) - local tol=1e-12 - c[torch.lt(torch.abs(c),tol)]=0 - - local nonzero = torch.ne(c,0) - if nonzero:max() == 0 then - return 0 - end - - -- first non-zero - local _,pos = torch.max(nonzero,1) - pos = pos[1] - c=c[{ {pos,-1} }] - - local nz = 0 - for i=c:size(1),1,-1 do - if c[i] ~= 0 then - break - else - nz = nz + 1 - end - end - c=c[{ {1,c:size(1)-nz} }] - - local n = c:size(1)-1 - if n == 1 then - local e = c.new({{-c[2]/c[1], 0}}) - if nz > 0 then - return torch.cat(e, c.new(nz, 2):zero(), 1) - else - return e - end - elseif n > 1 then - local A = torch.diag(c.new(n-1):fill(1),-1) - A[1] = -c[{ {2,n+1} }]/c[1]; - local e = torch.eig(A,'N') - if nz > 0 then - return torch.cat(e, c.new(nz,2):zero(), 1) - else - return e - end - else - return c.new(nz,2):zero() - end -end - -local function real(x) - if type(x) == number then return x end - return x[{ {} , 1}] -end - -local function imag(x) - if type(x) == 'number' then return 0 end - if x:nDimension() 
== 1 then - return x.new(x:size(1)):zero() - else - return x[{ {}, 2}] - end -end - -local function polyval(p,x) - local pwr = p:size(1) - if type(x) == 'number' then - local val = 0 - p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) - return val - else - local val = x.new(x:size(1)) - p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) - return val - end -end - ----------------------------------------------------------------------- --- Minimum of interpolating polynomial based on function and --- derivative values --- --- ARGS: --- points : N triplets (x,f,g), must be a Tensor --- xmin : min value that brackets minimum (default: min of points) --- xmax : max value that brackets maximum (default: max of points) --- --- RETURN: --- minPos : position of minimum --- -function optim.polyinterp(points,xminBound,xmaxBound) - -- locals - local sqrt = torch.sqrt - local mean = torch.mean - local max = math.max - local min = math.min - - -- nb of points / order of polynomial - local nPoints = points:size(1) - local order = nPoints*2-1 - - -- returned values - local minPos - - -- Code for most common case: - -- + cubic interpolation of 2 points w/ function and derivative values for both - -- + no xminBound/xmaxBound - if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then - -- Solution in this case (where x2 is the farthest point): - -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); - -- d2 = sqrt(d1^2 - g1*g2); - -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); - -- t_new = min(max(minPos,x1),x2); - local minVal,minPos = points[{ {},1 }]:min(1) - minVal = minVal[1] minPos = minPos[1] - local notMinPos = -minPos+3; - - local d1 = points[{minPos,3}] + points[{notMinPos,3}] - - 3*(points[{minPos,2}]-points[{notMinPos,2}]) - / (points[{minPos,1}]-points[{notMinPos,1}]); - local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); - - if isreal(d2) then -- isreal() - local t = points[{notMinPos,1}] - (points[{notMinPos,1}] - - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) - / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) - - minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) - else - minPos = mean(points[{{},1}]) - end - return minPos - end - - -- TODO: get the code below to work! - --error('<optim.polyinterp> extrapolation not implemented yet...') - - -- Compute Bounds of Interpolation Area - local xmin = points[{{},1}]:min() - local xmax = points[{{},1}]:max() - xminBound = xminBound or xmin - xmaxBound = xmaxBound or xmax - - -- Add constraints on function values - local A = points.new(nPoints*2,order+1):zero() - local b = points.new(nPoints*2,1):zero() - for i = 1,nPoints do - local constraint = points.new(order+1):zero() - for j = order,0,-1 do - constraint[order-j+1] = points[{i,1}]^j - end - A[i] = constraint - b[i] = points[{i,2}] - end - - -- Add constraints based on derivatives - for i = 1,nPoints do - local constraint = points.new(order+1):zero() - for j = 1,order do - constraint[j] = (order-j+1)*points[{i,1}]^(order-j) - end - A[nPoints+i] = constraint - b[nPoints+i] = points[{i,3}] - end - - -- Find interpolating polynomial - local res = torch.gels(b,A) - local params = res[{ {1,nPoints*2} }]:squeeze() - - params[torch.le(torch.abs(params),1e-12)]=0 - - -- Compute Critical Points - local dParams = points.new(order):zero(); - for i = 1,params:size(1)-1 do - dParams[i] = params[i]*(order-i+1) - end - - -- nan/inf? 
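-- (a coefficient of the derivative polynomial is NaN when it is not equal to itself, and infinite when it equals math.huge; if either occurs, the root-finding step below is skipped and only the bracketing bounds and the input points are examined as candidate minima)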
- local nans = false - if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then - nans = true - end - - local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}]) - if not nans then - local cproots = roots(dParams) - local cpi = points.new(cp:size(1),2):zero() - cpi[{ {1,cp:size(1)} , 1 }] = cp - cp = torch.cat(cpi,cproots,1) - end - - -- Test Critical Points - local fmin = math.huge - -- Default to Bisection if no critical points valid: - minPos = (xminBound+xmaxBound)/2 - for i = 1,cp:size(1) do - local xCP = cp[{ {i,i} , {} }] - local ixCP = imag(xCP)[1] - local rxCP = real(xCP)[1] - if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then - local fCP = polyval(params,rxCP) - if fCP < fmin then - minPos = rxCP - fmin = fCP - end - end - end - return minPos,fmin -end diff --git a/contrib/torch/optim/rmsprop.lua b/contrib/torch/optim/rmsprop.lua deleted file mode 100644 index aa562006a..000000000 --- a/contrib/torch/optim/rmsprop.lua +++ /dev/null @@ -1,58 +0,0 @@ ---[[ An implementation of RMSprop - -ARGS: - -- 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX -- 'x' : the initial point -- 'config` : a table with configuration parameters for the optimizer -- 'config.learningRate' : learning rate -- 'config.alpha' : smoothing constant -- 'config.epsilon' : value with which to initialise m -- 'config.weightDecay' : weight decay -- 'state' : a table describing the state of the optimizer; - after each call the state is modified -- 'state.m' : leaky sum of squares of parameter gradients, -- 'state.tmp' : and the square root (with epsilon smoothing) - -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update - -]] - -function optim.rmsprop(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 1e-2 - local alpha = config.alpha or 0.99 - local epsilon = config.epsilon or 1e-8 - local wd = config.weightDecay or 0 - local mfill = config.initialMean or 0 - - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end - - -- (3) initialize mean square values and square gradient storage - if not state.m then - state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill) - state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) - end - - -- (4) calculate new (leaky) mean squared values - state.m:mul(alpha) - state.m:addcmul(1.0-alpha, dfdx, dfdx) - - -- (5) perform update - state.tmp:sqrt(state.m):add(epsilon) - x:addcdiv(-lr, dfdx, state.tmp) - - -- return x*, f(x) before optimization - return x, {fx} -end diff --git a/contrib/torch/optim/rprop.lua b/contrib/torch/optim/rprop.lua deleted file mode 100644 index d7af16429..000000000 --- a/contrib/torch/optim/rprop.lua +++ /dev/null @@ -1,103 +0,0 @@ ---[[ A plain implementation of RPROP - -ARGS: -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.stepsize` : initial step size, common to all components -- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) -- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) -- `state.stepsizemax` : maximum stepsize allowed (default 50) -- `state.stepsizemin` : minimum stepsize allowed (default 1e-6) -- 
`state.niter` : number of iterations (default 1) - -RETURN: -- `x` : the new x vector -- `f(x)` : the function, evaluated before the update - -(Martin Riedmiller, Koray Kavukcuoglu 2013) ---]] -function optim.rprop(opfunc, x, config, state) - if config == nil and state == nil then - print('no state table RPROP initializing') - end - -- (0) get/update state - local config = config or {} - local state = state or config - local stepsize = config.stepsize or 0.1 - local etaplus = config.etaplus or 1.2 - local etaminus = config.etaminus or 0.5 - local stepsizemax = config.stepsizemax or 50.0 - local stepsizemin = config.stepsizemin or 1E-06 - local niter = config.niter or 1 - - local hfx = {} - - for i=1,niter do - - -- (1) evaluate f(x) and df/dx - local fx,dfdx = opfunc(x) - - -- init temp storage - if not state.delta then - state.delta = dfdx.new(dfdx:size()):zero() - state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) - state.sign = dfdx.new(dfdx:size()) - state.psign = torch.ByteTensor(dfdx:size()) - state.nsign = torch.ByteTensor(dfdx:size()) - state.zsign = torch.ByteTensor(dfdx:size()) - state.dminmax = torch.ByteTensor(dfdx:size()) - if torch.type(x)=='torch.CudaTensor' then - -- Push to GPU - state.psign = state.psign:cuda() - state.nsign = state.nsign:cuda() - state.zsign = state.zsign:cuda() - state.dminmax = state.dminmax:cuda() - end - end - - -- sign of derivative from last step to this one - torch.cmul(state.sign, dfdx, state.delta) - torch.sign(state.sign, state.sign) - - -- get indices of >0, <0 and ==0 entries - state.sign.gt(state.psign, state.sign, 0) - state.sign.lt(state.nsign, state.sign, 0) - state.sign.eq(state.zsign, state.sign, 0) - - -- get step size updates - state.sign[state.psign] = etaplus - state.sign[state.nsign] = etaminus - state.sign[state.zsign] = 1 - - -- update stepsizes with step size updates - state.stepsize:cmul(state.sign) - - -- threshold step sizes - -- >50 => 50 - state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) - state.stepsize[state.dminmax] = stepsizemax - -- <1e-6 ==> 1e-6 - state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) - state.stepsize[state.dminmax] = stepsizemin - - -- for dir<0, dfdx=0 - -- for dir>=0 dfdx=dfdx - dfdx[state.nsign] = 0 - -- state.sign = sign(dfdx) - torch.sign(state.sign,dfdx) - - -- update weights - x:addcmul(-1,state.sign,state.stepsize) - - -- update state.dfdx with current dfdx - state.delta:copy(dfdx) - - table.insert(hfx,fx) - end - - -- return x*, f(x) before optimization - return x,hfx -end diff --git a/contrib/torch/optim/sgd.lua b/contrib/torch/optim/sgd.lua deleted file mode 100644 index e21c696a6..000000000 --- a/contrib/torch/optim/sgd.lua +++ /dev/null @@ -1,90 +0,0 @@ ---[[ A plain implementation of SGD - -ARGS: - -- `opfunc` : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX -- `x` : the initial point -- `config` : a table with configuration parameters for the optimizer -- `config.learningRate` : learning rate -- `config.learningRateDecay` : learning rate decay -- `config.weightDecay` : weight decay -- `config.weightDecays` : vector of individual weight decays -- `config.momentum` : momentum -- `config.dampening` : dampening for momentum -- `config.nesterov` : enables Nesterov momentum -- `config.learningRates` : vector of individual learning rates -- `state` : a table describing the state of the optimizer; after each - call the state is modified -- `state.evalCounter` : evaluation counter (optional: 0, by default) - -RETURN: -- 
`x` : the new x vector -- `f(x)` : the function, evaluated before the update - -(Clement Farabet, 2012) -]] -function optim.sgd(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 1e-3 - local lrd = config.learningRateDecay or 0 - local wd = config.weightDecay or 0 - local mom = config.momentum or 0 - local damp = config.dampening or mom - local nesterov = config.nesterov or false - local lrs = config.learningRates - local wds = config.weightDecays - state.evalCounter = state.evalCounter or 0 - local nevals = state.evalCounter - assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") - - -- (1) evaluate f(x) and df/dx - local fx,dfdx = opfunc(x) - - -- (2) weight decay with single or individual parameters - if wd ~= 0 then - dfdx:add(wd, x) - elseif wds then - if not state.decayParameters then - state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) - end - state.decayParameters:copy(wds):cmul(x) - dfdx:add(state.decayParameters) - end - - -- (3) apply momentum - if mom ~= 0 then - if not state.dfdx then - state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) - else - state.dfdx:mul(mom):add(1-damp, dfdx) - end - if nesterov then - dfdx:add(mom, state.dfdx) - else - dfdx = state.dfdx - end - end - - -- (4) learning rate decay (annealing) - local clr = lr / (1 + nevals*lrd) - - -- (5) parameter update with single or individual learning rates - if lrs then - if not state.deltaParameters then - state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) - end - state.deltaParameters:copy(lrs):cmul(dfdx) - x:add(-clr, state.deltaParameters) - else - x:add(-clr, dfdx) - end - - -- (6) update evaluation counter - state.evalCounter = state.evalCounter + 1 - - -- return x*, f(x) before optimization - return x,{fx} -end |
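For reference, a minimal usage sketch of the plain SGD routine above, assuming a Torch7 environment; the quadratic objective, the tensor size and the loop length are illustrative placeholders, not part of the module:

   require 'optim'

   -- toy objective: f(x) = 0.5*||x||^2, so df/dx = x
   local function feval(x)
      return 0.5 * x:dot(x), x:clone()
   end

   local x = torch.randn(10)
   -- Nesterov momentum requires momentum > 0 and dampening == 0 (see the assert above)
   local config = {learningRate = 1e-2, momentum = 0.9, dampening = 0, nesterov = true}

   for i = 1, 100 do
      local _, fs = optim.sgd(feval, x, config)   -- fs[1] is f(x) before this update
   end

Because `state` defaults to `config` when omitted, the evaluation counter and the momentum buffer are kept inside `config` between calls.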