
[Feature] Add torch-optim contrib package

tags/1.7.0
Vsevolod Stakhov 6 years ago
parent
commit
478687cd46

CMakeLists.txt (+1, -0)

@@ -1259,6 +1259,7 @@ IF(ENABLE_TORCH MATCHES "ON")
ADD_SUBDIRECTORY(contrib/torch/paths)
ADD_SUBDIRECTORY(contrib/torch/torch7)
ADD_SUBDIRECTORY(contrib/torch/nn)
ADD_SUBDIRECTORY(contrib/torch/optim)
ADD_SUBDIRECTORY(contrib/torch/decisiontree)
SET(WITH_TORCH 1)
ELSE()

contrib/torch/optim/CMakeLists.txt (+5, -0)

@@ -0,0 +1,5 @@

CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
SET(src)
FILE(GLOB luasrc *.lua)
ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}")

contrib/torch/optim/COPYRIGHT.txt (+35, -0)

@@ -0,0 +1,35 @@
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the names of NEC Laboratories American and IDIAP Research
Institute nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior
written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

contrib/torch/optim/ConfusionMatrix.lua (+361, -0)

@@ -0,0 +1,361 @@
--[[ A Confusion Matrix class

Example:

conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix
conf:zero() -- reset matrix
for i = 1,N do
conf:add( neuralnet:forward(sample), label ) -- accumulate errors
end
print(conf) -- print matrix
image.display(conf:render()) -- render matrix
]]
local ConfusionMatrix = torch.class('optim.ConfusionMatrix')

function ConfusionMatrix:__init(nclasses, classes)
if type(nclasses) == 'table' then
classes = nclasses
nclasses = #classes
end
self.mat = torch.LongTensor(nclasses,nclasses):zero()
self.valids = torch.FloatTensor(nclasses):zero()
self.unionvalids = torch.FloatTensor(nclasses):zero()
self.nclasses = nclasses
self.totalValid = 0
self.averageValid = 0
self.classes = classes or {}
-- buffers
self._mat_flat = self.mat:view(-1)
self._target = torch.FloatTensor()
self._prediction = torch.FloatTensor()
self._max = torch.FloatTensor()
self._pred_idx = torch.LongTensor()
self._targ_idx = torch.LongTensor()
end

-- takes scalar prediction and target as input
function ConfusionMatrix:_add(p, t)
assert(p and type(p) == 'number')
assert(t and type(t) == 'number')
-- non-positive values are considered missing
-- and therefore ignored
if t > 0 then
self.mat[t][p] = self.mat[t][p] + 1
end
end

function ConfusionMatrix:add(prediction, target)
if type(prediction) == 'number' then
-- comparing numbers
self:_add(prediction, target)
else
self._prediction:resize(prediction:size()):copy(prediction)
assert(prediction:dim() == 1)
if type(target) == 'number' then
-- prediction is a vector, so the target is assumed to be a class index
self._max:max(self._pred_idx, self._prediction, 1)
self:_add(self._pred_idx[1], target)
else
-- both prediction and target are vectors
assert(target:dim() == 1)
self._target:resize(target:size()):copy(target)
self._max:max(self._targ_idx, self._target, 1)
self._max:max(self._pred_idx, self._prediction, 1)
self:_add(self._pred_idx[1], self._targ_idx[1])
end
end
end

function ConfusionMatrix:batchAdd(predictions, targets)
local preds, targs, __
self._prediction:resize(predictions:size()):copy(predictions)
if predictions:dim() == 1 then
-- predictions is a vector of classes
preds = self._prediction
elseif predictions:dim() == 2 then
-- prediction is a matrix of class likelihoods
if predictions:size(2) == 1 then
-- or prediction just needs flattening
preds = self._prediction:select(2,1)
else
self._max:max(self._pred_idx, self._prediction, 2)
preds = self._pred_idx:select(2,1)
end
else
error("predictions has invalid number of dimensions")
end

self._target:resize(targets:size()):copy(targets)
if targets:dim() == 1 then
-- targets is a vector of classes
targs = self._target
elseif targets:dim() == 2 then
-- targets is a matrix of one-hot rows
if targets:size(2) == 1 then
-- or targets just needs flattening
targs = self._target:select(2,1)
else
self._max:max(self._targ_idx, self._target, 2)
targs = self._targ_idx:select(2,1)
end
else
error("targets has invalid number of dimensions")
end

-- non-positive values are considered missing and therefore ignored
local mask = targs:ge(1)
targs = targs[mask]
preds = preds[mask]

self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility

preds = preds:typeAs(targs)

assert(self.mat:isContiguous() and self.mat:stride(2) == 1)
local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat)
local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1))
self._mat_flat:indexAdd(1, indices, ones)
end

function ConfusionMatrix:zero()
self.mat:zero()
self.valids:zero()
self.unionvalids:zero()
self.totalValid = 0
self.averageValid = 0
end

local function isNaN(number)
return number ~= number
end

function ConfusionMatrix:updateValids()
local total = 0
for t = 1,self.nclasses do
self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum()
self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t])
total = total + self.mat[t][t]
end
self.totalValid = total / self.mat:sum()
self.averageValid = 0
self.averageUnionValid = 0
local nvalids = 0
local nunionvalids = 0
for t = 1,self.nclasses do
if not isNaN(self.valids[t]) then
self.averageValid = self.averageValid + self.valids[t]
nvalids = nvalids + 1
end
if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then
self.averageUnionValid = self.averageUnionValid + self.unionvalids[t]
nunionvalids = nunionvalids + 1
end
end
self.averageValid = self.averageValid / nvalids
self.averageUnionValid = self.averageUnionValid / nunionvalids
end

-- Calculating FAR/FRR associated with the confusion matrix

function ConfusionMatrix:farFrr()
local cmat = self.mat
local noOfClasses = cmat:size()[1]
self._frrs = self._frrs or torch.zeros(noOfClasses)
self._frrs:zero()
self._classFrrs = self._classFrrs or torch.zeros(noOfClasses)
self._classFrrs:zero()
self._classFrrs:add(-1)
self._fars = self._fars or torch.zeros(noOfClasses)
self._fars:zero()
self._classFars = self._classFars or torch.zeros(noOfClasses)
self._classFars:zero()
self._classFars:add(-1)
local classSamplesCount = cmat:sum(2)
local indx = 1
for i=1,noOfClasses do
if classSamplesCount[i][1] ~= 0 then
self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1]
self._classFrrs[i] = self._frrs[indx]
-- Calculating FARs
local farNumerator = 0
local farDenominator = 0
for j=1, noOfClasses do
if i ~= j then
if classSamplesCount[j][1] ~= 0 then
farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1]
farDenominator = farDenominator + 1
end
end
end
self._fars[indx] = farNumerator/farDenominator
self._classFars[i] = self._fars[indx]
indx = indx + 1
end
end
indx = indx - 1
local returnFrrs = self._frrs[{{1, indx}}]
local returnFars = self._fars[{{1, indx}}]
return self._classFrrs, self._classFars, returnFrrs, returnFars
end

local function log10(n)
if math.log10 then
return math.log10(n)
else
return math.log(n) / math.log(10)
end
end

function ConfusionMatrix:__tostring__()
self:updateValids()
local str = {'ConfusionMatrix:\n'}
local nclasses = self.nclasses
table.insert(str, '[')
local maxCnt = self.mat:max()
local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt)))
for t = 1,nclasses do
local pclass = self.valids[t] * 100
pclass = string.format('%2.3f', pclass)
if t == 1 then
table.insert(str, '[')
else
table.insert(str, ' [')
end
for p = 1,nclasses do
table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p]))
end
if self.classes and self.classes[1] then
if t == nclasses then
table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
else
table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
end
else
if t == nclasses then
table.insert(str, ']] ' .. pclass .. '% \n')
else
table.insert(str, '] ' .. pclass .. '% \n')
end
end
end
table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n')
table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n')
table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%')
return table.concat(str)
end

function ConfusionMatrix:render(sortmode, display, block, legendwidth)
-- args
local confusion = self.mat:double()
local classes = self.classes
local sortmode = sortmode or 'score' -- 'score' or 'occurrence'
local block = block or 25
local legendwidth = legendwidth or 200
local display = display or false

-- legends
local legend = {
['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]',
['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]'
}

-- parse matrix / normalize / count scores
local diag = torch.FloatTensor(#classes)
local freqs = torch.FloatTensor(#classes)
local unconf = confusion
local confusion = confusion:clone()
local corrects = 0
local total = 0
for target = 1,#classes do
freqs[target] = confusion[target]:sum()
corrects = corrects + confusion[target][target]
total = total + freqs[target]
confusion[target]:div( math.max(confusion[target]:sum(),1) )
diag[target] = confusion[target][target]
end

-- accuracies
local accuracy = corrects / total * 100
local perclass = 0
local total = 0
for target = 1,#classes do
if confusion[target]:sum() > 0 then
perclass = perclass + diag[target]
total = total + 1
end
end
perclass = perclass / total * 100
freqs:div(unconf:sum())

-- sort matrix
if sortmode == 'score' then
_,order = torch.sort(diag,1,true)
elseif sortmode == 'occurrence' then
_,order = torch.sort(freqs,1,true)
else
error('sort mode must be one of: score | occurrence')
end

-- render matrix
local render = torch.zeros(#classes*block, #classes*block)
for target = 1,#classes do
for prediction = 1,#classes do
render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]]
end
end

-- add grid
for target = 1,#classes do
render[{ {target*block},{} }] = 0.1
render[{ {},{target*block} }] = 0.1
end

-- create rendering
require 'image'
require 'qtwidget'
require 'qttorch'
local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] )
image.display{image=render, win=win1}

-- add legend
for i in ipairs(classes) do
-- background cell
win1:setcolor{r=0,g=0,b=0}
win1:rectangle((#render)[2],(i-1)*block,legendwidth,block)
win1:fill()

-- %
win1:setfont(qt.QFont{serif=false, size=fontsize})
local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4
win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2}
win1:moveto((#render)[2]+10,i*block-block/3)
win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100))

-- legend
win1:setfont(qt.QFont{serif=false, size=fontsize})
local gscale = diag[order[i]]*0.8+0.2
win1:setcolor{r=gscale,g=gscale,b=gscale}
win1:moveto(120+(#render)[2]+10,i*block-block/3)
win1:show(classes[order[i]])

for j in ipairs(classes) do
-- scores
local score = confusion[order[j]][order[i]]
local gscale = (1-score)*(score*0.8+0.2)
win1:setcolor{r=gscale,g=gscale,b=gscale}
win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3)
win1:show(string.format('%02.0f',math.floor(score*100+0.5)))
end
end

-- generate tensor
local t = win1:image():toTensor()

-- display
if display then
image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)}
end

-- return rendering
return t
end

contrib/torch/optim/Logger.lua (+190, -0)

@@ -0,0 +1,190 @@
--[[ Logger: a simple class to log symbols during training,
and automate plot generation

Example:
logger = optim.Logger('somefile.log') -- file to save stuff

for i = 1,N do -- log some symbols during
train_error = ... -- training/testing
test_error = ...
logger:add{['training error'] = train_error,
['test error'] = test_error}
end

logger:style{['training error'] = '-', -- define styles for plots
['test error'] = '-'}
logger:plot() -- and plot

---- OR ---

logger = optim.Logger('somefile.log') -- file to save stuff
logger:setNames{'training error', 'test error'}

for i = 1,N do -- log some symbols during
train_error = ... -- training/testing
test_error = ...
logger:add{train_error, test_error}
end

logger:style{'-', '-'} -- define styles for plots
logger:plot() -- and plot

-----------

logger:setlogscale(true) -- enable logscale on Y-axis
logger:plot() -- and plot
]]
require 'xlua'
local Logger = torch.class('optim.Logger')

function Logger:__init(filename, timestamp)
if filename then
self.name = filename
os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"')
if timestamp then
-- append timestamp to create unique log file
filename = filename .. '-'..os.date("%Y_%m_%d_%X")
end
self.file = io.open(filename,'w')
self.epsfile = self.name .. '.eps'
else
self.file = io.stdout
self.name = 'stdout'
print('<Logger> warning: no path provided, logging to std out')
end
self.empty = true
self.symbols = {}
self.styles = {}
self.names = {}
self.idx = {}
self.figure = nil
self.showPlot = true
self.plotRawCmd = nil
self.defaultStyle = '+'
self.logscale = false
end

function Logger:setNames(names)
self.names = names
self.empty = false
self.nsymbols = #names
for k,key in pairs(names) do
self.file:write(key .. '\t')
self.symbols[k] = {}
self.styles[k] = {self.defaultStyle}
self.idx[key] = k
end
self.file:write('\n')
self.file:flush()
return self
end

function Logger:add(symbols)
-- (1) first time ? print symbols' names on first row
if self.empty then
self.empty = false
self.nsymbols = #symbols
for k,val in pairs(symbols) do
self.file:write(k .. '\t')
self.symbols[k] = {}
self.styles[k] = {self.defaultStyle}
self.names[k] = k
end
self.idx = self.names
self.file:write('\n')
end
-- (2) print all symbols on one row
for k,val in pairs(symbols) do
if type(val) == 'number' then
self.file:write(string.format('%11.4e',val) .. '\t')
elseif type(val) == 'string' then
self.file:write(val .. '\t')
else
xlua.error('can only log numbers and strings', 'Logger')
end
end
self.file:write('\n')
self.file:flush()
-- (3) save symbols in internal table
for k,val in pairs(symbols) do
table.insert(self.symbols[k], val)
end
end

function Logger:style(symbols)
for name,style in pairs(symbols) do
if type(style) == 'string' then
self.styles[name] = {style}
elseif type(style) == 'table' then
self.styles[name] = style
else
xlua.error('style should be a string or a table of strings','Logger')
end
end
return self
end

function Logger:setlogscale(state)
self.logscale = state
end

function Logger:display(state)
self.showPlot = state
end

function Logger:plot(...)
if not xlua.require('gnuplot') then
if not self.warned then
print('<Logger> warning: cannot plot with this version of Torch')
self.warned = true
end
return
end
local plotit = false
local plots = {}
local plotsymbol =
function(name,list)
if #list > 1 then
local nelts = #list
local plot_y = torch.Tensor(nelts)
for i = 1,nelts do
plot_y[i] = list[i]
end
for _,style in ipairs(self.styles[name]) do
table.insert(plots, {self.names[name], plot_y, style})
end
plotit = true
end
end
local args = {...}
if not args[1] then -- plot all symbols
for name,list in pairs(self.symbols) do
plotsymbol(name,list)
end
else -- plot given symbols
for _,name in ipairs(args) do
plotsymbol(self.idx[name], self.symbols[self.idx[name]])
end
end
if plotit then
if self.showPlot then
self.figure = gnuplot.figure(self.figure)
if self.logscale then gnuplot.logscale('on') end
gnuplot.plot(plots)
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
gnuplot.grid('on')
gnuplot.title('<Logger::' .. self.name .. '>')
end
if self.epsfile then
os.execute('rm -f "' .. self.epsfile .. '"')
local epsfig = gnuplot.epsfigure(self.epsfile)
if self.logscale then gnuplot.logscale('on') end
gnuplot.plot(plots)
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
gnuplot.grid('on')
gnuplot.title('<Logger::' .. self.name .. '>')
gnuplot.plotflush()
gnuplot.close(epsfig)
end
end
end

contrib/torch/optim/adadelta.lua (+55, -0)

@@ -0,0 +1,55 @@
--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701

ARGS:
- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `config` : a table of hyper-parameters
- `config.rho` : interpolation parameter
- `config.eps` : for numerical stability
- `config.weightDecay` : weight decay
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.paramVariance` : vector of temporal variances of parameters
- `state.accDelta` : vector of accumulated deltas of gradients
RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update
]]
function optim.adadelta(opfunc, x, config, state)
-- (0) get/update state
if config == nil and state == nil then
print('no state table, ADADELTA initializing')
end
local config = config or {}
local state = state or config
local rho = config.rho or 0.9
local eps = config.eps or 1e-6
local wd = config.weightDecay or 0
state.evalCounter = state.evalCounter or 0
-- (1) evaluate f(x) and df/dx
local fx,dfdx = opfunc(x)

-- (2) weight decay
if wd ~= 0 then
dfdx:add(wd, x)
end

-- (3) parameter update
if not state.paramVariance then
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
end
state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx)
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt()
state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx)
x:add(-1, state.delta)
state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta)
-- (4) update evaluation counter
state.evalCounter = state.evalCounter + 1

-- return x*, f(x) before optimization
return x,{fx}
end
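
A minimal usage sketch (not part of the committed file): the quadratic objective and the
loop length below are illustrative; the config values are just the documented defaults.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(10)
local state = {rho = 0.9, eps = 1e-6}    -- the same table serves as config and state
for i = 1, 200 do
   optim.adadelta(opfunc, x, state)      -- updates x in place
end
print(0.5 * x:dot(x))                    -- should have decreased from its starting value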

contrib/torch/optim/adagrad.lua (+55, -0)

@@ -0,0 +1,55 @@
--[[ ADAGRAD implementation for SGD

ARGS:
- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.learningRate` : learning rate
- `state.paramVariance` : vector of temporal variances of parameters
- `state.weightDecay` : scalar that controls weight decay
RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

]]
function optim.adagrad(opfunc, x, config, state)
-- (0) get/update state
if config == nil and state == nil then
print('no state table, ADAGRAD initializing')
end
local config = config or {}
local state = state or config
local lr = config.learningRate or 1e-3
local lrd = config.learningRateDecay or 0
local wd = config.weightDecay or 0
state.evalCounter = state.evalCounter or 0
local nevals = state.evalCounter

-- (1) evaluate f(x) and df/dx
local fx,dfdx = opfunc(x)

-- (2) weight decay with a single parameter
if wd ~= 0 then
dfdx:add(wd, x)
end

-- (3) learning rate decay (annealing)
local clr = lr / (1 + nevals*lrd)

-- (4) parameter update with single or individual learning rates
if not state.paramVariance then
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx)
end
state.paramVariance:addcmul(1,dfdx,dfdx)
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt()
x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10))

-- (5) update evaluation counter
state.evalCounter = state.evalCounter + 1

-- return x*, f(x) before optimization
return x,{fx}
end
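
A usage sketch on the same kind of illustrative quadratic; only the learning rate is set
here, and the accumulated gradient variance is kept in the state table between calls.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(10)
local state = {learningRate = 0.1}
for i = 1, 100 do
   optim.adagrad(opfunc, x, state)       -- per-dimension step sizes shrink as variance grows
end
print(state.evalCounter, 0.5 * x:dot(x))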

contrib/torch/optim/adam.lua (+72, -0)

@@ -0,0 +1,72 @@
--[[ An implementation of Adam https://arxiv.org/abs/1412.6980

ARGS:

- 'opfunc' : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point
- 'config' : a table with configuration parameters for the optimizer
- 'config.learningRate' : learning rate
- `config.learningRateDecay` : learning rate decay
- 'config.beta1' : first moment coefficient
- 'config.beta2' : second moment coefficient
- 'config.epsilon' : for numerical stability
- 'config.weightDecay' : weight decay
- 'state' : a table describing the state of the optimizer; after each
call the state is modified

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

]]

function optim.adam(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
local lr = config.learningRate or 0.001
local lrd = config.learningRateDecay or 0

local beta1 = config.beta1 or 0.9
local beta2 = config.beta2 or 0.999
local epsilon = config.epsilon or 1e-8
local wd = config.weightDecay or 0

-- (1) evaluate f(x) and df/dx
local fx, dfdx = opfunc(x)

-- (2) weight decay
if wd ~= 0 then
dfdx:add(wd, x)
end

-- Initialization
state.t = state.t or 0
-- Exponential moving average of gradient values
state.m = state.m or x.new(dfdx:size()):zero()
-- Exponential moving average of squared gradient values
state.v = state.v or x.new(dfdx:size()):zero()
-- A tmp tensor to hold the sqrt(v) + epsilon
state.denom = state.denom or x.new(dfdx:size()):zero()

-- (3) learning rate decay (annealing)
local clr = lr / (1 + state.t*lrd)

state.t = state.t + 1

-- Decay the first and second moment running average coefficient
state.m:mul(beta1):add(1-beta1, dfdx)
state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)

state.denom:copy(state.v):sqrt():add(epsilon)

local biasCorrection1 = 1 - beta1^state.t
local biasCorrection2 = 1 - beta2^state.t
local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
-- (4) update x
x:addcdiv(-stepSize, state.m, state.denom)

-- return x*, f(x) before optimization
return x, {fx}
end
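
A usage sketch with an illustrative quadratic objective; the hyper-parameters shown are the
documented defaults and could be omitted.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(10)
local state = {learningRate = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8}
for i = 1, 500 do
   optim.adam(opfunc, x, state)          -- state.t counts the updates
end
print(state.t, 0.5 * x:dot(x))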

contrib/torch/optim/adamax.lua (+66, -0)

@@ -0,0 +1,66 @@
--[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf

ARGS:

- 'opfunc' : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point
- 'config' : a table with configuration parameters for the optimizer
- 'config.learningRate' : learning rate
- 'config.beta1' : first moment coefficient
- 'config.beta2' : second moment coefficient
- 'config.epsilon' : for numerical stability
- 'state' : a table describing the state of the optimizer;
after each call the state is modified.

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

]]

function optim.adamax(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
local lr = config.learningRate or 0.002

local beta1 = config.beta1 or 0.9
local beta2 = config.beta2 or 0.999
local epsilon = config.epsilon or 1e-38
local wd = config.weightDecay or 0

-- (1) evaluate f(x) and df/dx
local fx, dfdx = opfunc(x)

-- (2) weight decay
if wd ~= 0 then
dfdx:add(wd, x)
end

-- Initialization
state.t = state.t or 0
-- Exponential moving average of gradient values
state.m = state.m or x.new(dfdx:size()):zero()
-- Exponential moving average of the infinity norm
state.u = state.u or x.new(dfdx:size()):zero()
-- A tmp tensor to hold the input to max()
state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()

state.t = state.t + 1

-- Update biased first moment estimate.
state.m:mul(beta1):add(1-beta1, dfdx)
-- Update the exponentially weighted infinity norm.
state.max[1]:copy(state.u):mul(beta2)
state.max[2]:copy(dfdx):abs():add(epsilon)
state.u:max(state.max, 1)

local biasCorrection1 = 1 - beta1^state.t
local stepSize = lr/biasCorrection1
-- (2) update x
x:addcdiv(-stepSize, state.m, state.u)

-- return x*, f(x) before optimization
return x, {fx}
end
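
Same calling convention as optim.adam above; only the infinity-norm second-moment estimate
differs, so the sketch (again on an illustrative quadratic) is nearly identical.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(10)
local state = {learningRate = 0.002}
for i = 1, 500 do
   optim.adamax(opfunc, x, state)
end
print(0.5 * x:dot(x))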

contrib/torch/optim/asgd.lua (+73, -0)

@@ -0,0 +1,73 @@
--[[ An implementation of ASGD

ASGD:

x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
a := a + mu_t [ x - a ]

eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
mu_t = 1/max(1,t-t0)

implements the ASGD algorithm as in L. Bottou's sgd-2.0

ARGS:

- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.eta0` : learning rate
- `state.lambda` : decay term
- `state.alpha` : power for eta update
- `state.t0` : point at which to start averaging

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update
- `ax` : the averaged x vector

(Clement Farabet, 2012)
--]]
function optim.asgd(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
config.eta0 = config.eta0 or 1e-4
config.lambda = config.lambda or 1e-4
config.alpha = config.alpha or 0.75
config.t0 = config.t0 or 1e6

-- (hidden state)
state.eta_t = state.eta_t or config.eta0
state.mu_t = state.mu_t or 1
state.t = state.t or 0

-- (1) evaluate f(x) and df/dx
local fx,dfdx = opfunc(x)

-- (2) decay term
x:mul(1 - config.lambda*state.eta_t)

-- (3) update x
x:add(-state.eta_t, dfdx)

-- (4) averaging
state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero()
state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax)
if state.mu_t ~= 1 then
state.tmp:copy(x)
state.tmp:add(-1,state.ax):mul(state.mu_t)
state.ax:add(state.tmp)
else
state.ax:copy(x)
end

-- (5) update eta_t and mu_t
state.t = state.t + 1
state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha)
state.mu_t = 1 / math.max(1, state.t - config.t0)

-- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...)
return x,{fx},state.ax
end
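
A usage sketch on an illustrative quadratic; the point of ASGD is the averaged iterate,
which ends up in state.ax (and is also returned as the third value).

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(10)
local state = {eta0 = 0.1, lambda = 1e-4, t0 = 50}
for i = 1, 200 do
   optim.asgd(opfunc, x, state)
end
print(0.5 * state.ax:dot(state.ax))      -- loss at the averaged iterate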

contrib/torch/optim/cg.lua (+208, -0)

@@ -0,0 +1,208 @@
--[[

This cg implementation is a rewrite of minimize.m written by Carl
E. Rasmussen. It is supposed to produce exactly the same results (give
or take numerical accuracy due to some changed order of
operations). You can compare the result on rosenbrock with minimize.m:
http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html

[x fx c] = minimize([0 0]', 'rosenbrock', -25)

Note that we limit only the number of function evaluations, as that seems
much more important in practical use.

ARGS:

- `opfunc` : a function that takes a single input, the point of evaluation.
- `x` : the initial point
- `state` : a table of parameters and temporary allocations.
- `state.maxEval` : max number of function evaluations
- `state.maxIter` : max number of iterations
- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage
- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage

RETURN:

- `x*` : the new x vector, at the optimal point
- `f` : a table of all function values where
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at x*

(Koray Kavukcuoglu, 2012)
--]]
function optim.cg(opfunc, x, config, state)
-- parameters
local config = config or {}
local state = state or config
local rho = config.rho or 0.01
local sig = config.sig or 0.5
local int = config.int or 0.1
local ext = config.ext or 3.0
local maxIter = config.maxIter or 20
local ratio = config.ratio or 100
local maxEval = config.maxEval or maxIter*1.25
local red = 1

local verbose = config.verbose or 0

local i = 0
local ls_failed = 0
local fx = {}

-- we need three points for the interpolation/extrapolation stuff
local z1,z2,z3 = 0,0,0
local d1,d2,d3 = 0,0,0
local f1,f2,f3 = 0,0,0

local df1 = state.df1 or x.new()
local df2 = state.df2 or x.new()
local df3 = state.df3 or x.new()
local tdf

df1:resizeAs(x)
df2:resizeAs(x)
df3:resizeAs(x)

-- search direction
local s = state.s or x.new()
s:resizeAs(x)

-- we need a temp storage for X
local x0 = state.x0 or x.new()
local f0 = 0
local df0 = state.df0 or x.new()
x0:resizeAs(x)
df0:resizeAs(x)

-- evaluate at initial point
f1,tdf = opfunc(x)
fx[#fx+1] = f1
df1:copy(tdf)
i=i+1

-- initial search direction
s:copy(df1):mul(-1)

d1 = -s:dot(s ) -- slope
z1 = red/(1-d1) -- initial step

while i < math.abs(maxEval) do

x0:copy(x)
f0 = f1
df0:copy(df1)

x:add(z1,s)
f2,tdf = opfunc(x)
df2:copy(tdf)
i=i+1
d2 = df2:dot(s)
f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1
local m = math.min(maxIter,maxEval-i)
local success = 0
local limit = -1

while true do
while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
limit = z1
if f2 > f1 then
z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
else
local A = 6*(f2-f3)/z3+3*(d2+d3)
local B = 3*(f3-f2)-z3*(d3+2*d2)
z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
end
if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
z2 = z3/2;
end
z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
z1 = z1 + z2;
x:add(z2,s)
f2,tdf = opfunc(x)
df2:copy(tdf)
i=i+1
m = m - 1
d2 = df2:dot(s)
z3 = z3-z2;
end
if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
break
elseif d2 > sig*d1 then
success = 1;
break;
elseif m == 0 then
break;
end
local A = 6*(f2-f3)/z3+3*(d2+d3);
local B = 3*(f3-f2)-z3*(d3+2*d2);
z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))

if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
if limit < -0.5 then
z2 = z1 * (ext -1)
else
z2 = (limit-z1)/2
end
elseif (limit > -0.5) and (z2+z1) > limit then
z2 = (limit-z1)/2
elseif limit < -0.5 and (z2+z1) > z1*ext then
z2 = z1*(ext-1)
elseif z2 < -z3*int then
z2 = -z3*int
elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
z2 = (limit-z1)*(1-int)
end
f3=f2; d3=d2; z3=-z2;
z1 = z1+z2;
x:add(z2,s)

f2,tdf = opfunc(x)
df2:copy(tdf)
i=i+1
m = m - 1
d2 = df2:dot(s)
end
if success == 1 then
f1 = f2
fx[#fx+1] = f1;
local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
s:mul(ss)
s:add(-1,df2)
local tmp = df1:clone()
df1:copy(df2)
df2:copy(tmp)
d2 = df1:dot(s)
if d2> 0 then
s:copy(df1)
s:mul(-1)
d2 = -s:dot(s)
end

z1 = z1 * math.min(ratio, d1/(d2-1e-320))
d1 = d2
ls_failed = 0
else
x:copy(x0)
f1 = f0
df1:copy(df0)
if ls_failed or i>maxEval then
break
end
local tmp = df1:clone()
df1:copy(df2)
df2:copy(tmp)
s:copy(df1)
s:mul(-1)
d1 = -s:dot(s)
z1 = 1/(1-d1)
ls_failed = 1
end
end
state.df0 = df0
state.df1 = df1
state.df2 = df2
state.df3 = df3
state.x0 = x0
state.s = s
return x,fx,i
end
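
A usage sketch on an illustrative quadratic; opfunc must return both f(X) and df/dX as a
flat tensor, and the returned table fx holds every recorded function value.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x - 1||^2, df/dx = x - 1
   local d = x - torch.ones(x:size(1))
   return 0.5 * d:dot(d), d
end
local x = torch.zeros(4)
local x_star, fx, neval = optim.cg(opfunc, x, {maxIter = 10})
print(fx[1], fx[#fx], neval)             -- f before, f after, and the evaluation count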

contrib/torch/optim/checkgrad.lua (+52, -0)

@@ -0,0 +1,52 @@
--[[ An implementation of a simple numerical gradient checker.

ARGS:

- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `eps` : the epsilon to use for the numerical check (default is 1e-7)

RETURN:

- `diff` : relative error in the gradient; should be close to zero
- `dC` : exact gradient at the point
- `dC_est` : numerically estimated gradient at the point

]]--


-- numerically check the gradient of opfunc around the given point:
function optim.checkgrad(opfunc, x, eps)

-- compute true gradient:
local Corg,dC = opfunc(x)
dC:resize(x:size())

local Ctmp -- temporary value
local isTensor = torch.isTensor(Corg)
if isTensor then
Ctmp = Corg.new(Corg:size())
end

-- compute numeric approximations to gradient:
local eps = eps or 1e-7
local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC)
for i = 1,dC:size(1) do
local tmp = x[i]
x[i] = x[i] + eps
local C1 = opfunc(x)
if isTensor then
Ctmp:copy(C1)
C1 = Ctmp
end
x[i] = x[i] - 2 * eps
local C2 = opfunc(x)
x[i] = tmp
dC_est[i] = (C1 - C2) / (2 * eps)
end

-- estimate error of gradient:
local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est)
return diff,dC,dC_est
end
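
A usage sketch: the closed-form gradient of the illustrative quadratic below is x itself,
so the returned relative error should be very small.

require 'torch'
require 'optim'
local function opfunc(x)                 -- f(x) = 0.5*||x||^2, df/dx = x
   return 0.5 * x:dot(x), x:clone()
end
local x = torch.randn(5)
local diff, dC, dC_est = optim.checkgrad(opfunc, x, 1e-7)
print(diff)                              -- relative error; should be close to zero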

contrib/torch/optim/cmaes.lua (+270, -0)

@@ -0,0 +1,270 @@
require 'torch'
require 'math'

local BestSolution = {}
--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
ported from https://www.lri.fr/~hansen/barecmaes2.html.

Parameters
----------
ARGS:

- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX. Note that df/dX is not used
- `x` : the initial point
- `state.sigma`
float, initial step-size (standard deviation in each
coordinate)
- `state.maxEval`
int, maximal number of function evaluations
- `state.ftarget`
float, target function value; stop if fitness < ftarget
- `state.popsize`
population size. If this is left empty,
4 + int(3 * log(|x|)) will be used
- `state.verb_disp`
int, display on console every verb_disp iteration, 0 for never

RETURN:
- `x*` : the new `x` vector, at the optimal point
- `f` : a table of all function values:
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at `x*`
--]]
function optim.cmaes(opfunc, x, config, state)
if (x.triu == nil or x.diag == nil) then
error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x")
end
-- process input parameters
local config = config or {}
local state = state or config
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
local N = xmean:size(1) -- number of objective variables/problem dimension
local sigma = state.sigma -- coordinate wise standard deviation (step size)
local ftarget = state.ftarget -- stop if fitness < ftarget
local maxEval = tonumber(state.maxEval) or 1e3*N^2
local objfunc = opfunc
local verb_disp = state.verb_disp -- display step size
local min_iterations = state.min_iterations or 1

local lambda = state.popsize -- population size, offspring number
-- Strategy parameter setting: Selection
if state.popsize == nil then
lambda = 4 + math.floor(3 * math.log(N))
end

local mu = lambda / 2 -- number of parents/points for recombination
local weights = torch.range(0,mu-1):apply(function(i)
return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
weights:div(weights:sum()) -- normalize recombination weights array
local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
weights = weights:typeAs(x)

-- Strategy parameter setting: Adaptation
local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C
local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control
local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C
local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1

-- Initialize dynamic (internal) state variables
local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
local C = torch.eye(N):typeAs(x) -- covariance matrix
if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
error('torch.symeig not available for ' .. x:type() ..
" please use Float- or DoubleTensor for x")
end
local candidates = torch.Tensor(lambda,N):typeAs(x)
local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
local eigeneval = 0 -- tracking the update of B and D
local counteval = 0
local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
local fitvals = torch.Tensor(lambda)-- fitness values
local best = BestSolution.new(nil,nil,counteval)
local iteration = 0 -- iteration of the optimize loop


local function ask()
--[[return a list of lambda candidate solutions according to
m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
--]]
-- Eigendecomposition: first update B, D and invsqrtC from C
-- postponed so that the amortized cost stays O(N^2)
if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then
eigeneval = counteval
C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry
D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3)
D = torch.sqrt(D) -- D contains standard deviations now
invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t())
end
for k=1,lambda do --repeat lambda times
local z = D:clone():normal(0,1):cmul(D)
candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma)
end

return candidates
end


local function tell(arx)
--[[update the evolution paths and the distribution parameters m,
sigma, and C within CMA-ES.

Parameters
----------
`arx`
a list of solutions, presumably from `ask()`
`fitvals`
the corresponding objective function values --]]
-- bookkeeping, preparation
counteval = counteval + lambda -- slightly artificial to do here
local xold = xmean:clone()

-- Sort by fitness and compute weighted mean into xmean
local arindex = nil --sorted indices
fitvals, arindex = torch.sort(fitvals)
arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions

table.insert(f_hist, fitvals[1]) --append best fitness to history
best:update(arx[1], fitvals[1], counteval)

xmean:zero()
xmean:addmv(arx:t(), weights) --dot product

-- Cumulation: update evolution paths
local y = xmean - xold
local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold)

local c = (cs * (2-cs) * mueff)^0.5 / sigma
ps = ps - ps * cs + z * c -- exponential decay on ps
local hsig = (torch.sum(torch.pow(ps,2)) /
(1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
hsig = hsig and 1.0 or 0.0 --use binary numbers

c = (cc * (2-cc) * mueff)^0.5 / sigma
pc = pc - pc * cc + y * c * hsig -- exponential decay on pc

-- Adapt covariance matrix C
local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc)
-- for a minor adjustment to the variance loss by hsig
for i=1,N do
for j=1,N do
local r = torch.range(1,mu)
r:apply(function(k)
return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
c1 * pc[i]*pc[j] + cmu * Cmuij)
end
end

-- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
sigma = sigma * math.exp(math.min(0.6,
(cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
end

local function stop()
--[[return satisfied termination conditions in a table like
{'termination reason':value, ...}, for example {'tolfun':1e-12},
or the empty table {}--]]
local res = {}
if counteval > 0 then
if counteval >= maxEval then
res['evals'] = maxEval
end
if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then
res['ftarget'] = ftarget
end
if torch.max(D) > 1e7 * torch.min(D) then
res['condition'] = 1e7
end
if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
res['tolfun'] = 1e-12
end
if sigma * torch.max(D) < 1e-11 then
-- remark: max(D) >= max(diag(C))^0.5
res['tolx'] = 1e-11
end
end
return res
end

local function disp(verb_modulo)
--[[display some iteration info--]]
if verb_disp == 0 then
return nil
end
local iteration = counteval / lambda

if iteration == 1 or iteration % (10*verb_modulo) == 0 then
print('evals:\t ax-ratio max(std) f-value')
end
if iteration <= 2 or iteration % verb_modulo == 0 then
local max_std = math.sqrt(torch.max(torch.diag(C)))
print(tostring(counteval).. ': ' ..
string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
.. tostring(fitvals[1]))
end

return nil
end

while next(stop()) == nil or iteration < min_iterations do
iteration = iteration + 1

local X = ask() -- deliver candidate solutions
for i=1, lambda do
-- put candidate tensor back in input shape and evaluate in opfunc
local candidate = X[i]:viewAs(x)
fitvals[i] = objfunc(candidate)
end

tell(X)
disp(verb_disp)
end

local bestmu, f, c = best:get()
if verb_disp > 0 then
for k, v in pairs(stop()) do
print('termination by', k, '=', v)
end
print('best f-value =', f)
print('solution = ')
print(bestmu)
print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration)
end
table.insert(f_hist, f)

return bestmu, f_hist, counteval
end



BestSolution.__index = BestSolution
function BestSolution.new(x, f, evals)
local self = setmetatable({}, BestSolution)
self.x = x
self.f = f
self.evals = evals
return self
end

function BestSolution.update(self, arx, arf, evals)
--[[initialize the best solution with `x`, `f`, and `evals`.
Better solutions have smaller `f`-values.--]]
if self.f == nil or arf < self.f then
self.x = arx:clone()
self.f = arf
self.evals = evals
end
return self
end

function BestSolution.get(self)
return self.x, self.f, self.evals
end
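
A usage sketch; sigma is required, verb_disp = 0 keeps the console quiet, and the sphere
objective and evaluation budget below are illustrative.

require 'torch'
require 'optim'
local function opfunc(x)                 -- only f(x) is used; no gradient is needed
   return x:dot(x)
end
local x = torch.DoubleTensor(3):fill(5)
local cfg = {sigma = 0.5, maxEval = 140, verb_disp = 0}
local x_best, f_hist = optim.cmaes(opfunc, x, cfg)
print(f_hist[1], f_hist[#f_hist])        -- initial and final best fitness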

contrib/torch/optim/de.lua (+109, -0)

@@ -0,0 +1,109 @@
--[[ An implementation of `DE` (Differential Evolution),

ARGS:

-`opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX. Note that df/dX is not used
-`x` : the initial point
-`state.popsize`: population size. If this is left empty, 10*d will be used
-`state.scaleFactor`: float, usually between 0.4 and 1
-`state.crossoverRate`: float, usually between 0.1 and 0.9
-`state.maxFEs`: int, maximal number of function evaluations

RETURN:
- `x*` : the new `x` vector, at the optimal point
- `f` : a table of all function values:
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at `x*`
]]

require 'torch'

function optim.de(opfunc, x, config, state)
-- process input parameters
local config = config or {}
local state = state
local popsize = config.popsize -- population size
local scaleFactor = config.scaleFactor -- scale factor
local crossoverRate = config.crossoverRate -- crossover rate
local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations
local maxRegion = config.maxRegion -- upper bound of search region
local minRegion = config.minRegion -- lower bound of search region
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
local D = xmean:size(1) -- number of objective variables/problem dimension

if config.popsize == nil then
popsize = 10 * D
end
if config.maxRegion == nil then
maxRegion = 30
end
if config.minRegion == nil then
minRegion = -30
end

-- Initialize population
local fx = x.new(maxFEs)
local pop = x.new(popsize, D)
local children = x.new(popsize, D)
local fitness = x.new(popsize)
local children_fitness = x.new(popsize)
local fes = 1 -- number of function evaluations
local best_fitness
local best_solution = x.new(D)

-- Initialize the population and evaluate its fitness values
local gen = torch.Generator()
torch.manualSeed(gen, 1)

pop:uniform(gen, minRegion, maxRegion)
for i = 1, popsize do
fitness[i] = opfunc(pop[i])
fx[fes] = fitness[i]
fes = fes + 1
end

-- Find the best solution
local index
best_fitness, index = fitness:max(1)
best_fitness = best_fitness[1]
index = index[1]
best_solution:copy(pop[index])

-- Main loop
while fes < maxFEs do
local r1, r2
for i = 1, popsize do
repeat
r1 = torch.random(gen, 1, popsize)
until(r1 ~= i)
repeat
r2 = torch.random(gen, 1, popsize)
until(r2 ~= r1 and r2 ~= i)

local jrand = torch.random(gen, 1, D)
for j = 1, D do
if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then
children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j])
else
children[i][j] = pop[i][j]
end
end
children_fitness[i] = opfunc(children[i])
fx[fes] = children_fitness[i]
fes = fes + 1
end

for i = 1, popsize do
if children_fitness[i] <= fitness[i] then
pop[i]:copy(children[i])
fitness[i] = children_fitness[i]
if fitness[i] < best_fitness then
best_fitness = fitness[i]
best_solution:copy(children[i])
end
end
end
end
return best_solution, fx
end
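
A usage sketch; scaleFactor, crossoverRate and maxFEs have no defaults and must be given.
The sphere objective and the budget below are illustrative (with D = 2 the default
population size is 20, so maxFEs is chosen as a multiple of it).

require 'torch'
require 'optim'
local function opfunc(x)                 -- only f(x) is used; no gradient is needed
   return x:dot(x)
end
local x = torch.zeros(2)
local cfg = {scaleFactor = 0.5, crossoverRate = 0.9, maxFEs = 200}
local x_best, fx = optim.de(opfunc, x, cfg)
print(fx[1], fx[fx:nElement()])          -- first and last recorded objective values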

contrib/torch/optim/fista.lua (+192, -0)

@@ -0,0 +1,192 @@
--[[ FISTA with backtracking line search

- `f` : smooth function
- `g` : non-smooth function
- `pl` : minimizer of intermediate problem Q(x,y)
- `xinit` : initial point
- `params` : table of parameters (**optional**)
- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1)
- `params.Lstep` : step size multiplier at each iteration (1.5)
- `params.maxiter` : max number of iterations (50)
- `params.maxline` : max number of line search iterations per iteration (20)
- `params.errthres`: error threshold for the convergence check (1e-4)
- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true)
- `params.verbose` : store each iteration solution and print detailed info (false)

On output, `params` will contain these additional fields that can be reused.

- `params.L` : last used L value will be written.

These are temporary storages needed by the algorithm; if the same params object is
passed a second time, the same storages will be reused without new allocation.

- `params.xkm` : previous iteration point
- `params.y` : fista iteration
- `params.ply` : ply = pl(y - 1/L grad(f))

Returns the solution x and a history of {function evals, number of line searches, ...}

Algorithm is published in

@article{beck-fista-09,
Author = {Beck, Amir and Teboulle, Marc},
Journal = {SIAM J. Img. Sci.},
Number = {1},
Pages = {183--202},
Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
Volume = {2},
Year = {2009}}
]]
function optim.FistaLS(f, g, pl, xinit, params)

local params = params or {}
local L = params.L or 0.1
local Lstep = params.Lstep or 1.5
local maxiter = params.maxiter or 50
local maxline = params.maxline or 20
local errthres = params.errthres or 1e-4
local doFistaUpdate = params.doFistaUpdate
local verbose = params.verbose

-- temporary allocations
params.xkm = params.xkm or torch.Tensor()
params.y = params.y or torch.Tensor()
params.ply = params.ply or torch.Tensor()
local xkm = params.xkm -- previous iteration
local y = params.y -- fista iteration
local ply = params.ply -- soft-shrunk y

-- we start from all zeros
local xk = xinit
xkm:resizeAs(xk):zero()
ply:resizeAs(xk):zero()
y:resizeAs(xk):zero()

local history = {} -- keep track of stuff
local niter = 0 -- number of iterations done
local converged = false -- are we done?
local tk = 1 -- momentum param for FISTA
local tkp = 0


local gy = g(y)
local fval = math.huge -- fval = f+g
while not converged and niter < maxiter do

-- evaluate the smooth function and get its derivatives
local fy,gfy = f(y,'dx')
--local gfy = f(y)

local fply = 0
local gply = 0
local Q = 0

----------------------------------------------
-- do line search to find new current location starting from fista loc
local nline = 0
local linesearchdone = false
while not linesearchdone do
-- take a step in gradient direction of smooth function
ply:copy(y)
ply:add(-1/L,gfy)

-- and solve for minimum of auxiliary problem
pl(ply,L)
-- this is candidate for new current iteration
xk:copy(ply)

-- evaluate this point F(ply)
fply = f(ply)

-- ply - y
ply:add(-1, y)
-- <ply-y , \Grad(f(y))>
local Q2 = gfy:dot(ply)
-- L/2 ||beta-y||^2
local Q3 = L/2 * ply:dot(ply)
-- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta)
Q = fy + Q2 + Q3

if verbose then
print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
end
-- check if F(beta) <= Q(beta, y) for beta = pl(y)
if fply <= Q then --and Fply + Gply <= F then
-- now evaluate G here
linesearchdone = true
elseif nline >= maxline then
linesearchdone = true
xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
--print('oops')
else
L = L * Lstep
end
nline = nline + 1
end
-- end line search
---------------------------------------------

---------------------------------------------
-- FISTA
---------------------------------------------
if doFistaUpdate then
-- do the FISTA step
tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2
-- x(k-1) = x(k-1) - x(k)
xkm:add(-1,xk)
-- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k))
y:copy(xk)
y:add( (1-tk)/tkp , xkm)
-- store for next iterations
-- x(k-1) = x(k)
xkm:copy(xk)
else
y:copy(xk)
end
-- t(k) = t(k+1)
tk = tkp
fply = f(y)
gply = g(y)
if verbose then
print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply))
end

niter = niter + 1

-- bookkeeping
fval = fply + gply
history[niter] = {}
history[niter].nline = nline
history[niter].L = L
history[niter].F = fval
history[niter].Fply = fply
history[niter].Gply = gply
history[niter].Q = Q
params.L = L
if verbose then
history[niter].xk = xk:clone()
history[niter].y = y:clone()
end

-- are we done?
if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then
converged = true
xinit:copy(y)
return y,history
end

if niter >= maxiter then
xinit:copy(y)
return y,history
end

--if niter > 1 and history[niter].F > history[niter-1].F then
--print(niter, 'This was supposed to be a convex function, we are going up')
--converged = true
--return xk,history
--end
end
error('not supposed to be here')
end
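
A usage sketch for an l1-regularized quadratic; the data, the regularization weight and the
soft-thresholding proximal step are illustrative assumptions, not something defined by this
file. doFistaUpdate is set explicitly because the code above only takes the FISTA branch
when the flag is present in params.

require 'torch'
require 'optim'
local b = torch.Tensor({1.0, -2.0, 0.05})
local lam = 0.1
local function f(x)                      -- smooth part: 0.5*||x - b||^2 and its gradient
   local d = x - b
   return 0.5 * d:dot(d), d
end
local function g(x)                      -- non-smooth part: lam*||x||_1
   return lam * torch.abs(x):sum()
end
local function pl(u, L)                  -- prox of g: soft-thresholding by lam/L, in place
   local shrunk = torch.clamp(torch.abs(u) - lam / L, 0, math.huge)
   u:copy(torch.sign(u):cmul(shrunk))
end
local x = torch.zeros(3)
local x_star, history = optim.FistaLS(f, g, pl, x, {maxiter = 100, doFistaUpdate = true})
print(x_star)                            -- the small third coefficient is shrunk towards zero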


contrib/torch/optim/init.lua (+33, -0)

@@ -0,0 +1,33 @@

require 'torch'

optim = {}

-- optimizations
require('optim.sgd')
require('optim.cg')
require('optim.asgd')
require('optim.nag')
require('optim.fista')
require('optim.lbfgs')
require('optim.adagrad')
require('optim.rprop')
require('optim.adam')
require('optim.adamax')
require('optim.rmsprop')
require('optim.adadelta')
require('optim.cmaes')
require('optim.de')

-- line search functions
require('optim.lswolfe')

-- helpers
require('optim.polyinterp')
require('optim.checkgrad')

-- tools
require('optim.ConfusionMatrix')
require('optim.Logger')

return optim
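
Assuming the package has been installed on the Lua path (which is what the
ADD_TORCH_PACKAGE call above is meant to arrange) and the usual Torch dependencies such as
xlua are available, loading it exposes everything listed here under a global optim table:

require 'torch'
require 'optim'                          -- runs this init.lua and pulls in the files above
assert(type(optim.adam) == 'function' and type(optim.lbfgs) == 'function')
local conf = optim.ConfusionMatrix(3)    -- the class-name list argument is optional
print(torch.type(conf))                  -- optim.ConfusionMatrix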

contrib/torch/optim/lbfgs.lua (+268, -0)

@@ -0,0 +1,268 @@
--[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt)

This implementation of L-BFGS relies on a user-provided line
search function (state.lineSearch). If this function is not
provided, then a simple learningRate is used to produce fixed
size steps. Fixed size steps are much less costly than line
searches, and can be useful for stochastic problems.

The learning rate is used even when a line search is provided.
This is also useful for large-scale stochastic problems, where
opfunc is a noisy approximation of f(x). In that case, the learning
rate allows a reduction of confidence in the step size.

ARGS:

- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.maxIter` : Maximum number of iterations allowed
- `state.maxEval` : Maximum number of function evaluations
- `state.tolFun` : Termination tolerance on the first-order optimality
- `state.tolX` : Termination tol on progress in terms of func/param changes
- `state.lineSearch` : A line search function
- `state.learningRate` : If no line search is provided, a fixed step size is used

RETURN:
- `x*` : the new `x` vector, at the optimal point
- `f` : a table of all function values:
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at `x*`

(Clement Farabet, 2012)
]]
function optim.lbfgs(opfunc, x, config, state)
-- get/update state
local config = config or {}
local state = state or config
local maxIter = tonumber(config.maxIter) or 20
local maxEval = tonumber(config.maxEval) or maxIter*1.25
local tolFun = config.tolFun or 1e-5
local tolX = config.tolX or 1e-9
local nCorrection = config.nCorrection or 100
local lineSearch = config.lineSearch
local lineSearchOpts = config.lineSearchOptions
local learningRate = config.learningRate or 1
local isverbose = config.verbose or false

state.funcEval = state.funcEval or 0
state.nIter = state.nIter or 0

-- verbose function
local verbose
if isverbose then
verbose = function(...) print('<optim.lbfgs> ', ...) end
else
verbose = function() end
end

-- import some functions
local abs = math.abs
local min = math.min

-- evaluate initial f(x) and df/dx
local f,g = opfunc(x)
local f_hist = {f}
local currentFuncEval = 1
state.funcEval = state.funcEval + 1
local p = g:size(1)

-- check optimality of initial point
state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1
tmp1:copy(g):abs()
if tmp1:sum() <= tolFun then
-- optimality condition below tolFun
verbose('optimality condition below tolFun')
return x,f_hist
end

if not state.dir_bufs then
-- reusable buffers for y's and s's, and their histories
verbose('creating recyclable direction/step/history buffers')
state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1)
state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1)
for i=1,#state.dir_bufs do
state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1)
state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1)
end
end

-- variables cached in state (for tracing)
local d = state.d
local t = state.t
local old_dirs = state.old_dirs
local old_stps = state.old_stps
local Hdiag = state.Hdiag
local g_old = state.g_old
local f_old = state.f_old

-- optimize for a max of maxIter iterations
local nIter = 0
while nIter < maxIter do
-- keep track of nb of iterations
nIter = nIter + 1
state.nIter = state.nIter + 1

------------------------------------------------------------
-- compute gradient descent direction
------------------------------------------------------------
if state.nIter == 1 then
d = g:clone():mul(-1) -- -g
old_dirs = {}
old_stps = {}
Hdiag = 1
else
-- do lbfgs update (update memory)
local y = table.remove(state.dir_bufs) -- pop
local s = table.remove(state.stp_bufs)
y:add(g, -1, g_old) -- g - g_old
s:mul(d, t) -- d*t
local ys = y:dot(s) -- y*s
if ys > 1e-10 then
-- updating memory
if #old_dirs == nCorrection then
-- shift history by one (limited-memory)
local removed1 = table.remove(old_dirs, 1)
local removed2 = table.remove(old_stps, 1)
table.insert(state.dir_bufs, removed1)
table.insert(state.stp_bufs, removed2)
end

-- store new direction/step
table.insert(old_dirs, s)
table.insert(old_stps, y)

-- update scale of initial Hessian approximation
Hdiag = ys / y:dot(y) -- (y*y)
else
-- put y and s back into the buffer pool
table.insert(state.dir_bufs, y)
table.insert(state.stp_bufs, s)
end

-- compute the approximate (L-BFGS) inverse Hessian
-- multiplied by the gradient
local k = #old_dirs

-- need to be accessed element-by-element, so don't re-type tensor:
state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro
for i = 1,k do
ro[i] = 1 / old_stps[i]:dot(old_dirs[i])
end

-- iteration in L-BFGS loop collapsed to use just one buffer
local q = tmp1 -- reuse tmp1 for the q buffer
-- need to be accessed element-by-element, so don't re-type tensor:
state.al = state.al or torch.zeros(nCorrection); local al = state.al

q:mul(g, -1) -- -g
for i = k,1,-1 do
al[i] = old_dirs[i]:dot(q) * ro[i]
q:add(-al[i], old_stps[i])
end

-- multiply by initial Hessian
r = d -- share the same buffer, since we don't need the old d
r:mul(q, Hdiag) -- q[1] * Hdiag
for i = 1,k do
local be_i = old_stps[i]:dot(r) * ro[i]
r:add(al[i]-be_i, old_dirs[i])
end
-- final direction is in r/d (same object)
end
g_old = g_old or g:clone()
g_old:copy(g)
f_old = f

------------------------------------------------------------
-- compute step length
------------------------------------------------------------
-- directional derivative
local gtd = g:dot(d) -- g * d

-- check that progress can be made along that direction
if gtd > -tolX then
break
end

-- reset initial guess for step size
if state.nIter == 1 then
tmp1:copy(g):abs()
t = min(1,1/tmp1:sum()) * learningRate
else
t = learningRate
end

-- optional line search: user function
local lsFuncEval = 0
if lineSearch and type(lineSearch) == 'function' then
-- perform line search, using user function
f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
table.insert(f_hist, f)
else
-- no line search, simply move with fixed-step
x:add(t,d)
if nIter ~= maxIter then
-- re-evaluate the function only if this is not the last iteration:
-- in a stochastic setting there is no point in re-evaluating the
-- objective one extra time at the end
f,g = opfunc(x)
lsFuncEval = 1
table.insert(f_hist, f)
end
end

-- update func eval
currentFuncEval = currentFuncEval + lsFuncEval
state.funcEval = state.funcEval + lsFuncEval

------------------------------------------------------------
-- check conditions
------------------------------------------------------------
if nIter == maxIter then
-- no use to run tests
verbose('reached max number of iterations')
break
end

if currentFuncEval >= maxEval then
-- max nb of function evals
verbose('max nb of function evals')
break
end

tmp1:copy(g):abs()
if tmp1:sum() <= tolFun then
-- check optimality
verbose('optimality condition below tolFun')
break
end

tmp1:copy(d):mul(t):abs()
if tmp1:sum() <= tolX then
-- step size below tolX
verbose('step size below tolX')
break
end

if abs(f-f_old) < tolX then
-- function value changing less than tolX
verbose('function value changing less than tolX')
break
end
end

-- save state
state.old_dirs = old_dirs
state.old_stps = old_stps
state.Hdiag = Hdiag
state.g_old = g_old
state.f_old = f_old
state.t = t
state.d = d

-- return optimal x, and history of f(x)
return x,f_hist,currentFuncEval
end
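
For reference, a minimal usage sketch of optim.lbfgs (not part of the committed file): the names feval, x0 and cfg are invented for illustration, and a toy quadratic stands in for a real objective.

require 'torch'
require 'optim'

-- toy objective: f(x) = ||x - 1||^2, gradient df/dx = 2*(x - 1)
local function feval(x)
   local diff = x - 1
   return diff:dot(diff), diff * 2
end

local x0  = torch.zeros(10)
local cfg = {maxIter = 50}
local xstar, f_hist = optim.lbfgs(feval, x0, cfg)
-- xstar should be close to all-ones and f_hist[#f_hist] close to 0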

+ 192
- 0
contrib/torch/optim/lswolfe.lua View File

@@ -0,0 +1,192 @@
--[[ A Line Search satisfying the Wolfe conditions

ARGS:
- `opfunc` : a function (the objective) that takes a single input (X),
the point of evaluation, and returns f(X) and df/dX
- `x` : initial point / starting location
- `t` : initial step size
- `d` : descent direction
- `f` : initial function value
- `g` : gradient at initial location
- `gtd` : directional derivative at starting location
- `options.c1` : sufficient decrease parameter
- `options.c2` : curvature parameter
- `options.tolX` : minimum allowable step length
- `options.maxIter` : maximum nb of iterations

RETURN:
- `f` : function value at x+t*d
- `g` : gradient value at x+t*d
- `x` : the next x (=x+t*d)
- `t` : the step length
- `lsFuncEval` : the number of function evaluations
]]
function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options)
-- options
options = options or {}
local c1 = options.c1 or 1e-4
local c2 = options.c2 or 0.9
local tolX = options.tolX or 1e-9
local maxIter = options.maxIter or 20
local isverbose = options.verbose or false

-- some shortcuts
local abs = torch.abs
local min = math.min
local max = math.max

-- verbose function
local function verbose(...)
if isverbose then print('<optim.lswolfe> ', ...) end
end

-- evaluate objective and gradient using initial step
local x_init = x:clone()
x:add(t,d)
local f_new,g_new = opfunc(x)
local lsFuncEval = 1
local gtd_new = g_new * d

-- bracket an interval containing a point satisfying the Wolfe
-- criteria
local LSiter,t_prev,done = 0,0,false
local f_prev,g_prev,gtd_prev = f,g:clone(),gtd
local bracket,bracketFval,bracketGval
while LSiter < maxIter do
-- check conditions:
if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then
bracket = x.new{t_prev,t}
bracketFval = x.new{f_prev,f_new}
bracketGval = x.new(2,g_new:size(1))
bracketGval[1] = g_prev
bracketGval[2] = g_new
break

elseif abs(gtd_new) <= -c2*gtd then
bracket = x.new{t}
bracketFval = x.new{f_new}
bracketGval = x.new(1,g_new:size(1))
bracketGval[1] = g_new
done = true
break

elseif gtd_new >= 0 then
bracket = x.new{t_prev,t}
bracketFval = x.new{f_prev,f_new}
bracketGval = x.new(2,g_new:size(1))
bracketGval[1] = g_prev
bracketGval[2] = g_new
break

end

-- interpolate:
local tmp = t_prev
t_prev = t
local minStep = t + 0.01*(t-tmp)
local maxStep = t*10
t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev},
{t,f_new,gtd_new}},
minStep, maxStep)

-- next step:
f_prev = f_new
g_prev = g_new:clone()
gtd_prev = gtd_new
x[{}] = x_init
x:add(t,d)
f_new,g_new = opfunc(x)
lsFuncEval = lsFuncEval + 1
gtd_new = g_new * d
LSiter = LSiter + 1
end

-- reached max nb of iterations?
if LSiter == maxIter then
bracket = x.new{0,t}
bracketFval = x.new{f,f_new}
bracketGval = x.new(2,g_new:size(1))
bracketGval[1] = g
bracketGval[2] = g_new
end

-- zoom phase: we now have a point satisfying the criteria, or
-- a bracket around it. We refine the bracket until we find the
-- exact point satisfying the criteria
local insufProgress = false
local LOposRemoved = 0
while not done and LSiter < maxIter do
-- find high and low points in bracket
local f_LO,LOpos = bracketFval:min(1)
LOpos = LOpos[1] f_LO = f_LO[1]
local HIpos = -LOpos+3

-- compute new trial value
t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d},
{bracket[2],bracketFval[2],bracketGval[2]*d}})

-- test whether we are making sufficient progress
if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then
if insufProgress or t>=bracket:max() or t <= bracket:min() then
if abs(t-bracket:max()) < abs(t-bracket:min()) then
t = bracket:max()-0.1*(bracket:max()-bracket:min())
else
t = bracket:min()+0.1*(bracket:max()-bracket:min())
end
insufProgress = false
else
insufProgress = true
end
else
insufProgress = false
end

-- Evaluate new point
x[{}] = x_init
x:add(t,d)
f_new,g_new = opfunc(x)
lsFuncEval = lsFuncEval + 1
gtd_new = g_new * d
LSiter = LSiter + 1
if f_new > f + c1*t*gtd or f_new >= f_LO then
-- Armijo condition not satisfied or not lower than lowest point
bracket[HIpos] = t
bracketFval[HIpos] = f_new
bracketGval[HIpos] = g_new
else
if abs(gtd_new) <= - c2*gtd then
-- Wolfe conditions satisfied
done = true
elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then
-- Old HI becomes new LO
bracket[HIpos] = bracket[LOpos]
bracketFval[HIpos] = bracketFval[LOpos]
bracketGval[HIpos] = bracketGval[LOpos]
end
-- New point becomes new LO
bracket[LOpos] = t
bracketFval[LOpos] = f_new
bracketGval[LOpos] = g_new
end

-- done?
if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then
break
end
end

-- be verbose
if LSiter == maxIter then
verbose('reached max number of iterations')
end

-- return stuff
local _,LOpos = bracketFval:min(1)
LOpos = LOpos[1]
t = bracket[LOpos]
f_new = bracketFval[LOpos]
g_new = bracketGval[LOpos]
x[{}] = x_init
x:add(t,d)
return f_new,g_new,x,t,lsFuncEval
end
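
In practice optim.lswolfe is rarely called directly; it is plugged into optim.lbfgs through its lineSearch option. A hedged sketch (feval and cfg are invented for illustration; c1 and c2 keep their defaults of 1e-4 and 0.9):

require 'torch'
require 'optim'

local function feval(x)
   local diff = x - 1
   return diff:dot(diff), diff * 2
end

local cfg = {maxIter = 100, lineSearch = optim.lswolfe}
local xstar = optim.lbfgs(feval, torch.zeros(5), cfg)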

+ 86
- 0
contrib/torch/optim/nag.lua View File

@@ -0,0 +1,86 @@
----------------------------------------------------------------------
-- An implementation of SGD adapted with features of Nesterov's
-- Accelerated Gradient method, based on the paper
-- On the Importance of Initialization and Momentum in Deep Learning
-- Sutskever et al., ICML 2013
--
-- ARGS:
-- opfunc : a function that takes a single input (X), the point of
-- evaluation, and returns f(X) and df/dX
-- x : the initial point
-- state : a table describing the state of the optimizer; after each
-- call the state is modified
-- state.learningRate : learning rate
-- state.learningRateDecay : learning rate decay
-- state.weightDecay : weight decay
-- state.momentum : momentum
-- state.learningRates : vector of individual learning rates
--
-- RETURN:
-- x : the new x vector
-- f(x) : the function, evaluated before the update
--
-- (Dilip Krishnan, 2013)
--

function optim.nag(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
local lr = config.learningRate or 1e-3
local lrd = config.learningRateDecay or 0
local wd = config.weightDecay or 0
local mom = config.momentum or 0.9
local damp = config.dampening or mom
local lrs = config.learningRates
state.evalCounter = state.evalCounter or 0
local nevals = state.evalCounter

if mom <= 0 then
error('Momentum must be positive for Nesterov Accelerated Gradient')
end

-- (1) evaluate f(x) and df/dx
-- first step in the direction of the momentum vector

if state.dfdx then
x:add(mom, state.dfdx)
end
-- then compute the gradient at that point
-- (skip the look-ahead update above to recover plain SGD)
local fx,dfdx = opfunc(x)

-- (2) weight decay
if wd ~= 0 then
dfdx:add(wd, x)
end

-- (3) learning rate decay (annealing)
local clr = lr / (1 + nevals*lrd)

-- (4) apply momentum
if not state.dfdx then
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0)
else
state.dfdx:mul(mom)
end

-- (5) parameter update with single or individual learning rates
if lrs then
if not state.deltaParameters then
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
end
state.deltaParameters:copy(lrs):cmul(dfdx)
x:add(-clr, state.deltaParameters)
state.dfdx:add(-clr, state.deltaParameters)
else
x:add(-clr, dfdx)
state.dfdx:add(-clr, dfdx)
end

-- (6) update evaluation counter
state.evalCounter = state.evalCounter + 1

-- return x, f(x) before optimization
return x,{fx}
end
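
A hedged usage sketch for optim.nag (not part of the committed file); feval, params and nagState are invented names, and the quadratic stands in for a real training loss:

require 'torch'
require 'optim'

local params   = torch.randn(4)
local nagState = {learningRate = 0.01, momentum = 0.9}

local function feval(p)
   local diff = p - 2
   return diff:dot(diff), diff * 2   -- f(p), df/dp
end

-- reuse the same table so the momentum buffer persists across calls
for i = 1, 500 do
   optim.nag(feval, params, nagState)
end
-- params should now be close to 2 in every component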

+ 212
- 0
contrib/torch/optim/polyinterp.lua View File

@@ -0,0 +1,212 @@
local function isreal(x)
return x == x
end

local function isnan(x)
   -- NaN is the only value that does not compare equal to itself
   return x ~= x
end

local function roots(c)
local tol=1e-12
c[torch.lt(torch.abs(c),tol)]=0

local nonzero = torch.ne(c,0)
if nonzero:max() == 0 then
return 0
end

-- first non-zero
local _,pos = torch.max(nonzero,1)
pos = pos[1]
c=c[{ {pos,-1} }]

local nz = 0
for i=c:size(1),1,-1 do
if c[i] ~= 0 then
break
else
nz = nz + 1
end
end
c=c[{ {1,c:size(1)-nz} }]

local n = c:size(1)-1
if n == 1 then
local e = c.new({{-c[2]/c[1], 0}})
if nz > 0 then
return torch.cat(e, c.new(nz, 2):zero(), 1)
else
return e
end
elseif n > 1 then
local A = torch.diag(c.new(n-1):fill(1),-1)
A[1] = -c[{ {2,n+1} }]/c[1];
local e = torch.eig(A,'N')
if nz > 0 then
return torch.cat(e, c.new(nz,2):zero(), 1)
else
return e
end
else
return c.new(nz,2):zero()
end
end

local function real(x)
if type(x) == 'number' then return x end
return x[{ {} , 1}]
end

local function imag(x)
if type(x) == 'number' then return 0 end
if x:nDimension() == 1 then
return x.new(x:size(1)):zero()
else
return x[{ {}, 2}]
end
end

local function polyval(p,x)
local pwr = p:size(1)
if type(x) == 'number' then
local val = 0
p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end)
return val
else
local val = x.new(x:size(1))
p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end)
return val
end
end

----------------------------------------------------------------------
-- Minimum of interpolating polynomial based on function and
-- derivative values
--
-- ARGS:
-- points : N triplets (x,f,g), must be a Tensor
-- xmin : min value that brackets minimum (default: min of points)
-- xmax : max value that brackets maximum (default: max of points)
--
-- RETURN:
-- minPos : position of minimum
--
function optim.polyinterp(points,xminBound,xmaxBound)
-- locals
local sqrt = torch.sqrt
local mean = torch.mean
local max = math.max
local min = math.min

-- nb of points / order of polynomial
local nPoints = points:size(1)
local order = nPoints*2-1

-- returned values
local minPos

-- Code for most common case:
-- + cubic interpolation of 2 points w/ function and derivative values for both
-- + no xminBound/xmaxBound
if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then
-- Solution in this case (where x2 is the farthest point):
-- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
-- d2 = sqrt(d1^2 - g1*g2);
-- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
-- t_new = min(max(minPos,x1),x2);
local minVal,minPos = points[{ {},1 }]:min(1)
minVal = minVal[1] minPos = minPos[1]
local notMinPos = -minPos+3;

local d1 = points[{minPos,3}] + points[{notMinPos,3}]
- 3*(points[{minPos,2}]-points[{notMinPos,2}])
/ (points[{minPos,1}]-points[{notMinPos,1}]);
local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]);

if isreal(d2) then -- isreal()
local t = points[{notMinPos,1}] - (points[{notMinPos,1}]
- points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1)
/ (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2))

minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}])
else
minPos = mean(points[{{},1}])
end
return minPos
end

-- TODO: get the code below to work!
--error('<optim.polyinterp> extrapolation not implemented yet...')

-- Compute Bounds of Interpolation Area
local xmin = points[{{},1}]:min()
local xmax = points[{{},1}]:max()
xminBound = xminBound or xmin
xmaxBound = xmaxBound or xmax

-- Add constraints on function values
local A = points.new(nPoints*2,order+1):zero()
local b = points.new(nPoints*2,1):zero()
for i = 1,nPoints do
local constraint = points.new(order+1):zero()
for j = order,0,-1 do
constraint[order-j+1] = points[{i,1}]^j
end
A[i] = constraint
b[i] = points[{i,2}]
end

-- Add constraints based on derivatives
for i = 1,nPoints do
local constraint = points.new(order+1):zero()
for j = 1,order do
constraint[j] = (order-j+1)*points[{i,1}]^(order-j)
end
A[nPoints+i] = constraint
b[nPoints+i] = points[{i,3}]
end

-- Find interpolating polynomial
local res = torch.gels(b,A)
local params = res[{ {1,nPoints*2} }]:squeeze()

params[torch.le(torch.abs(params),1e-12)]=0

-- Compute Critical Points
local dParams = points.new(order):zero();
for i = 1,params:size(1)-1 do
dParams[i] = params[i]*(order-i+1)
end

-- nan/inf?
local nans = false
if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then
nans = true
end

local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}])
if not nans then
local cproots = roots(dParams)
local cpi = points.new(cp:size(1),2):zero()
cpi[{ {1,cp:size(1)} , 1 }] = cp
cp = torch.cat(cpi,cproots,1)
end

-- Test Critical Points
local fmin = math.huge
-- Default to Bisection if no critical points valid:
minPos = (xminBound+xmaxBound)/2
for i = 1,cp:size(1) do
local xCP = cp[{ {i,i} , {} }]
local ixCP = imag(xCP)[1]
local rxCP = real(xCP)[1]
if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then
local fCP = polyval(params,rxCP)
if fCP < fmin then
minPos = rxCP
fmin = fCP
end
end
end
return minPos,fmin
end
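
A small worked example of the common two-point cubic case (not part of the committed file): for f(x) = (x - 2)^2 sampled at x = 0 and x = 3, the interpolated minimizer is exactly 2.

require 'torch'
require 'optim'

-- each row is a triplet (x, f(x), f'(x)) for f(x) = (x - 2)^2
local points = torch.Tensor{{0, 4, -4},
                            {3, 1,  2}}
local minPos = optim.polyinterp(points)
print(minPos)   -- 2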

+ 58
- 0
contrib/torch/optim/rmsprop.lua View File

@@ -0,0 +1,58 @@
--[[ An implementation of RMSprop

ARGS:

- `opfunc` : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `config` : a table with configuration parameters for the optimizer
- `config.learningRate` : learning rate
- `config.alpha` : smoothing constant
- `config.epsilon` : small constant added to the denominator for numerical stability
- `config.weightDecay` : weight decay
- `config.initialMean` : initial value for the mean-square accumulator (default 0)
- `state` : a table describing the state of the optimizer;
after each call the state is modified
- `state.m` : leaky sum of squares of parameter gradients
- `state.tmp` : its square root (with epsilon smoothing)

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

]]

function optim.rmsprop(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
local lr = config.learningRate or 1e-2
local alpha = config.alpha or 0.99
local epsilon = config.epsilon or 1e-8
local wd = config.weightDecay or 0
local mfill = config.initialMean or 0

-- (1) evaluate f(x) and df/dx
local fx, dfdx = opfunc(x)

-- (2) weight decay
if wd ~= 0 then
dfdx:add(wd, x)
end

-- (3) initialize mean square values and square gradient storage
if not state.m then
state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill)
state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
end

-- (4) calculate new (leaky) mean squared values
state.m:mul(alpha)
state.m:addcmul(1.0-alpha, dfdx, dfdx)

-- (5) perform update
state.tmp:sqrt(state.m):add(epsilon)
x:addcdiv(-lr, dfdx, state.tmp)

-- return x*, f(x) before optimization
return x, {fx}
end
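
A hedged usage sketch for optim.rmsprop (not part of the committed file); feval and rmsState are invented names:

require 'torch'
require 'optim'

local x        = torch.zeros(3)
local rmsState = {learningRate = 1e-2, alpha = 0.99, epsilon = 1e-8}

local function feval(p)
   local diff = p - 5
   return diff:dot(diff), diff * 2
end

-- the same state table must be reused so state.m accumulates across calls
for i = 1, 2000 do
   optim.rmsprop(feval, x, rmsState)
end
-- x should now be close to 5 in every component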

+ 103
- 0
contrib/torch/optim/rprop.lua View File

@@ -0,0 +1,103 @@
--[[ A plain implementation of RPROP

ARGS:
- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.stepsize` : initial step size, common to all components
- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2)
- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5)
- `state.stepsizemax` : maximum stepsize allowed (default 50)
- `state.stepsizemin` : minimum stepsize allowed (default 1e-6)
- `state.niter` : number of iterations (default 1)

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

(Martin Riedmiller, Koray Kavukcuoglu 2013)
--]]
function optim.rprop(opfunc, x, config, state)
if config == nil and state == nil then
print('no state table given, RPROP initializing from defaults')
end
-- (0) get/update state
local config = config or {}
local state = state or config
local stepsize = config.stepsize or 0.1
local etaplus = config.etaplus or 1.2
local etaminus = config.etaminus or 0.5
local stepsizemax = config.stepsizemax or 50.0
local stepsizemin = config.stepsizemin or 1E-06
local niter = config.niter or 1

local hfx = {}

for i=1,niter do

-- (1) evaluate f(x) and df/dx
local fx,dfdx = opfunc(x)

-- init temp storage
if not state.delta then
state.delta = dfdx.new(dfdx:size()):zero()
state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
state.sign = dfdx.new(dfdx:size())
state.psign = torch.ByteTensor(dfdx:size())
state.nsign = torch.ByteTensor(dfdx:size())
state.zsign = torch.ByteTensor(dfdx:size())
state.dminmax = torch.ByteTensor(dfdx:size())
if torch.type(x)=='torch.CudaTensor' then
-- Push to GPU
state.psign = state.psign:cuda()
state.nsign = state.nsign:cuda()
state.zsign = state.zsign:cuda()
state.dminmax = state.dminmax:cuda()
end
end

-- sign of derivative from last step to this one
torch.cmul(state.sign, dfdx, state.delta)
torch.sign(state.sign, state.sign)

-- get indices of >0, <0 and ==0 entries
state.sign.gt(state.psign, state.sign, 0)
state.sign.lt(state.nsign, state.sign, 0)
state.sign.eq(state.zsign, state.sign, 0)

-- get step size updates
state.sign[state.psign] = etaplus
state.sign[state.nsign] = etaminus
state.sign[state.zsign] = 1

-- update stepsizes with step size updates
state.stepsize:cmul(state.sign)

-- threshold step sizes
-- >50 => 50
state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
state.stepsize[state.dminmax] = stepsizemax
-- <1e-6 ==> 1e-6
state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
state.stepsize[state.dminmax] = stepsizemin

-- for dir<0, dfdx=0
-- for dir>=0 dfdx=dfdx
dfdx[state.nsign] = 0
-- state.sign = sign(dfdx)
torch.sign(state.sign,dfdx)

-- update weights
x:addcmul(-1,state.sign,state.stepsize)

-- update state.dfdx with current dfdx
state.delta:copy(dfdx)

table.insert(hfx,fx)
end

-- return x*, f(x) before optimization
return x,hfx
end
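
A hedged usage sketch for optim.rprop (not part of the committed file); Rprop adapts one step size per parameter and is meant for full-batch gradients. feval and rpropState are invented names:

require 'torch'
require 'optim'

local x          = torch.randn(6)
local rpropState = {stepsize = 0.1, etaplus = 1.2, etaminus = 0.5}

local function feval(p)
   local diff = p + 1
   return diff:dot(diff), diff * 2
end

for i = 1, 200 do
   optim.rprop(feval, x, rpropState)
end
-- x should converge towards -1 in every component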

+ 90
- 0
contrib/torch/optim/sgd.lua View File

@@ -0,0 +1,90 @@
--[[ A plain implementation of SGD

ARGS:

- `opfunc` : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `config` : a table with configuration parameters for the optimizer
- `config.learningRate` : learning rate
- `config.learningRateDecay` : learning rate decay
- `config.weightDecay` : weight decay
- `config.weightDecays` : vector of individual weight decays
- `config.momentum` : momentum
- `config.dampening` : dampening for momentum
- `config.nesterov` : enables Nesterov momentum
- `config.learningRates` : vector of individual learning rates
- `state` : a table describing the state of the optimizer; after each
call the state is modified
- `state.evalCounter` : evaluation counter (optional: 0, by default)

RETURN:
- `x` : the new x vector
- `f(x)` : the function, evaluated before the update

(Clement Farabet, 2012)
]]
function optim.sgd(opfunc, x, config, state)
-- (0) get/update state
local config = config or {}
local state = state or config
local lr = config.learningRate or 1e-3
local lrd = config.learningRateDecay or 0
local wd = config.weightDecay or 0
local mom = config.momentum or 0
local damp = config.dampening or mom
local nesterov = config.nesterov or false
local lrs = config.learningRates
local wds = config.weightDecays
state.evalCounter = state.evalCounter or 0
local nevals = state.evalCounter
assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening")

-- (1) evaluate f(x) and df/dx
local fx,dfdx = opfunc(x)

-- (2) weight decay with single or individual parameters
if wd ~= 0 then
dfdx:add(wd, x)
elseif wds then
if not state.decayParameters then
state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
end
state.decayParameters:copy(wds):cmul(x)
dfdx:add(state.decayParameters)
end

-- (3) apply momentum
if mom ~= 0 then
if not state.dfdx then
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx)
else
state.dfdx:mul(mom):add(1-damp, dfdx)
end
if nesterov then
dfdx:add(mom, state.dfdx)
else
dfdx = state.dfdx
end
end

-- (4) learning rate decay (annealing)
local clr = lr / (1 + nevals*lrd)

-- (5) parameter update with single or individual learning rates
if lrs then
if not state.deltaParameters then
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
end
state.deltaParameters:copy(lrs):cmul(dfdx)
x:add(-clr, state.deltaParameters)
else
x:add(-clr, dfdx)
end

-- (6) update evaluation counter
state.evalCounter = state.evalCounter + 1

-- return x*, f(x) before optimization
return x,{fx}
end
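
A hedged usage sketch for optim.sgd (not part of the committed file); feval and sgdState are invented names, and in real use feval would evaluate a model on a mini-batch:

require 'torch'
require 'optim'

local x        = torch.zeros(2)
local sgdState = {learningRate = 0.1, momentum = 0.9, dampening = 0,
                  nesterov = true}

local function feval(p)
   local diff = p - 3
   return diff:dot(diff), diff * 2
end

for i = 1, 500 do
   local _, fs = optim.sgd(feval, x, sgdState)
   -- fs[1] holds the loss evaluated before this update
end
-- x should now be close to 3 in both components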
