@@ -1259,6 +1259,7 @@ IF(ENABLE_TORCH MATCHES "ON") | |||
ADD_SUBDIRECTORY(contrib/torch/paths) | |||
ADD_SUBDIRECTORY(contrib/torch/torch7) | |||
ADD_SUBDIRECTORY(contrib/torch/nn) | |||
ADD_SUBDIRECTORY(contrib/torch/optim) | |||
ADD_SUBDIRECTORY(contrib/torch/decisiontree) | |||
SET(WITH_TORCH 1) | |||
ELSE() |
@@ -0,0 +1,5 @@ | |||
CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) | |||
SET(src) | |||
FILE(GLOB luasrc *.lua) | |||
ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}") |
@@ -0,0 +1,35 @@ | |||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) | |||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) | |||
Copyright (c) 2011-2013 NYU (Clement Farabet) | |||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) | |||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio) | |||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in the | |||
documentation and/or other materials provided with the distribution. | |||
3. Neither the names of NEC Laboratories American and IDIAP Research | |||
Institute nor the names of its contributors may be used to endorse or | |||
promote products derived from this software without specific prior | |||
written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
POSSIBILITY OF SUCH DAMAGE. |
@@ -0,0 +1,361 @@ | |||
--[[ A Confusion Matrix class | |||
Example: | |||
conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix | |||
conf:zero() -- reset matrix | |||
for i = 1,N do | |||
conf:add( neuralnet:forward(sample), label ) -- accumulate errors | |||
end | |||
print(conf) -- print matrix | |||
image.display(conf:render()) -- render matrix | |||
]] | |||
local ConfusionMatrix = torch.class('optim.ConfusionMatrix') | |||
function ConfusionMatrix:__init(nclasses, classes) | |||
if type(nclasses) == 'table' then | |||
classes = nclasses | |||
nclasses = #classes | |||
end | |||
self.mat = torch.LongTensor(nclasses,nclasses):zero() | |||
self.valids = torch.FloatTensor(nclasses):zero() | |||
self.unionvalids = torch.FloatTensor(nclasses):zero() | |||
self.nclasses = nclasses | |||
self.totalValid = 0 | |||
self.averageValid = 0 | |||
self.classes = classes or {} | |||
-- buffers | |||
self._mat_flat = self.mat:view(-1) | |||
self._target = torch.FloatTensor() | |||
self._prediction = torch.FloatTensor() | |||
self._max = torch.FloatTensor() | |||
self._pred_idx = torch.LongTensor() | |||
self._targ_idx = torch.LongTensor() | |||
end | |||
-- takes scalar prediction and target as input | |||
function ConfusionMatrix:_add(p, t) | |||
assert(p and type(p) == 'number') | |||
assert(t and type(t) == 'number') | |||
-- non-positive values are considered missing | |||
-- and therefore ignored | |||
if t > 0 then | |||
self.mat[t][p] = self.mat[t][p] + 1 | |||
end | |||
end | |||
function ConfusionMatrix:add(prediction, target) | |||
if type(prediction) == 'number' then | |||
-- comparing numbers | |||
self:_add(prediction, target) | |||
else | |||
self._prediction:resize(prediction:size()):copy(prediction) | |||
assert(prediction:dim() == 1) | |||
if type(target) == 'number' then | |||
-- prediction is a vector, so target is assumed to be an index
self._max:max(self._pred_idx, self._prediction, 1) | |||
self:_add(self._pred_idx[1], target) | |||
else | |||
-- both prediction and target are vectors | |||
assert(target:dim() == 1) | |||
self._target:resize(target:size()):copy(target) | |||
self._max:max(self._targ_idx, self._target, 1) | |||
self._max:max(self._pred_idx, self._prediction, 1) | |||
self:_add(self._pred_idx[1], self._targ_idx[1]) | |||
end | |||
end | |||
end | |||
function ConfusionMatrix:batchAdd(predictions, targets) | |||
local preds, targs, __ | |||
self._prediction:resize(predictions:size()):copy(predictions) | |||
if predictions:dim() == 1 then | |||
-- predictions is a vector of classes | |||
preds = self._prediction | |||
elseif predictions:dim() == 2 then | |||
-- prediction is a matrix of class likelihoods | |||
if predictions:size(2) == 1 then | |||
-- or prediction just needs flattening | |||
preds = self._prediction:select(2,1) | |||
else | |||
self._max:max(self._pred_idx, self._prediction, 2) | |||
preds = self._pred_idx:select(2,1) | |||
end | |||
else | |||
error("predictions has invalid number of dimensions") | |||
end | |||
self._target:resize(targets:size()):copy(targets) | |||
if targets:dim() == 1 then | |||
-- targets is a vector of classes | |||
targs = self._target | |||
elseif targets:dim() == 2 then | |||
-- targets is a matrix of one-hot rows | |||
if targets:size(2) == 1 then | |||
-- or targets just needs flattening | |||
targs = self._target:select(2,1) | |||
else | |||
self._max:max(self._targ_idx, self._target, 2) | |||
targs = self._targ_idx:select(2,1) | |||
end | |||
else | |||
error("targets has invalid number of dimensions") | |||
end | |||
-- non-positive values are considered missing and therefore ignored | |||
local mask = targs:ge(1) | |||
targs = targs[mask] | |||
preds = preds[mask] | |||
self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility | |||
preds = preds:typeAs(targs) | |||
assert(self.mat:isContiguous() and self.mat:stride(2) == 1) | |||
local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat) | |||
local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1)) | |||
self._mat_flat:indexAdd(1, indices, ones) | |||
end | |||
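--[[ Illustrative batch usage sketch ('output' and 'targets' are placeholder names):
'output' is an N x nclasses tensor of class scores and 'targets' is an N-element
vector of class indices (or an N x nclasses one-hot matrix).
   conf:batchAdd(output, targets)
   conf:updateValids()
   print(conf.totalValid * 100 .. '% globally correct')
--]]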
function ConfusionMatrix:zero() | |||
self.mat:zero() | |||
self.valids:zero() | |||
self.unionvalids:zero() | |||
self.totalValid = 0 | |||
self.averageValid = 0 | |||
end | |||
local function isNaN(number) | |||
return number ~= number | |||
end | |||
function ConfusionMatrix:updateValids() | |||
local total = 0 | |||
for t = 1,self.nclasses do | |||
self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum() | |||
self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t]) | |||
total = total + self.mat[t][t] | |||
end | |||
self.totalValid = total / self.mat:sum() | |||
self.averageValid = 0 | |||
self.averageUnionValid = 0 | |||
local nvalids = 0 | |||
local nunionvalids = 0 | |||
for t = 1,self.nclasses do | |||
if not isNaN(self.valids[t]) then | |||
self.averageValid = self.averageValid + self.valids[t] | |||
nvalids = nvalids + 1 | |||
end | |||
if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then | |||
self.averageUnionValid = self.averageUnionValid + self.unionvalids[t] | |||
nunionvalids = nunionvalids + 1 | |||
end | |||
end | |||
self.averageValid = self.averageValid / nvalids | |||
self.averageUnionValid = self.averageUnionValid / nunionvalids | |||
end | |||
-- Calculating FAR/FRR associated with the confusion matrix | |||
function ConfusionMatrix:farFrr() | |||
local cmat = self.mat | |||
local noOfClasses = cmat:size()[1] | |||
self._frrs = self._frrs or torch.zeros(noOfClasses) | |||
self._frrs:zero() | |||
self._classFrrs = self._classFrrs or torch.zeros(noOfClasses) | |||
self._classFrrs:zero() | |||
self._classFrrs:add(-1) | |||
self._fars = self._fars or torch.zeros(noOfClasses) | |||
self._fars:zero() | |||
self._classFars = self._classFars or torch.zeros(noOfClasses) | |||
self._classFars:zero() | |||
self._classFars:add(-1) | |||
local classSamplesCount = cmat:sum(2) | |||
local indx = 1 | |||
for i=1,noOfClasses do | |||
if classSamplesCount[i][1] ~= 0 then | |||
self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1] | |||
self._classFrrs[i] = self._frrs[indx] | |||
-- Calculating FARs | |||
local farNumerator = 0 | |||
local farDenominator = 0 | |||
for j=1, noOfClasses do | |||
if i ~= j then | |||
if classSamplesCount[j][1] ~= 0 then | |||
farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1] | |||
farDenominator = farDenominator + 1 | |||
end | |||
end | |||
end | |||
self._fars[indx] = farNumerator/farDenominator | |||
self._classFars[i] = self._fars[indx] | |||
indx = indx + 1 | |||
end | |||
end | |||
indx = indx - 1 | |||
local returnFrrs = self._frrs[{{1, indx}}] | |||
local returnFars = self._fars[{{1, indx}}] | |||
return self._classFrrs, self._classFars, returnFrrs, returnFars | |||
end | |||
local function log10(n) | |||
if math.log10 then | |||
return math.log10(n) | |||
else | |||
return math.log(n) / math.log(10) | |||
end | |||
end | |||
function ConfusionMatrix:__tostring__() | |||
self:updateValids() | |||
local str = {'ConfusionMatrix:\n'} | |||
local nclasses = self.nclasses | |||
table.insert(str, '[') | |||
local maxCnt = self.mat:max() | |||
local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt))) | |||
for t = 1,nclasses do | |||
local pclass = self.valids[t] * 100 | |||
pclass = string.format('%2.3f', pclass) | |||
if t == 1 then | |||
table.insert(str, '[') | |||
else | |||
table.insert(str, ' [') | |||
end | |||
for p = 1,nclasses do | |||
table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p])) | |||
end | |||
if self.classes and self.classes[1] then | |||
if t == nclasses then | |||
table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') | |||
else | |||
table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n') | |||
end | |||
else | |||
if t == nclasses then | |||
table.insert(str, ']] ' .. pclass .. '% \n') | |||
else | |||
table.insert(str, '] ' .. pclass .. '% \n') | |||
end | |||
end | |||
end | |||
table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n') | |||
table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n') | |||
table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%') | |||
return table.concat(str) | |||
end | |||
function ConfusionMatrix:render(sortmode, display, block, legendwidth) | |||
-- args | |||
local confusion = self.mat:double() | |||
local classes = self.classes | |||
local sortmode = sortmode or 'score' -- 'score' or 'occurrence' | |||
local block = block or 25 | |||
local legendwidth = legendwidth or 200 | |||
local display = display or false | |||
-- legends | |||
local legend = { | |||
['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]', | |||
['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]' | |||
} | |||
-- parse matrix / normalize / count scores | |||
local diag = torch.FloatTensor(#classes) | |||
local freqs = torch.FloatTensor(#classes) | |||
local unconf = confusion | |||
local confusion = confusion:clone() | |||
local corrects = 0 | |||
local total = 0 | |||
for target = 1,#classes do | |||
freqs[target] = confusion[target]:sum() | |||
corrects = corrects + confusion[target][target] | |||
total = total + freqs[target] | |||
confusion[target]:div( math.max(confusion[target]:sum(),1) ) | |||
diag[target] = confusion[target][target] | |||
end | |||
-- accuracies | |||
local accuracy = corrects / total * 100 | |||
local perclass = 0 | |||
local total = 0 | |||
for target = 1,#classes do | |||
if confusion[target]:sum() > 0 then | |||
perclass = perclass + diag[target] | |||
total = total + 1 | |||
end | |||
end | |||
perclass = perclass / total * 100 | |||
freqs:div(unconf:sum()) | |||
-- sort matrix | |||
if sortmode == 'score' then | |||
_,order = torch.sort(diag,1,true) | |||
elseif sortmode == 'occurrence' then | |||
_,order = torch.sort(freqs,1,true) | |||
else | |||
error('sort mode must be one of: score | occurrence') | |||
end | |||
-- render matrix | |||
local render = torch.zeros(#classes*block, #classes*block) | |||
for target = 1,#classes do | |||
for prediction = 1,#classes do | |||
render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]] | |||
end | |||
end | |||
-- add grid | |||
for target = 1,#classes do | |||
render[{ {target*block},{} }] = 0.1 | |||
render[{ {},{target*block} }] = 0.1 | |||
end | |||
-- create rendering | |||
require 'image' | |||
require 'qtwidget' | |||
require 'qttorch' | |||
local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] ) | |||
image.display{image=render, win=win1} | |||
-- add legend | |||
for i in ipairs(classes) do | |||
-- background cell | |||
win1:setcolor{r=0,g=0,b=0} | |||
win1:rectangle((#render)[2],(i-1)*block,legendwidth,block) | |||
win1:fill() | |||
-- % | |||
win1:setfont(qt.QFont{serif=false, size=fontsize}) | |||
local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4 | |||
win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2} | |||
win1:moveto((#render)[2]+10,i*block-block/3) | |||
win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100)) | |||
-- legend | |||
win1:setfont(qt.QFont{serif=false, size=fontsize}) | |||
local gscale = diag[order[i]]*0.8+0.2 | |||
win1:setcolor{r=gscale,g=gscale,b=gscale} | |||
win1:moveto(120+(#render)[2]+10,i*block-block/3) | |||
win1:show(classes[order[i]]) | |||
for j in ipairs(classes) do | |||
-- scores | |||
local score = confusion[order[j]][order[i]] | |||
local gscale = (1-score)*(score*0.8+0.2) | |||
win1:setcolor{r=gscale,g=gscale,b=gscale} | |||
win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3) | |||
win1:show(string.format('%02.0f',math.floor(score*100+0.5))) | |||
end | |||
end | |||
-- generate tensor | |||
local t = win1:image():toTensor() | |||
-- display | |||
if display then | |||
image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)} | |||
end | |||
-- return rendering | |||
return t | |||
end |
@@ -0,0 +1,190 @@ | |||
--[[ Logger: a simple class to log symbols during training, | |||
and automate plot generation | |||
Example: | |||
logger = optim.Logger('somefile.log') -- file to save stuff | |||
for i = 1,N do -- log some symbols during | |||
train_error = ... -- training/testing | |||
test_error = ... | |||
logger:add{['training error'] = train_error, | |||
['test error'] = test_error} | |||
end | |||
logger:style{['training error'] = '-', -- define styles for plots | |||
['test error'] = '-'} | |||
logger:plot() -- and plot | |||
---- OR --- | |||
logger = optim.Logger('somefile.log') -- file to save stuff | |||
logger:setNames{'training error', 'test error'} | |||
for i = 1,N do -- log some symbols during | |||
train_error = ... -- training/testing | |||
test_error = ... | |||
logger:add{train_error, test_error} | |||
end | |||
logger:style{'-', '-'} -- define styles for plots | |||
logger:plot() -- and plot | |||
----------- | |||
logger:setlogscale(true) -- enable logscale on Y-axis | |||
logger:plot() -- and plot | |||
]] | |||
require 'xlua' | |||
local Logger = torch.class('optim.Logger') | |||
function Logger:__init(filename, timestamp) | |||
if filename then | |||
self.name = filename | |||
os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"') | |||
if timestamp then | |||
-- append timestamp to create unique log file | |||
filename = filename .. '-'..os.date("%Y_%m_%d_%X") | |||
end | |||
self.file = io.open(filename,'w') | |||
self.epsfile = self.name .. '.eps' | |||
else | |||
self.file = io.stdout | |||
self.name = 'stdout' | |||
print('<Logger> warning: no path provided, logging to std out') | |||
end | |||
self.empty = true | |||
self.symbols = {} | |||
self.styles = {} | |||
self.names = {} | |||
self.idx = {} | |||
self.figure = nil | |||
self.showPlot = true | |||
self.plotRawCmd = nil | |||
self.defaultStyle = '+' | |||
self.logscale = false | |||
end | |||
function Logger:setNames(names) | |||
self.names = names | |||
self.empty = false | |||
self.nsymbols = #names | |||
for k,key in pairs(names) do | |||
self.file:write(key .. '\t') | |||
self.symbols[k] = {} | |||
self.styles[k] = {self.defaultStyle} | |||
self.idx[key] = k | |||
end | |||
self.file:write('\n') | |||
self.file:flush() | |||
return self | |||
end | |||
function Logger:add(symbols) | |||
-- (1) first time ? print symbols' names on first row | |||
if self.empty then | |||
self.empty = false | |||
self.nsymbols = #symbols | |||
for k,val in pairs(symbols) do | |||
self.file:write(k .. '\t') | |||
self.symbols[k] = {} | |||
self.styles[k] = {self.defaultStyle} | |||
self.names[k] = k | |||
end | |||
self.idx = self.names | |||
self.file:write('\n') | |||
end | |||
-- (2) print all symbols on one row | |||
for k,val in pairs(symbols) do | |||
if type(val) == 'number' then | |||
self.file:write(string.format('%11.4e',val) .. '\t') | |||
elseif type(val) == 'string' then | |||
self.file:write(val .. '\t') | |||
else | |||
xlua.error('can only log numbers and strings', 'Logger') | |||
end | |||
end | |||
self.file:write('\n') | |||
self.file:flush() | |||
-- (3) save symbols in internal table | |||
for k,val in pairs(symbols) do | |||
table.insert(self.symbols[k], val) | |||
end | |||
end | |||
function Logger:style(symbols) | |||
for name,style in pairs(symbols) do | |||
if type(style) == 'string' then | |||
self.styles[name] = {style} | |||
elseif type(style) == 'table' then | |||
self.styles[name] = style | |||
else | |||
xlua.error('style should be a string or a table of strings','Logger') | |||
end | |||
end | |||
return self | |||
end | |||
function Logger:setlogscale(state) | |||
self.logscale = state | |||
end | |||
function Logger:display(state) | |||
self.showPlot = state | |||
end | |||
function Logger:plot(...) | |||
if not xlua.require('gnuplot') then | |||
if not self.warned then | |||
print('<Logger> warning: cannot plot with this version of Torch') | |||
self.warned = true | |||
end | |||
return | |||
end | |||
local plotit = false | |||
local plots = {} | |||
local plotsymbol = | |||
function(name,list) | |||
if #list > 1 then | |||
local nelts = #list | |||
local plot_y = torch.Tensor(nelts) | |||
for i = 1,nelts do | |||
plot_y[i] = list[i] | |||
end | |||
for _,style in ipairs(self.styles[name]) do | |||
table.insert(plots, {self.names[name], plot_y, style}) | |||
end | |||
plotit = true | |||
end | |||
end | |||
local args = {...} | |||
if not args[1] then -- plot all symbols | |||
for name,list in pairs(self.symbols) do | |||
plotsymbol(name,list) | |||
end | |||
else -- plot given symbols | |||
for _,name in ipairs(args) do | |||
plotsymbol(self.idx[name], self.symbols[self.idx[name]]) | |||
end | |||
end | |||
if plotit then | |||
if self.showPlot then | |||
self.figure = gnuplot.figure(self.figure) | |||
if self.logscale then gnuplot.logscale('on') end | |||
gnuplot.plot(plots) | |||
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end | |||
gnuplot.grid('on') | |||
gnuplot.title('<Logger::' .. self.name .. '>') | |||
end | |||
if self.epsfile then | |||
os.execute('rm -f "' .. self.epsfile .. '"') | |||
local epsfig = gnuplot.epsfigure(self.epsfile) | |||
if self.logscale then gnuplot.logscale('on') end | |||
gnuplot.plot(plots) | |||
if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end | |||
gnuplot.grid('on') | |||
gnuplot.title('<Logger::' .. self.name .. '>') | |||
gnuplot.plotflush() | |||
gnuplot.close(epsfig) | |||
end | |||
end | |||
end |
@@ -0,0 +1,55 @@ | |||
--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701 | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `config` : a table of hyper-parameters | |||
- `config.rho` : interpolation parameter | |||
- `config.eps` : for numerical stability | |||
- `config.weightDecay` : weight decay | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.paramVariance` : vector of temporal variances of parameters | |||
- `state.accDelta` : vector of accumulated delta of gradients
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
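--[[ Illustrative usage sketch (placeholder names: 'feval' computes the loss and
gradient for the current minibatch, 'params' is a flattened parameter tensor,
e.g. from module:getParameters()). Reuse the same state table on every call so
that paramVariance and accDelta persist:
   local state = {rho = 0.9, eps = 1e-6}
   for i = 1, numIterations do
      local _, fs = optim.adadelta(feval, params, state)
      -- fs[1] is the loss evaluated before the update
   end
--]]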
function optim.adadelta(opfunc, x, config, state) | |||
-- (0) get/update state | |||
if config == nil and state == nil then | |||
print('no state table, ADADELTA initializing') | |||
end | |||
local config = config or {} | |||
local state = state or config | |||
local rho = config.rho or 0.9 | |||
local eps = config.eps or 1e-6 | |||
local wd = config.weightDecay or 0 | |||
state.evalCounter = state.evalCounter or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) parameter update | |||
if not state.paramVariance then | |||
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
end | |||
state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx) | |||
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt() | |||
state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx) | |||
x:add(-1, state.delta) | |||
state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta) | |||
-- (4) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
@@ -0,0 +1,55 @@ | |||
--[[ ADAGRAD implementation for SGD | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.learningRate` : learning rate | |||
- `state.paramVariance` : vector of temporal variances of parameters | |||
- `state.weightDecay` : scalar that controls weight decay | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
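--[[ Illustrative sketch on a toy problem (all names below are placeholders):
minimize f(x) = 0.5 * ||x||^2, whose gradient is x itself; the same table is
passed as both config and state, which is the common single-table calling pattern.
   local x = torch.randn(10)
   local feval = function(x) return 0.5 * x:dot(x), x:clone() end
   local state = {learningRate = 0.1}
   for i = 1, 100 do
      optim.adagrad(feval, x, state)
   end
--]]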
function optim.adagrad(opfunc, x, config, state) | |||
-- (0) get/update state | |||
if config == nil and state == nil then | |||
print('no state table, ADAGRAD initializing') | |||
end | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay with a single parameter | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (4) parameter update with single or individual learning rates | |||
if not state.paramVariance then | |||
state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero() | |||
state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.paramVariance:addcmul(1,dfdx,dfdx) | |||
state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt() | |||
x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10)) | |||
-- (5) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
@@ -0,0 +1,72 @@ | |||
--[[ An implementation of Adam https://arxiv.org/abs/1412.6980 | |||
ARGS: | |||
- 'opfunc' : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point | |||
- 'config` : a table with configuration parameters for the optimizer | |||
- 'config.learningRate' : learning rate | |||
- `config.learningRateDecay` : learning rate decay | |||
- 'config.beta1' : first moment coefficient | |||
- 'config.beta2' : second moment coefficient | |||
- 'config.epsilon' : for numerical stability | |||
- 'config.weightDecay' : weight decay | |||
- 'state' : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
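--[[ Illustrative training-loop sketch ('feval', 'params' and 'maxIter' are
placeholders, as in the other optimizers). The state table keeps the moment
estimates m, v and the step counter t, so the same table must be passed on
every iteration:
   local optimState = {learningRate = 1e-3, beta1 = 0.9, beta2 = 0.999}
   for iter = 1, maxIter do
      local _, fs = optim.adam(feval, params, optimState)
   end
--]]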
function optim.adam(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 0.001 | |||
local lrd = config.learningRateDecay or 0 | |||
local beta1 = config.beta1 or 0.9 | |||
local beta2 = config.beta2 or 0.999 | |||
local epsilon = config.epsilon or 1e-8 | |||
local wd = config.weightDecay or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- Initialization | |||
state.t = state.t or 0 | |||
-- Exponential moving average of gradient values | |||
state.m = state.m or x.new(dfdx:size()):zero() | |||
-- Exponential moving average of squared gradient values | |||
state.v = state.v or x.new(dfdx:size()):zero() | |||
-- A tmp tensor to hold the sqrt(v) + epsilon | |||
state.denom = state.denom or x.new(dfdx:size()):zero() | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + state.t*lrd) | |||
state.t = state.t + 1 | |||
-- Decay the first and second moment running average coefficient | |||
state.m:mul(beta1):add(1-beta1, dfdx) | |||
state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) | |||
state.denom:copy(state.v):sqrt():add(epsilon) | |||
local biasCorrection1 = 1 - beta1^state.t | |||
local biasCorrection2 = 1 - beta2^state.t | |||
local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1 | |||
-- (4) update x | |||
x:addcdiv(-stepSize, state.m, state.denom) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
@@ -0,0 +1,66 @@ | |||
--[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf | |||
ARGS: | |||
- 'opfunc' : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- 'x' : the initial point | |||
- 'config` : a table with configuration parameters for the optimizer | |||
- 'config.learningRate' : learning rate | |||
- 'config.beta1' : first moment coefficient | |||
- 'config.beta2' : second moment coefficient | |||
- 'config.epsilon' : for numerical stability | |||
- 'state' : a table describing the state of the optimizer; | |||
after each call the state is modified. | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
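--[[ Illustrative sketch: same calling convention as optim.adam; here the
hyper-parameters (config) and the evolving state are kept in separate tables
('feval' and 'params' are placeholders):
   local config = {learningRate = 2e-3, beta1 = 0.9, beta2 = 0.999}
   local state = {}
   optim.adamax(feval, params, config, state)
--]]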
function optim.adamax(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 0.002 | |||
local beta1 = config.beta1 or 0.9 | |||
local beta2 = config.beta2 or 0.999 | |||
local epsilon = config.epsilon or 1e-38 | |||
local wd = config.weightDecay or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- Initialization | |||
state.t = state.t or 0 | |||
-- Exponential moving average of gradient values | |||
state.m = state.m or x.new(dfdx:size()):zero() | |||
-- Exponential moving average of the infinity norm | |||
state.u = state.u or x.new(dfdx:size()):zero() | |||
-- A tmp tensor to hold the input to max() | |||
state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() | |||
state.t = state.t + 1 | |||
-- Update biased first moment estimate. | |||
state.m:mul(beta1):add(1-beta1, dfdx) | |||
-- Update the exponentially weighted infinity norm. | |||
state.max[1]:copy(state.u):mul(beta2) | |||
state.max[2]:copy(dfdx):abs():add(epsilon) | |||
state.u:max(state.max, 1) | |||
local biasCorrection1 = 1 - beta1^state.t | |||
local stepSize = lr/biasCorrection1 | |||
-- (2) update x | |||
x:addcdiv(-stepSize, state.m, state.u) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
@@ -0,0 +1,73 @@ | |||
--[[ An implementation of ASGD | |||
ASGD: | |||
x := (1 - lambda eta_t) x - eta_t df/dx(z,x) | |||
a := a + mu_t [ x - a ] | |||
eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 | |||
mu_t = 1/max(1,t-t0) | |||
implements the ASGD algorithm as in L. Bottou's sgd-2.0
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.eta0` : learning rate | |||
- `state.lambda` : decay term | |||
- `state.alpha` : power for eta update | |||
- `state.t0` : point at which to start averaging | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
- `ax` : the averaged x vector | |||
(Clement Farabet, 2012) | |||
--]] | |||
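--[[ Illustrative sketch ('feval' and 'params' are placeholders). Note the third
return value: the averaged iterate ax, which is usually what you evaluate with
once averaging has started (t > t0):
   local state = {eta0 = 1e-2, lambda = 1e-4, t0 = 1e4}
   local newx, fs, ax = optim.asgd(feval, params, state)
--]]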
function optim.asgd(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
config.eta0 = config.eta0 or 1e-4 | |||
config.lambda = config.lambda or 1e-4 | |||
config.alpha = config.alpha or 0.75 | |||
config.t0 = config.t0 or 1e6 | |||
-- (hidden state) | |||
state.eta_t = state.eta_t or config.eta0 | |||
state.mu_t = state.mu_t or 1 | |||
state.t = state.t or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) decay term | |||
x:mul(1 - config.lambda*state.eta_t) | |||
-- (3) update x | |||
x:add(-state.eta_t, dfdx) | |||
-- (4) averaging | |||
state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero() | |||
state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax) | |||
if state.mu_t ~= 1 then | |||
state.tmp:copy(x) | |||
state.tmp:add(-1,state.ax):mul(state.mu_t) | |||
state.ax:add(state.tmp) | |||
else | |||
state.ax:copy(x) | |||
end | |||
-- (5) update eta_t and mu_t | |||
state.t = state.t + 1 | |||
state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha) | |||
state.mu_t = 1 / math.max(1, state.t - config.t0) | |||
-- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) | |||
return x,{fx},state.ax | |||
end |
@@ -0,0 +1,208 @@ | |||
--[[ | |||
This cg implementation is a rewrite of minimize.m written by Carl | |||
E. Rasmussen. It is supposed to produce exactly the same results (give
or take numerical accuracy due to some changed order of
operations). You can compare the result on rosenbrock with minimize.m.
http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html | |||
[x fx c] = minimize([0 0]', 'rosenbrock', -25) | |||
Note that we limit only the number of function evaluations, since this seems much
more important in practical use.
ARGS: | |||
- `opfunc` : a function that takes a single input, the point of evaluation. | |||
- `x` : the initial point | |||
- `state` : a table of parameters and temporary allocations. | |||
- `state.maxEval` : max number of function evaluations | |||
- `state.maxIter` : max number of iterations | |||
- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage | |||
- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage | |||
RETURN: | |||
- `x*` : the new x vector, at the optimal point | |||
- `f` : a table of all function values where | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at x* | |||
(Koray Kavukcuoglu, 2012) | |||
--]] | |||
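--[[ Illustrative sketch: cg expects a deterministic (full-batch) opfunc and is
driven mainly by maxEval. 'rosenbrock' is a placeholder for any function that
returns f(x) and df/dx:
   local x = torch.Tensor(2):zero()
   local state = {maxEval = 25}
   local xstar, fx, neval = optim.cg(rosenbrock, x, state)
   print('initial f = ' .. fx[1] .. ', final f = ' .. fx[#fx])
--]]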
function optim.cg(opfunc, x, config, state) | |||
-- parameters | |||
local config = config or {} | |||
local state = state or config | |||
local rho = config.rho or 0.01 | |||
local sig = config.sig or 0.5 | |||
local int = config.int or 0.1 | |||
local ext = config.ext or 3.0 | |||
local maxIter = config.maxIter or 20 | |||
local ratio = config.ratio or 100 | |||
local maxEval = config.maxEval or maxIter*1.25 | |||
local red = 1 | |||
local verbose = config.verbose or 0 | |||
local i = 0 | |||
local ls_failed = 0 | |||
local fx = {} | |||
-- we need three points for the interpolation/extrapolation stuff | |||
local z1,z2,z3 = 0,0,0 | |||
local d1,d2,d3 = 0,0,0 | |||
local f1,f2,f3 = 0,0,0 | |||
local df1 = state.df1 or x.new() | |||
local df2 = state.df2 or x.new() | |||
local df3 = state.df3 or x.new() | |||
local tdf | |||
df1:resizeAs(x) | |||
df2:resizeAs(x) | |||
df3:resizeAs(x) | |||
-- search direction | |||
local s = state.s or x.new() | |||
s:resizeAs(x) | |||
-- we need a temp storage for X | |||
local x0 = state.x0 or x.new() | |||
local f0 = 0 | |||
local df0 = state.df0 or x.new() | |||
x0:resizeAs(x) | |||
df0:resizeAs(x) | |||
-- evaluate at initial point | |||
f1,tdf = opfunc(x) | |||
fx[#fx+1] = f1 | |||
df1:copy(tdf) | |||
i=i+1 | |||
-- initial search direction | |||
s:copy(df1):mul(-1) | |||
d1 = -s:dot(s ) -- slope | |||
z1 = red/(1-d1) -- initial step | |||
while i < math.abs(maxEval) do | |||
x0:copy(x) | |||
f0 = f1 | |||
df0:copy(df1) | |||
x:add(z1,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
d2 = df2:dot(s) | |||
f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1 | |||
local m = math.min(maxIter,maxEval-i) | |||
local success = 0 | |||
local limit = -1 | |||
while true do | |||
while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do | |||
limit = z1 | |||
if f2 > f1 then | |||
z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) | |||
else | |||
local A = 6*(f2-f3)/z3+3*(d2+d3) | |||
local B = 3*(f3-f2)-z3*(d3+2*d2) | |||
z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A | |||
end | |||
if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then | |||
z2 = z3/2; | |||
end | |||
z2 = math.max(math.min(z2, int*z3),(1-int)*z3); | |||
z1 = z1 + z2; | |||
x:add(z2,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
m = m - 1 | |||
d2 = df2:dot(s) | |||
z3 = z3-z2; | |||
end | |||
if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then | |||
break | |||
elseif d2 > sig*d1 then | |||
success = 1; | |||
break; | |||
elseif m == 0 then | |||
break; | |||
end | |||
local A = 6*(f2-f3)/z3+3*(d2+d3); | |||
local B = 3*(f3-f2)-z3*(d3+2*d2); | |||
z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3)) | |||
if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then | |||
if limit < -0.5 then | |||
z2 = z1 * (ext -1) | |||
else | |||
z2 = (limit-z1)/2 | |||
end | |||
elseif (limit > -0.5) and (z2+z1) > limit then | |||
z2 = (limit-z1)/2 | |||
elseif limit < -0.5 and (z2+z1) > z1*ext then | |||
z2 = z1*(ext-1) | |||
elseif z2 < -z3*int then | |||
z2 = -z3*int | |||
elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then | |||
z2 = (limit-z1)*(1-int) | |||
end | |||
f3=f2; d3=d2; z3=-z2; | |||
z1 = z1+z2; | |||
x:add(z2,s) | |||
f2,tdf = opfunc(x) | |||
df2:copy(tdf) | |||
i=i+1 | |||
m = m - 1 | |||
d2 = df2:dot(s) | |||
end | |||
if success == 1 then | |||
f1 = f2 | |||
fx[#fx+1] = f1; | |||
local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1) | |||
s:mul(ss) | |||
s:add(-1,df2) | |||
local tmp = df1:clone() | |||
df1:copy(df2) | |||
df2:copy(tmp) | |||
d2 = df1:dot(s) | |||
if d2> 0 then | |||
s:copy(df1) | |||
s:mul(-1) | |||
d2 = -s:dot(s) | |||
end | |||
z1 = z1 * math.min(ratio, d1/(d2-1e-320)) | |||
d1 = d2 | |||
ls_failed = 0 | |||
else | |||
x:copy(x0) | |||
f1 = f0 | |||
df1:copy(df0) | |||
if ls_failed or i>maxEval then | |||
break | |||
end | |||
local tmp = df1:clone() | |||
df1:copy(df2) | |||
df2:copy(tmp) | |||
s:copy(df1) | |||
s:mul(-1) | |||
d1 = -s:dot(s) | |||
z1 = 1/(1-d1) | |||
ls_failed = 1 | |||
end | |||
end | |||
state.df0 = df0 | |||
state.df1 = df1 | |||
state.df2 = df2 | |||
state.df3 = df3 | |||
state.x0 = x0 | |||
state.s = s | |||
return x,fx,i | |||
end |
@@ -0,0 +1,52 @@ | |||
--[[ An implementation of a simple numerical gradient checker. | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `eps` : the epsilon to use for the numerical check (default is 1e-7) | |||
RETURN: | |||
- `diff` : relative error of the gradient; should be very close to 0
- `dC` : exact gradient at the point
- `dC_est` : numerically estimated gradient at the point
]]-- | |||
-- numerically check the gradient of an arbitrary objective function:
function optim.checkgrad(opfunc, x, eps) | |||
-- compute true gradient: | |||
local Corg,dC = opfunc(x) | |||
dC:resize(x:size()) | |||
local Ctmp -- temporary value | |||
local isTensor = torch.isTensor(Corg) | |||
if isTensor then | |||
Ctmp = Corg.new(Corg:size()) | |||
end | |||
-- compute numeric approximations to gradient: | |||
local eps = eps or 1e-7 | |||
local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC) | |||
for i = 1,dC:size(1) do | |||
local tmp = x[i] | |||
x[i] = x[i] + eps | |||
local C1 = opfunc(x) | |||
if isTensor then | |||
Ctmp:copy(C1) | |||
C1 = Ctmp | |||
end | |||
x[i] = x[i] - 2 * eps | |||
local C2 = opfunc(x) | |||
x[i] = tmp | |||
dC_est[i] = (C1 - C2) / (2 * eps) | |||
end | |||
-- estimate error of gradient: | |||
local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est) | |||
return diff,dC,dC_est | |||
end |
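--[[ Illustrative sketch on a toy objective (placeholder names): f(x) = 0.5*||x||^2
has gradient x, so the reported relative error should be tiny:
   local feval = function(x) return 0.5 * x:dot(x), x:clone() end
   local x = torch.randn(5)
   local err, dC, dC_est = optim.checkgrad(feval, x, 1e-6)
   print('relative gradient error: ' .. err)
--]]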
@@ -0,0 +1,270 @@ | |||
require 'torch' | |||
require 'math' | |||
local BestSolution = {} | |||
--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), | |||
ported from https://www.lri.fr/~hansen/barecmaes2.html. | |||
Parameters | |||
---------- | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX. Note that df/dX is not used | |||
- `x` : the initial point | |||
- `state.sigma` | |||
float, initial step-size (standard deviation in each | |||
coordinate) | |||
- `state.maxEval` | |||
int, maximal number of function evaluations | |||
- `state.ftarget` | |||
float, target function value | |||
- `state.popsize` | |||
population size. If this is left empty, | |||
4 + int(3 * log(|x|)) will be used | |||
- `state.ftarget` | |||
stop if fitness < ftarget | |||
- `state.verb_disp` | |||
int, display on console every verb_disp iteration, 0 for never | |||
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
--]] | |||
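--[[ Illustrative sketch (placeholder objective): CMA-ES uses only function values,
so opfunc need not return a gradient. sigma must be supplied; verb_disp = 0
keeps the console quiet:
   local x = torch.DoubleTensor(5):fill(1)
   local state = {sigma = 0.5, maxEval = 5000, verb_disp = 0}
   local xstar, fhist = optim.cmaes(function(x) return x:dot(x) end, x, state)
--]]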
function optim.cmaes(opfunc, x, config, state) | |||
if (x.triu == nil or x.diag == nil) then | |||
error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x") | |||
end | |||
-- process input parameters | |||
local config = config or {} | |||
local state = state or config | |||
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy | |||
local N = xmean:size(1) -- number of objective variables/problem dimension | |||
local sigma = state.sigma -- coordinate wise standard deviation (step size) | |||
local ftarget = state.ftarget -- stop if fitness < ftarget | |||
local maxEval = tonumber(state.maxEval) or 1e3*N^2 | |||
local objfunc = opfunc | |||
local verb_disp = state.verb_disp -- display step size | |||
local min_iterations = state.min_iterations or 1 | |||
local lambda = state.popsize -- population size, offspring number | |||
-- Strategy parameter setting: Selection | |||
if state.popsize == nil then | |||
lambda = 4 + math.floor(3 * math.log(N)) | |||
end | |||
local mu = lambda / 2 -- number of parents/points for recombination | |||
local weights = torch.range(0,mu-1):apply(function(i) | |||
return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights | |||
weights:div(weights:sum()) -- normalize recombination weights array | |||
local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i | |||
weights = weights:typeAs(x) | |||
-- Strategy parameter setting: Adaptation | |||
local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C | |||
local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control | |||
local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C | |||
local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update | |||
local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 | |||
-- Initialize dynamic (internal) state variables | |||
local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C | |||
local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma | |||
local B = torch.eye(N):typeAs(x) -- B defines the coordinate system | |||
local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling | |||
local C = torch.eye(N):typeAs(x) -- covariance matrix | |||
if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig | |||
error('torch.symeig not available for ' .. x:type() .. | |||
" please use Float- or DoubleTensor for x") | |||
end | |||
local candidates = torch.Tensor(lambda,N):typeAs(x) | |||
local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 | |||
local eigeneval = 0 -- tracking the update of B and D | |||
local counteval = 0 | |||
local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination | |||
local fitvals = torch.Tensor(lambda)-- fitness values | |||
local best = BestSolution.new(nil,nil,counteval) | |||
local iteration = 0 -- iteration of the optimize loop | |||
local function ask() | |||
--[[return a list of lambda candidate solutions according to | |||
m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) | |||
--]] | |||
-- Eigendecomposition: first update B, D and invsqrtC from C | |||
-- postponed here so that the amortized cost stays O(N^2)
if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then | |||
eigeneval = counteval | |||
C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry | |||
D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3) | |||
D = torch.sqrt(D) -- D contains standard deviations now | |||
invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t()) | |||
end | |||
for k=1,lambda do --repeat lambda times | |||
local z = D:clone():normal(0,1):cmul(D) | |||
candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma) | |||
end | |||
return candidates | |||
end | |||
local function tell(arx) | |||
--[[update the evolution paths and the distribution parameters m, | |||
sigma, and C within CMA-ES. | |||
Parameters | |||
---------- | |||
`arx` | |||
a list of solutions, presumably from `ask()` | |||
`fitvals` | |||
the corresponding objective function values --]] | |||
-- bookkeeping, preparation | |||
counteval = counteval + lambda -- slightly artificial to do here | |||
local xold = xmean:clone() | |||
-- Sort by fitness and compute weighted mean into xmean | |||
local arindex = nil --sorted indices | |||
fitvals, arindex = torch.sort(fitvals) | |||
arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions | |||
table.insert(f_hist, fitvals[1]) --append best fitness to history | |||
best:update(arx[1], fitvals[1], counteval) | |||
xmean:zero() | |||
xmean:addmv(arx:t(), weights) --dot product | |||
-- Cumulation: update evolution paths | |||
local y = xmean - xold | |||
local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold) | |||
local c = (cs * (2-cs) * mueff)^0.5 / sigma | |||
ps = ps - ps * cs + z * c -- exponential decay on ps | |||
local hsig = (torch.sum(torch.pow(ps,2)) / | |||
(1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) | |||
hsig = hsig and 1.0 or 0.0 --use binary numbers | |||
c = (cc * (2-cc) * mueff)^0.5 / sigma | |||
pc = pc - pc * cc + y * c * hsig -- exponential decay on pc | |||
-- Adapt covariance matrix C | |||
local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc) | |||
-- for a minor adjustment to the variance loss by hsig | |||
for i=1,N do | |||
for j=1,N do | |||
local r = torch.range(1,mu) | |||
r:apply(function(k) | |||
return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) | |||
local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update | |||
C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + | |||
c1 * pc[i]*pc[j] + cmu * Cmuij) | |||
end | |||
end | |||
-- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 | |||
sigma = sigma * math.exp(math.min(0.6, | |||
(cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) | |||
end | |||
local function stop() | |||
--[[return satisfied termination conditions in a table like | |||
{'termination reason':value, ...}, for example {'tolfun':1e-12}, | |||
or the empty table {}--]] | |||
local res = {} | |||
if counteval > 0 then | |||
if counteval >= maxEval then | |||
res['evals'] = maxEval | |||
end | |||
if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then | |||
res['ftarget'] = ftarget | |||
end | |||
if torch.max(D) > 1e7 * torch.min(D) then | |||
res['condition'] = 1e7 | |||
end | |||
if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then | |||
res['tolfun'] = 1e-12 | |||
end | |||
if sigma * torch.max(D) < 1e-11 then | |||
-- remark: max(D) >= max(diag(C))^0.5 | |||
res['tolx'] = 1e-11 | |||
end | |||
end | |||
return res | |||
end | |||
local function disp(verb_modulo) | |||
--[[display some iteration info--]] | |||
if verb_disp == 0 then | |||
return nil | |||
end | |||
local iteration = counteval / lambda | |||
if iteration == 1 or iteration % (10*verb_modulo) == 0 then | |||
print('evals:\t ax-ratio max(std) f-value') | |||
end | |||
if iteration <= 2 or iteration % verb_modulo == 0 then | |||
local max_std = math.sqrt(torch.max(torch.diag(C))) | |||
print(tostring(counteval).. ': ' .. | |||
string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) | |||
.. tostring(fitvals[1])) | |||
end | |||
return nil | |||
end | |||
while next(stop()) == nil or iteration < min_iterations do | |||
iteration = iteration + 1 | |||
local X = ask() -- deliver candidate solutions | |||
for i=1, lambda do | |||
-- put candidate tensor back in input shape and evaluate in opfunc | |||
local candidate = X[i]:viewAs(x) | |||
fitvals[i] = objfunc(candidate) | |||
end | |||
tell(X) | |||
disp(verb_disp) | |||
end | |||
local bestmu, f, c = best:get() | |||
if verb_disp > 0 then | |||
for k, v in pairs(stop()) do | |||
print('termination by', k, '=', v) | |||
end | |||
print('best f-value =', f) | |||
print('solution = ') | |||
print(bestmu) | |||
print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration) | |||
end | |||
table.insert(f_hist, f) | |||
return bestmu, f_hist, counteval | |||
end | |||
BestSolution.__index = BestSolution | |||
function BestSolution.new(x, f, evals) | |||
local self = setmetatable({}, BestSolution) | |||
self.x = x | |||
self.f = f | |||
self.evals = evals | |||
return self | |||
end | |||
function BestSolution.update(self, arx, arf, evals) | |||
--[[update the best solution with `arx`, `arf`, and `evals` if it improves on the
current one. Better solutions have smaller `f`-values.--]]
if self.f == nil or arf < self.f then | |||
self.x = arx:clone() | |||
self.f = arf | |||
self.evals = evals | |||
end | |||
return self | |||
end | |||
function BestSolution.get(self) | |||
return self.x, self.f, self.evals | |||
end |
@@ -0,0 +1,109 @@ | |||
--[[ An implementation of `DE` (Differential Evolution), | |||
ARGS: | |||
-`opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX. Note that df/dX is not used | |||
-`x` : the initial point | |||
-`state.popsize`: population size. If this is left empty, 10*d will be used | |||
-`state.scaleFactor`: float, usually between 0.4 and 1 | |||
-`state.crossoverRate`: float, usually between 0.1 and 0.9 | |||
-`state.maxFEs`: int, maximal number of function evaluations
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
]] | |||
require 'torch' | |||
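--[[ Illustrative sketch (placeholder objective): DE also uses only function values.
scaleFactor, crossoverRate and maxFEs have no defaults and must be supplied:
   local x = torch.DoubleTensor(10):zero()
   local config = {popsize = 50, scaleFactor = 0.5, crossoverRate = 0.9, maxFEs = 20000}
   local xstar, fx = optim.de(function(x) return x:dot(x) end, x, config)
--]]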
function optim.de(opfunc, x, config, state) | |||
-- process input parameters | |||
local config = config or {} | |||
local state = state | |||
local popsize = config.popsize -- population size | |||
local scaleFactor = config.scaleFactor -- scale factor | |||
local crossoverRate = config.crossoverRate -- crossover rate | |||
local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations | |||
local maxRegion = config.maxRegion -- upper bound of search region | |||
local minRegion = config.minRegion -- lower bound of search region | |||
local xmean = x:clone():view(-1) -- distribution mean, a flattened copy | |||
local D = xmean:size(1) -- number of objective variables/problem dimension | |||
if config.popsize == nil then | |||
popsize = 10 * D | |||
end | |||
if config.maxRegion == nil then | |||
maxRegion = 30 | |||
end | |||
if config.minRegion == nil then | |||
minRegion = -30 | |||
end | |||
-- Initialize population | |||
local fx = x.new(maxFEs) | |||
local pop = x.new(popsize, D) | |||
local children = x.new(popsize, D) | |||
local fitness = x.new(popsize) | |||
local children_fitness = x.new(popsize) | |||
local fes = 1 -- number of function evaluations | |||
local best_fitness | |||
local best_solution = x.new(D) | |||
-- Initialize the population and evaluate its fitness values
local gen = torch.Generator() | |||
torch.manualSeed(gen, 1) | |||
pop:uniform(gen, minRegion, maxRegion) | |||
for i = 1, popsize do | |||
fitness[i] = opfunc(pop[i]) | |||
fx[fes] = fitness[i] | |||
fes = fes + 1 | |||
end | |||
-- Find the best (lowest-fitness) solution in the initial population
local index
best_fitness, index = fitness:min(1)
best_fitness = best_fitness[1] | |||
index = index[1] | |||
best_solution:copy(pop[index]) | |||
-- Main loop | |||
while fes < maxFEs do | |||
local r1, r2 | |||
for i = 1, popsize do | |||
repeat | |||
r1 = torch.random(gen, 1, popsize) | |||
until(r1 ~= i) | |||
repeat | |||
r2 = torch.random(gen, 1, popsize) | |||
until(r2 ~= r1 and r2 ~= i) | |||
local jrand = torch.random(gen, 1, D) | |||
for j = 1, D do | |||
if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then | |||
children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j]) | |||
else | |||
children[i][j] = pop[i][j] | |||
end | |||
end | |||
children_fitness[i] = opfunc(children[i]) | |||
fx[fes] = children_fitness[i] | |||
fes = fes + 1 | |||
end | |||
for i = 1, popsize do | |||
if children_fitness[i] <= fitness[i] then | |||
pop[i]:copy(children[i]) | |||
fitness[i] = children_fitness[i] | |||
if fitness[i] < best_fitness then | |||
best_fitness = fitness[i] | |||
best_solution:copy(children[i]) | |||
end | |||
end | |||
end | |||
end | |||
return best_solution, fx | |||
end |
@@ -0,0 +1,192 @@ | |||
--[[ FISTA with backtracking line search | |||
- `f` : smooth function | |||
- `g` : non-smooth function | |||
- `pl` : minimizer of intermediate problem Q(x,y) | |||
- `xinit` : initial point | |||
- `params` : table of parameters (**optional**) | |||
- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1) | |||
- `params.Lstep` : step size multiplier at each iteration (1.5) | |||
- `params.maxiter` : max number of iterations (50) | |||
- `params.maxline` : max number of line search iterations per iteration (20) | |||
- `params.errthres`: Error threshold for convergence check (1e-4)
- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true) | |||
- `params.verbose` : store each iteration solution and print detailed info (false) | |||
On output, `params` will contain these additional fields that can be reused. | |||
- `params.L` : last used L value will be written. | |||
These are temporary storages needed by the algorithm; if the same params object is
passed a second time, the same storages will be reused without new allocation.
- `params.xkm` : previous iteration point
- `params.y` : fista iteration | |||
- `params.ply` : ply = pl(y - 1/L grad(f)) | |||
Returns the solution x and a history of {function evals, number of line search steps, ...}
Algorithm is published in | |||
@article{beck-fista-09, | |||
Author = {Beck, Amir and Teboulle, Marc}, | |||
Journal = {SIAM J. Img. Sci.}, | |||
Number = {1}, | |||
Pages = {183--202}, | |||
Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems}, | |||
Volume = {2}, | |||
Year = {2009}} | |||
]] | |||
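--[[ Illustrative sketch of the three callbacks for an l1-regularized least-squares
problem (A, b and lambda are placeholders). f is the smooth term and must return
(f, df/dx) when called with a second argument 'dx'; g is the non-smooth term;
pl modifies its first argument in place into the proximal (soft-thresholded) point:
   local function f(x, mode)
      local r = A * x - b
      local fx = 0.5 * r:dot(r)
      if mode == 'dx' then return fx, A:t() * r end
      return fx
   end
   local function g(x) return lambda * x:norm(1) end
   local function pl(x, L)
      local th = lambda / L
      x:apply(function(v)
         if v > th then return v - th
         elseif v < -th then return v + th
         else return 0 end
      end)
   end
   local xinit = torch.zeros(A:size(2))
   local xstar, history = optim.FistaLS(f, g, pl, xinit,
      {maxiter = 100, doFistaUpdate = true})
--]]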
function optim.FistaLS(f, g, pl, xinit, params) | |||
local params = params or {} | |||
local L = params.L or 0.1 | |||
local Lstep = params.Lstep or 1.5 | |||
local maxiter = params.maxiter or 50 | |||
local maxline = params.maxline or 20 | |||
local errthres = params.errthres or 1e-4 | |||
local doFistaUpdate = params.doFistaUpdate | |||
local verbose = params.verbose | |||
-- temporary allocations | |||
params.xkm = params.xkm or torch.Tensor() | |||
params.y = params.y or torch.Tensor() | |||
params.ply = params.ply or torch.Tensor() | |||
local xkm = params.xkm -- previous iteration | |||
local y = params.y -- fista iteration | |||
local ply = params.ply -- soft shrinked y | |||
-- we start from all zeros | |||
local xk = xinit | |||
xkm:resizeAs(xk):zero() | |||
ply:resizeAs(xk):zero() | |||
y:resizeAs(xk):zero() | |||
local history = {} -- keep track of stuff | |||
local niter = 0 -- number of iterations done | |||
local converged = false -- are we done? | |||
local tk = 1 -- momentum param for FISTA | |||
local tkp = 0 | |||
local gy = g(y) | |||
local fval = math.huge -- fval = f+g | |||
while not converged and niter < maxiter do | |||
-- run through smooth function (code is input, input is target) | |||
-- get derivatives from smooth function | |||
local fy,gfy = f(y,'dx') | |||
--local gfy = f(y) | |||
local fply = 0 | |||
local gply = 0 | |||
local Q = 0 | |||
---------------------------------------------- | |||
-- do line search to find new current location starting from fista loc | |||
local nline = 0 | |||
local linesearchdone = false | |||
while not linesearchdone do | |||
-- take a step in gradient direction of smooth function | |||
ply:copy(y) | |||
ply:add(-1/L,gfy) | |||
-- and solve for minimum of auxiliary problem | |||
pl(ply,L) | |||
-- this is candidate for new current iteration | |||
xk:copy(ply) | |||
-- evaluate this point F(ply) | |||
fply = f(ply) | |||
-- ply - y | |||
ply:add(-1, y) | |||
-- <ply-y , \Grad(f(y))> | |||
local Q2 = gfy:dot(ply) | |||
-- L/2 ||beta-y||^2 | |||
local Q3 = L/2 * ply:dot(ply) | |||
-- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta) | |||
Q = fy + Q2 + Q3 | |||
if verbose then | |||
print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3)) | |||
end | |||
-- check if F(beta) < Q(pl(y),\t) | |||
if fply <= Q then --and Fply + Gply <= F then | |||
-- now evaluate G here | |||
linesearchdone = true | |||
elseif nline >= maxline then | |||
linesearchdone = true | |||
xk:copy(xkm) -- if we can't find a better point, current iter = previous iter | |||
--print('oops') | |||
else | |||
L = L * Lstep | |||
end | |||
nline = nline + 1 | |||
end | |||
-- end line search | |||
--------------------------------------------- | |||
--------------------------------------------- | |||
-- FISTA | |||
--------------------------------------------- | |||
if doFistaUpdate then | |||
-- do the FISTA step | |||
tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2 | |||
-- x(k-1) = x(k-1) - x(k) | |||
xkm:add(-1,xk) | |||
-- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k)) | |||
y:copy(xk) | |||
y:add( (1-tk)/tkp , xkm) | |||
-- store for next iterations | |||
-- x(k-1) = x(k) | |||
xkm:copy(xk) | |||
else | |||
y:copy(xk) | |||
end | |||
-- t(k) = t(k+1) | |||
tk = tkp | |||
fply = f(y) | |||
gply = g(y) | |||
if verbose then | |||
print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply)) | |||
end | |||
niter = niter + 1 | |||
-- bookkeeping
fval = fply + gply | |||
history[niter] = {} | |||
history[niter].nline = nline | |||
history[niter].L = L | |||
history[niter].F = fval | |||
history[niter].Fply = fply | |||
history[niter].Gply = gply | |||
history[niter].Q = Q | |||
params.L = L | |||
if verbose then | |||
history[niter].xk = xk:clone() | |||
history[niter].y = y:clone() | |||
end | |||
-- are we done? | |||
if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then | |||
converged = true | |||
xinit:copy(y) | |||
return y,history | |||
end | |||
if niter >= maxiter then | |||
xinit:copy(y) | |||
return y,history | |||
end | |||
--if niter > 1 and history[niter].F > history[niter-1].F then | |||
--print(niter, 'This was supposed to be a convex function, we are going up') | |||
--converged = true | |||
--return xk,history | |||
--end | |||
end | |||
error('not supposed to be here') | |||
end | |||
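--[[ Hypothetical usage sketch (not part of the original file): a tiny
lasso-style problem, assuming the package has been loaded with
require 'optim'. f(x) = 0.5*||Ax - b||^2 is the smooth part, g(x) =
lambda*||x||_1 the non-smooth part, and pl the soft-thresholding
proximal operator. All names and data below are made up for illustration.

   local A = torch.randn(20, 10)
   local b = torch.randn(20)
   local lambda = 0.1
   local function f(x, mode)
      local r = torch.mv(A, x) - b
      local fx = 0.5 * r:dot(r)
      if mode == 'dx' then
         return fx, torch.mv(A:t(), r)     -- value and gradient of the smooth part
      end
      return fx
   end
   local function g(x) return lambda * x:norm(1) end
   local function pl(x, L)                 -- in-place soft-threshold at lambda/L
      local th = lambda / L
      x:apply(function(v)
         if v > th then return v - th
         elseif v < -th then return v + th
         else return 0 end
      end)
   end
   local x0 = torch.zeros(10)
   local xsol, history = optim.FistaLS(f, g, pl, x0, {maxiter = 100, doFistaUpdate = true})
]]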
@@ -0,0 +1,33 @@ | |||
require 'torch' | |||
optim = {} | |||
-- optimization algorithms
require('optim.sgd') | |||
require('optim.cg') | |||
require('optim.asgd') | |||
require('optim.nag') | |||
require('optim.fista') | |||
require('optim.lbfgs') | |||
require('optim.adagrad') | |||
require('optim.rprop') | |||
require('optim.adam') | |||
require('optim.adamax') | |||
require('optim.rmsprop') | |||
require('optim.adadelta') | |||
require('optim.cmaes') | |||
require('optim.de') | |||
-- line search functions | |||
require('optim.lswolfe') | |||
-- helpers | |||
require('optim.polyinterp') | |||
require('optim.checkgrad') | |||
-- tools | |||
require('optim.ConfusionMatrix') | |||
require('optim.Logger') | |||
return optim |
@@ -0,0 +1,268 @@ | |||
--[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) | |||
This implementation of L-BFGS relies on a user-provided line | |||
search function (state.lineSearch). If this function is not | |||
provided, then a simple learningRate is used to produce fixed | |||
size steps. Fixed size steps are much less costly than line | |||
searches, and can be useful for stochastic problems. | |||
The learning rate is used even when a line search is provided. | |||
This is also useful for large-scale stochastic problems, where | |||
opfunc is a noisy approximation of f(x). In that case, the learning | |||
rate allows a reduction of confidence in the step size. | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.maxIter` : Maximum number of iterations allowed | |||
- `state.maxEval` : Maximum number of function evaluations | |||
- `state.tolFun` : Termination tolerance on the first-order optimality | |||
- `state.tolX` : Termination tol on progress in terms of func/param changes | |||
- `state.lineSearch` : A line search function | |||
- `state.learningRate` : If no line search is provided, then a fixed step size is used
- `state.nCorrection` : number of correction pairs kept in the L-BFGS memory (default 100)
RETURN: | |||
- `x*` : the new `x` vector, at the optimal point | |||
- `f` : a table of all function values: | |||
`f[1]` is the value of the function before any optimization and | |||
`f[#f]` is the final fully optimized value, at `x*` | |||
(Clement Farabet, 2012) | |||
]] | |||
function optim.lbfgs(opfunc, x, config, state) | |||
-- get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local maxIter = tonumber(config.maxIter) or 20 | |||
local maxEval = tonumber(config.maxEval) or maxIter*1.25 | |||
local tolFun = config.tolFun or 1e-5 | |||
local tolX = config.tolX or 1e-9 | |||
local nCorrection = config.nCorrection or 100 | |||
local lineSearch = config.lineSearch | |||
local lineSearchOpts = config.lineSearchOptions | |||
local learningRate = config.learningRate or 1 | |||
local isverbose = config.verbose or false | |||
state.funcEval = state.funcEval or 0 | |||
state.nIter = state.nIter or 0 | |||
-- verbose function | |||
local verbose | |||
if isverbose then | |||
verbose = function(...) print('<optim.lbfgs> ', ...) end | |||
else | |||
verbose = function() end | |||
end | |||
-- import some functions | |||
local abs = math.abs | |||
local min = math.min | |||
-- evaluate initial f(x) and df/dx | |||
local f,g = opfunc(x) | |||
local f_hist = {f} | |||
local currentFuncEval = 1 | |||
state.funcEval = state.funcEval + 1 | |||
local p = g:size(1) | |||
-- check optimality of initial point | |||
state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1 | |||
tmp1:copy(g):abs() | |||
if tmp1:sum() <= tolFun then | |||
-- optimality condition below tolFun | |||
verbose('optimality condition below tolFun') | |||
return x,f_hist | |||
end | |||
if not state.dir_bufs then | |||
-- reusable buffers for y's and s's, and their histories | |||
verbose('creating recyclable direction/step/history buffers') | |||
state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1) | |||
state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1) | |||
for i=1,#state.dir_bufs do | |||
state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1) | |||
state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1) | |||
end | |||
end | |||
-- variables cached in state (for tracing) | |||
local d = state.d | |||
local t = state.t | |||
local old_dirs = state.old_dirs | |||
local old_stps = state.old_stps | |||
local Hdiag = state.Hdiag | |||
local g_old = state.g_old | |||
local f_old = state.f_old | |||
-- optimize for a max of maxIter iterations | |||
local nIter = 0 | |||
while nIter < maxIter do | |||
-- keep track of nb of iterations | |||
nIter = nIter + 1 | |||
state.nIter = state.nIter + 1 | |||
------------------------------------------------------------ | |||
-- compute gradient descent direction | |||
------------------------------------------------------------ | |||
if state.nIter == 1 then | |||
d = g:clone():mul(-1) -- -g | |||
old_dirs = {} | |||
old_stps = {} | |||
Hdiag = 1 | |||
else | |||
-- do lbfgs update (update memory) | |||
local y = table.remove(state.dir_bufs) -- pop | |||
local s = table.remove(state.stp_bufs) | |||
y:add(g, -1, g_old) -- g - g_old | |||
s:mul(d, t) -- d*t | |||
local ys = y:dot(s) -- y*s | |||
if ys > 1e-10 then | |||
-- updating memory | |||
if #old_dirs == nCorrection then | |||
-- shift history by one (limited-memory) | |||
local removed1 = table.remove(old_dirs, 1) | |||
local removed2 = table.remove(old_stps, 1) | |||
table.insert(state.dir_bufs, removed1) | |||
table.insert(state.stp_bufs, removed2) | |||
end | |||
-- store new direction/step | |||
table.insert(old_dirs, s) | |||
table.insert(old_stps, y) | |||
-- update scale of initial Hessian approximation | |||
Hdiag = ys / y:dot(y) -- (y*y) | |||
else | |||
-- put y and s back into the buffer pool | |||
table.insert(state.dir_bufs, y) | |||
table.insert(state.stp_bufs, s) | |||
end | |||
-- compute the approximate (L-BFGS) inverse Hessian | |||
-- multiplied by the gradient | |||
local k = #old_dirs | |||
-- need to be accessed element-by-element, so don't re-type tensor: | |||
state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro | |||
for i = 1,k do | |||
ro[i] = 1 / old_stps[i]:dot(old_dirs[i]) | |||
end | |||
-- iteration in L-BFGS loop collapsed to use just one buffer | |||
local q = tmp1 -- reuse tmp1 for the q buffer | |||
-- need to be accessed element-by-element, so don't re-type tensor: | |||
state.al = state.al or torch.zeros(nCorrection); local al = state.al
q:mul(g, -1) -- -g | |||
for i = k,1,-1 do | |||
al[i] = old_dirs[i]:dot(q) * ro[i] | |||
q:add(-al[i], old_stps[i]) | |||
end | |||
-- multiply by initial Hessian | |||
local r = d -- share the same buffer, since we don't need the old d
r:mul(q, Hdiag) -- q[1] * Hdiag | |||
for i = 1,k do | |||
local be_i = old_stps[i]:dot(r) * ro[i] | |||
r:add(al[i]-be_i, old_dirs[i]) | |||
end | |||
-- final direction is in r/d (same object) | |||
end | |||
g_old = g_old or g:clone() | |||
g_old:copy(g) | |||
f_old = f | |||
------------------------------------------------------------ | |||
-- compute step length | |||
------------------------------------------------------------ | |||
-- directional derivative | |||
local gtd = g:dot(d) -- g * d | |||
-- check that progress can be made along that direction | |||
if gtd > -tolX then | |||
break | |||
end | |||
-- reset initial guess for step size | |||
if state.nIter == 1 then | |||
tmp1:copy(g):abs() | |||
t = min(1,1/tmp1:sum()) * learningRate | |||
else | |||
t = learningRate | |||
end | |||
-- optional line search: user function | |||
local lsFuncEval = 0 | |||
if lineSearch and type(lineSearch) == 'function' then | |||
-- perform line search, using user function | |||
f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) | |||
table.insert(f_hist, f) | |||
else | |||
-- no line search, simply move with fixed-step | |||
x:add(t,d) | |||
if nIter ~= maxIter then | |||
-- re-evaluate function only if not in last iteration
-- the reason: in a stochastic setting, there is no point
-- in re-evaluating the function on the last iteration
f,g = opfunc(x) | |||
lsFuncEval = 1 | |||
table.insert(f_hist, f) | |||
end | |||
end | |||
-- update func eval | |||
currentFuncEval = currentFuncEval + lsFuncEval | |||
state.funcEval = state.funcEval + lsFuncEval | |||
------------------------------------------------------------ | |||
-- check conditions | |||
------------------------------------------------------------ | |||
if nIter == maxIter then | |||
-- no use to run tests | |||
verbose('reached max number of iterations') | |||
break | |||
end | |||
if currentFuncEval >= maxEval then | |||
-- max nb of function evals | |||
verbose('max nb of function evals') | |||
break | |||
end | |||
tmp1:copy(g):abs() | |||
if tmp1:sum() <= tolFun then | |||
-- check optimality | |||
verbose('optimality condition below tolFun') | |||
break | |||
end | |||
tmp1:copy(d):mul(t):abs() | |||
if tmp1:sum() <= tolX then | |||
-- step size below tolX | |||
verbose('step size below tolX') | |||
break | |||
end | |||
if abs(f-f_old) < tolX then | |||
-- function value changing less than tolX | |||
verbose('function value changing less than tolX') | |||
break | |||
end | |||
end | |||
-- save state | |||
state.old_dirs = old_dirs | |||
state.old_stps = old_stps | |||
state.Hdiag = Hdiag | |||
state.g_old = g_old | |||
state.f_old = f_old | |||
state.t = t | |||
state.d = d | |||
-- return optimal x, and history of f(x) | |||
return x,f_hist,currentFuncEval | |||
end |
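--[[ Hypothetical usage sketch (not part of the original file): deterministic
minimization of a strictly convex quadratic with the bundled Wolfe line
search, assuming the package has been loaded with require 'optim'. The
problem data and names below are made up for illustration.

   local n = 10
   local M = torch.randn(n, n)
   local A = M:t() * M + torch.eye(n)      -- symmetric positive definite
   local b = torch.randn(n)
   local function feval(x)
      local Ax = torch.mv(A, x)
      return 0.5 * x:dot(Ax) - b:dot(x), Ax - b   -- f(x), df/dx
   end
   local x = torch.zeros(n)
   local state = {maxIter = 100, lineSearch = optim.lswolfe}
   local xstar, fhist = optim.lbfgs(feval, x, state)
]]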
@@ -0,0 +1,192 @@ | |||
--[[ A Line Search satisfying the Wolfe conditions | |||
ARGS: | |||
- `opfunc` : a function (the objective) that takes a single input (X), | |||
the point of evaluation, and returns f(X) and df/dX | |||
- `x` : initial point / starting location | |||
- `t` : initial step size | |||
- `d` : descent direction | |||
- `f` : initial function value | |||
- `g` : gradient at initial location | |||
- `gtd` : directional derivative at starting location | |||
- `options.c1` : sufficient decrease parameter | |||
- `options.c2` : curvature parameter | |||
- `options.tolX` : minimum allowable step length | |||
- `options.maxIter` : maximum nb of iterations | |||
RETURN: | |||
- `f` : function value at x+t*d | |||
- `g` : gradient value at x+t*d | |||
- `x` : the next x (=x+t*d) | |||
- `t` : the step length | |||
- `lsFuncEval` : the number of function evaluations | |||
]] | |||
function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options) | |||
-- options | |||
options = options or {} | |||
local c1 = options.c1 or 1e-4 | |||
local c2 = options.c2 or 0.9 | |||
local tolX = options.tolX or 1e-9 | |||
local maxIter = options.maxIter or 20 | |||
local isverbose = options.verbose or false | |||
-- some shortcuts | |||
local abs = torch.abs | |||
local min = math.min | |||
local max = math.max | |||
-- verbose function | |||
local function verbose(...) | |||
if isverbose then print('<optim.lswolfe> ', ...) end | |||
end | |||
-- evaluate objective and gradient using initial step | |||
local x_init = x:clone() | |||
x:add(t,d) | |||
local f_new,g_new = opfunc(x) | |||
local lsFuncEval = 1 | |||
local gtd_new = g_new * d | |||
-- bracket an interval containing a point satisfying the Wolfe | |||
-- criteria | |||
local LSiter,t_prev,done = 0,0,false | |||
local f_prev,g_prev,gtd_prev = f,g:clone(),gtd | |||
local bracket,bracketFval,bracketGval | |||
while LSiter < maxIter do | |||
-- check conditions: | |||
if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then | |||
bracket = x.new{t_prev,t} | |||
bracketFval = x.new{f_prev,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g_prev | |||
bracketGval[2] = g_new | |||
break | |||
elseif abs(gtd_new) <= -c2*gtd then | |||
bracket = x.new{t} | |||
bracketFval = x.new{f_new} | |||
bracketGval = x.new(1,g_new:size(1)) | |||
bracketGval[1] = g_new | |||
done = true | |||
break | |||
elseif gtd_new >= 0 then | |||
bracket = x.new{t_prev,t} | |||
bracketFval = x.new{f_prev,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g_prev | |||
bracketGval[2] = g_new | |||
break | |||
end | |||
-- interpolate: | |||
local tmp = t_prev | |||
t_prev = t | |||
local minStep = t + 0.01*(t-tmp) | |||
local maxStep = t*10 | |||
t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev}, | |||
{t,f_new,gtd_new}}, | |||
minStep, maxStep) | |||
-- next step: | |||
f_prev = f_new | |||
g_prev = g_new:clone() | |||
gtd_prev = gtd_new | |||
x[{}] = x_init | |||
x:add(t,d) | |||
f_new,g_new = opfunc(x) | |||
lsFuncEval = lsFuncEval + 1 | |||
gtd_new = g_new * d | |||
LSiter = LSiter + 1 | |||
end | |||
-- reached max nb of iterations? | |||
if LSiter == maxIter then | |||
bracket = x.new{0,t} | |||
bracketFval = x.new{f,f_new} | |||
bracketGval = x.new(2,g_new:size(1)) | |||
bracketGval[1] = g | |||
bracketGval[2] = g_new | |||
end | |||
-- zoom phase: we now have a point satisfying the criteria, or | |||
-- a bracket around it. We refine the bracket until we find the | |||
-- exact point satisfying the criteria | |||
local insufProgress = false | |||
local LOposRemoved = 0 | |||
while not done and LSiter < maxIter do | |||
-- find high and low points in bracket | |||
local f_LO,LOpos = bracketFval:min(1) | |||
LOpos = LOpos[1] f_LO = f_LO[1] | |||
local HIpos = -LOpos+3 | |||
-- compute new trial value | |||
t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d}, | |||
{bracket[2],bracketFval[2],bracketGval[2]*d}}) | |||
-- test whether we are making sufficient progress
if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then | |||
if insufProgress or t>=bracket:max() or t <= bracket:min() then | |||
if abs(t-bracket:max()) < abs(t-bracket:min()) then | |||
t = bracket:max()-0.1*(bracket:max()-bracket:min()) | |||
else | |||
t = bracket:min()+0.1*(bracket:max()-bracket:min()) | |||
end | |||
insufProgress = false | |||
else | |||
insufProgress = true | |||
end | |||
else | |||
insufProgress = false | |||
end | |||
-- Evaluate new point | |||
x[{}] = x_init | |||
x:add(t,d) | |||
f_new,g_new = opfunc(x) | |||
lsFuncEval = lsFuncEval + 1 | |||
gtd_new = g_new * d | |||
LSiter = LSiter + 1 | |||
if f_new > f + c1*t*gtd or f_new >= f_LO then | |||
-- Armijo condition not satisfied or not lower than lowest point | |||
bracket[HIpos] = t | |||
bracketFval[HIpos] = f_new | |||
bracketGval[HIpos] = g_new | |||
else | |||
if abs(gtd_new) <= - c2*gtd then | |||
-- Wolfe conditions satisfied | |||
done = true | |||
elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then | |||
-- Old HI becomes new LO | |||
bracket[HIpos] = bracket[LOpos] | |||
bracketFval[HIpos] = bracketFval[LOpos] | |||
bracketGval[HIpos] = bracketGval[LOpos] | |||
end | |||
-- New point becomes new LO | |||
bracket[LOpos] = t | |||
bracketFval[LOpos] = f_new | |||
bracketGval[LOpos] = g_new | |||
end | |||
-- done? | |||
if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then | |||
break | |||
end | |||
end | |||
-- be verbose | |||
if LSiter == maxIter then | |||
verbose('reached max number of iterations') | |||
end | |||
-- return stuff | |||
local _,LOpos = bracketFval:min(1) | |||
LOpos = LOpos[1] | |||
t = bracket[LOpos] | |||
f_new = bracketFval[LOpos] | |||
g_new = bracketGval[LOpos] | |||
x[{}] = x_init | |||
x:add(t,d) | |||
return f_new,g_new,x,t,lsFuncEval | |||
end |
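--[[ Hypothetical usage sketch (not part of the original file): lswolfe is
normally not called directly; it is plugged into optim.lbfgs through the
config table, which forwards (opfunc, x, t, d, f, g, gtd, options) to it.
The quadratic objective and all names below are made up for illustration.

   local b = torch.randn(10)
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   local x = torch.zeros(10)
   local state = {
      maxIter = 50,
      lineSearch = optim.lswolfe,
      lineSearchOptions = {c1 = 1e-4, c2 = 0.9},
   }
   optim.lbfgs(feval, x, state)
]]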
@@ -0,0 +1,86 @@ | |||
---------------------------------------------------------------------- | |||
-- An implementation of SGD adapted with features of Nesterov's | |||
-- Accelerated Gradient method, based on the paper | |||
-- On the Importance of Initialization and Momentum in Deep Learning | |||
-- Sutskever et al., ICML 2013
-- | |||
-- ARGS: | |||
-- opfunc : a function that takes a single input (X), the point of | |||
-- evaluation, and returns f(X) and df/dX | |||
-- x : the initial point | |||
-- state : a table describing the state of the optimizer; after each | |||
-- call the state is modified | |||
-- state.learningRate : learning rate | |||
-- state.learningRateDecay : learning rate decay | |||
-- state.weightDecay : weight decay | |||
-- state.momentum : momentum | |||
-- state.learningRates : vector of individual learning rates | |||
-- | |||
-- RETURN: | |||
-- x : the new x vector | |||
-- f(x) : the function, evaluated before the update | |||
-- | |||
-- (Dilip Krishnan, 2013) | |||
-- | |||
function optim.nag(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
local mom = config.momentum or 0.9 | |||
local damp = config.dampening or mom | |||
local lrs = config.learningRates | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
if mom <= 0 then | |||
error('Momentum must be positive for Nesterov Accelerated Gradient') | |||
end | |||
-- (1) evaluate f(x) and df/dx | |||
-- first step in the direction of the momentum vector | |||
if state.dfdx then | |||
x:add(mom, state.dfdx) | |||
end | |||
-- then compute gradient at that point | |||
-- comment out the momentum step above to recover plain SGD
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (4) apply momentum | |||
if not state.dfdx then | |||
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0) | |||
else | |||
state.dfdx:mul(mom) | |||
end | |||
-- (5) parameter update with single or individual learning rates | |||
if lrs then | |||
if not state.deltaParameters then | |||
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.deltaParameters:copy(lrs):cmul(dfdx) | |||
x:add(-clr, state.deltaParameters) | |||
state.dfdx:add(-clr, state.deltaParameters) | |||
else | |||
x:add(-clr, dfdx) | |||
state.dfdx:add(-clr, dfdx) | |||
end | |||
-- (6) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x, f(x) before optimization | |||
return x,{fx} | |||
end |
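--[[ Hypothetical usage sketch (not part of the original file): the call
pattern mirrors optim.sgd. The quadratic objective and all names below
(feval, b, state) are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 0.1, momentum = 0.95}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 200 do
      optim.nag(feval, x, state)           -- x is updated in place
   end
]]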
@@ -0,0 +1,212 @@ | |||
local function isreal(x) | |||
return x == x | |||
end | |||
local function isnan(x) | |||
return x ~= x
end | |||
local function roots(c) | |||
local tol=1e-12 | |||
c[torch.lt(torch.abs(c),tol)]=0 | |||
local nonzero = torch.ne(c,0) | |||
if nonzero:max() == 0 then | |||
return 0 | |||
end | |||
-- first non-zero | |||
local _,pos = torch.max(nonzero,1) | |||
pos = pos[1] | |||
c=c[{ {pos,-1} }] | |||
local nz = 0 | |||
for i=c:size(1),1,-1 do | |||
if c[i] ~= 0 then | |||
break | |||
else | |||
nz = nz + 1 | |||
end | |||
end | |||
c=c[{ {1,c:size(1)-nz} }] | |||
local n = c:size(1)-1 | |||
if n == 1 then | |||
local e = c.new({{-c[2]/c[1], 0}}) | |||
if nz > 0 then | |||
return torch.cat(e, c.new(nz, 2):zero(), 1) | |||
else | |||
return e | |||
end | |||
elseif n > 1 then | |||
local A = torch.diag(c.new(n-1):fill(1),-1) | |||
A[1] = -c[{ {2,n+1} }]/c[1]; | |||
local e = torch.eig(A,'N') | |||
if nz > 0 then | |||
return torch.cat(e, c.new(nz,2):zero(), 1) | |||
else | |||
return e | |||
end | |||
else | |||
return c.new(nz,2):zero() | |||
end | |||
end | |||
local function real(x) | |||
if type(x) == 'number' then return x end
return x[{ {} , 1}] | |||
end | |||
local function imag(x) | |||
if type(x) == 'number' then return 0 end | |||
if x:nDimension() == 1 then | |||
return x.new(x:size(1)):zero() | |||
else | |||
return x[{ {}, 2}] | |||
end | |||
end | |||
local function polyval(p,x) | |||
local pwr = p:size(1) | |||
if type(x) == 'number' then | |||
local val = 0 | |||
p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end) | |||
return val | |||
else | |||
local val = x.new(x:size(1)):zero()
p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end) | |||
return val | |||
end | |||
end | |||
---------------------------------------------------------------------- | |||
-- Minimum of interpolating polynomial based on function and | |||
-- derivative values | |||
-- | |||
-- ARGS: | |||
-- points : N triplets (x,f,g), must be a Tensor | |||
-- xminBound : min value that brackets the minimum (default: min of points)
-- xmaxBound : max value that brackets the minimum (default: max of points)
-- | |||
-- RETURN: | |||
-- minPos : position of minimum | |||
-- | |||
function optim.polyinterp(points,xminBound,xmaxBound) | |||
-- locals | |||
local sqrt = torch.sqrt | |||
local mean = torch.mean | |||
local max = math.max | |||
local min = math.min | |||
-- nb of points / order of polynomial | |||
local nPoints = points:size(1) | |||
local order = nPoints*2-1 | |||
-- returned values | |||
local minPos | |||
-- Code for most common case: | |||
-- + cubic interpolation of 2 points w/ function and derivative values for both | |||
-- + no xminBound/xmaxBound | |||
if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then | |||
-- Solution in this case (where x2 is the farthest point): | |||
-- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2); | |||
-- d2 = sqrt(d1^2 - g1*g2); | |||
-- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2)); | |||
-- t_new = min(max(minPos,x1),x2); | |||
local minVal,minPos = points[{ {},1 }]:min(1) | |||
minVal = minVal[1] minPos = minPos[1] | |||
local notMinPos = -minPos+3; | |||
local d1 = points[{minPos,3}] + points[{notMinPos,3}] | |||
- 3*(points[{minPos,2}]-points[{notMinPos,2}]) | |||
/ (points[{minPos,1}]-points[{notMinPos,1}]); | |||
local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]); | |||
if isreal(d2) then -- isreal() | |||
local t = points[{notMinPos,1}] - (points[{notMinPos,1}] | |||
- points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1) | |||
/ (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2)) | |||
minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}]) | |||
else | |||
minPos = mean(points[{{},1}]) | |||
end | |||
return minPos | |||
end | |||
-- TODO: get the code below to work! | |||
--error('<optim.polyinterp> extrapolation not implemented yet...') | |||
-- Compute Bounds of Interpolation Area | |||
local xmin = points[{{},1}]:min() | |||
local xmax = points[{{},1}]:max() | |||
xminBound = xminBound or xmin | |||
xmaxBound = xmaxBound or xmax | |||
-- Add constraints on function values | |||
local A = points.new(nPoints*2,order+1):zero() | |||
local b = points.new(nPoints*2,1):zero() | |||
for i = 1,nPoints do | |||
local constraint = points.new(order+1):zero() | |||
for j = order,0,-1 do | |||
constraint[order-j+1] = points[{i,1}]^j | |||
end | |||
A[i] = constraint | |||
b[i] = points[{i,2}] | |||
end | |||
-- Add constraints based on derivatives | |||
for i = 1,nPoints do | |||
local constraint = points.new(order+1):zero() | |||
for j = 1,order do | |||
constraint[j] = (order-j+1)*points[{i,1}]^(order-j) | |||
end | |||
A[nPoints+i] = constraint | |||
b[nPoints+i] = points[{i,3}] | |||
end | |||
-- Find interpolating polynomial | |||
local res = torch.gels(b,A) | |||
local params = res[{ {1,nPoints*2} }]:squeeze() | |||
params[torch.le(torch.abs(params),1e-12)]=0 | |||
-- Compute Critical Points | |||
local dParams = points.new(order):zero(); | |||
for i = 1,params:size(1)-1 do | |||
dParams[i] = params[i]*(order-i+1) | |||
end | |||
-- nan/inf? | |||
local nans = false | |||
if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then | |||
nans = true | |||
end | |||
local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}]) | |||
if not nans then | |||
local cproots = roots(dParams) | |||
local cpi = points.new(cp:size(1),2):zero() | |||
cpi[{ {1,cp:size(1)} , 1 }] = cp | |||
cp = torch.cat(cpi,cproots,1) | |||
end | |||
-- Test Critical Points | |||
local fmin = math.huge | |||
-- Default to Bisection if no critical points valid: | |||
minPos = (xminBound+xmaxBound)/2 | |||
for i = 1,cp:size(1) do | |||
local xCP = cp[{ {i,i} , {} }] | |||
local ixCP = imag(xCP)[1] | |||
local rxCP = real(xCP)[1] | |||
if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then | |||
local fCP = polyval(params,rxCP) | |||
if fCP < fmin then | |||
minPos = rxCP | |||
fmin = fCP | |||
end | |||
end | |||
end | |||
return minPos,fmin | |||
end |
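--[[ Hypothetical usage sketch (not part of the original file): cubic
interpolation between two (position, value, derivative) triplets, the
common case used by the Wolfe line search. The numbers are made up.

   local pts = torch.Tensor{{0, 1.0, -1.0},   -- x=0, f=1.0, f'=-1.0
                            {1, 0.8,  0.5}}   -- x=1, f=0.8, f'= 0.5
   local minPos = optim.polyinterp(pts)       -- minimizer clamped to [0, 1]
]]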
@@ -0,0 +1,58 @@ | |||
--[[ An implementation of RMSprop | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point
of evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `config` : a table with configuration parameters for the optimizer
- `config.learningRate` : learning rate
- `config.alpha` : smoothing constant
- `config.epsilon` : value added to the denominator for numerical stability
- `config.weightDecay` : weight decay
- `config.initialMean` : initial value for the leaky mean of squared gradients (default 0)
- `state` : a table describing the state of the optimizer;
after each call the state is modified
- `state.m` : leaky mean of the squared parameter gradients
- `state.tmp` : square root of `state.m`, with epsilon smoothing
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
]] | |||
function optim.rmsprop(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-2 | |||
local alpha = config.alpha or 0.99 | |||
local epsilon = config.epsilon or 1e-8 | |||
local wd = config.weightDecay or 0 | |||
local mfill = config.initialMean or 0 | |||
-- (1) evaluate f(x) and df/dx | |||
local fx, dfdx = opfunc(x) | |||
-- (2) weight decay | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
end | |||
-- (3) initialize mean square values and square gradient storage | |||
if not state.m then | |||
state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill) | |||
state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
-- (4) calculate new (leaky) mean squared values | |||
state.m:mul(alpha) | |||
state.m:addcmul(1.0-alpha, dfdx, dfdx) | |||
-- (5) perform update | |||
state.tmp:sqrt(state.m):add(epsilon) | |||
x:addcdiv(-lr, dfdx, state.tmp) | |||
-- return x*, f(x) before optimization | |||
return x, {fx} | |||
end |
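--[[ Hypothetical usage sketch (not part of the original file): same feval
convention as the other first-order routines. The quadratic objective and
all names below are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 1e-2, alpha = 0.9}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 500 do
      optim.rmsprop(feval, x, state)
   end
]]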
@@ -0,0 +1,103 @@ | |||
--[[ A plain implementation of RPROP | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point of | |||
evaluation, and returns f(X) and df/dX | |||
- `x` : the initial point | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.stepsize` : initial step size, common to all components | |||
- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2) | |||
- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5) | |||
- `state.stepsizemax` : maximum stepsize allowed (default 50) | |||
- `state.stepsizemin` : minimum stepsize allowed (default 1e-6) | |||
- `state.niter` : number of iterations (default 1) | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
(Martin Riedmiller, Koray Kavukcuoglu 2013) | |||
--]] | |||
function optim.rprop(opfunc, x, config, state) | |||
if config == nil and state == nil then | |||
print('no state table, RPROP initializing')
end | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local stepsize = config.stepsize or 0.1 | |||
local etaplus = config.etaplus or 1.2 | |||
local etaminus = config.etaminus or 0.5 | |||
local stepsizemax = config.stepsizemax or 50.0 | |||
local stepsizemin = config.stepsizemin or 1E-06 | |||
local niter = config.niter or 1 | |||
local hfx = {} | |||
for i=1,niter do | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- init temp storage | |||
if not state.delta then | |||
state.delta = dfdx.new(dfdx:size()):zero() | |||
state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) | |||
state.sign = dfdx.new(dfdx:size()) | |||
state.psign = torch.ByteTensor(dfdx:size()) | |||
state.nsign = torch.ByteTensor(dfdx:size()) | |||
state.zsign = torch.ByteTensor(dfdx:size()) | |||
state.dminmax = torch.ByteTensor(dfdx:size()) | |||
if torch.type(x)=='torch.CudaTensor' then | |||
-- Push to GPU | |||
state.psign = state.psign:cuda() | |||
state.nsign = state.nsign:cuda() | |||
state.zsign = state.zsign:cuda() | |||
state.dminmax = state.dminmax:cuda() | |||
end | |||
end | |||
-- sign of derivative from last step to this one | |||
torch.cmul(state.sign, dfdx, state.delta) | |||
torch.sign(state.sign, state.sign) | |||
-- get indices of >0, <0 and ==0 entries | |||
state.sign.gt(state.psign, state.sign, 0) | |||
state.sign.lt(state.nsign, state.sign, 0) | |||
state.sign.eq(state.zsign, state.sign, 0) | |||
-- get step size updates | |||
state.sign[state.psign] = etaplus | |||
state.sign[state.nsign] = etaminus | |||
state.sign[state.zsign] = 1 | |||
-- update stepsizes with step size updates | |||
state.stepsize:cmul(state.sign) | |||
-- threshold step sizes | |||
-- >50 => 50 | |||
state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) | |||
state.stepsize[state.dminmax] = stepsizemax | |||
-- <1e-6 ==> 1e-6 | |||
state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) | |||
state.stepsize[state.dminmax] = stepsizemin | |||
-- for dir<0, dfdx=0 | |||
-- for dir>=0 dfdx=dfdx | |||
dfdx[state.nsign] = 0 | |||
-- state.sign = sign(dfdx) | |||
torch.sign(state.sign,dfdx) | |||
-- update weights | |||
x:addcmul(-1,state.sign,state.stepsize) | |||
-- update state.dfdx with current dfdx | |||
state.delta:copy(dfdx) | |||
table.insert(hfx,fx) | |||
end | |||
-- return x*, f(x) before optimization | |||
return x,hfx | |||
end |
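--[[ Hypothetical usage sketch (not part of the original file): rprop can run
several inner iterations per call via config.niter, returning one function
value per inner iteration. The quadratic objective and all names below are
made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {stepsize = 0.1, niter = 10}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   local _, fhist = optim.rprop(feval, x, state)
]]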
@@ -0,0 +1,90 @@ | |||
--[[ A plain implementation of SGD | |||
ARGS: | |||
- `opfunc` : a function that takes a single input (X), the point | |||
of evaluation, and returns f(X) and df/dX
- `x` : the initial point | |||
- `config` : a table with configuration parameters for the optimizer | |||
- `config.learningRate` : learning rate | |||
- `config.learningRateDecay` : learning rate decay | |||
- `config.weightDecay` : weight decay | |||
- `config.weightDecays` : vector of individual weight decays | |||
- `config.momentum` : momentum | |||
- `config.dampening` : dampening for momentum | |||
- `config.nesterov` : enables Nesterov momentum | |||
- `config.learningRates` : vector of individual learning rates | |||
- `state` : a table describing the state of the optimizer; after each | |||
call the state is modified | |||
- `state.evalCounter` : evaluation counter (optional: 0, by default) | |||
RETURN: | |||
- `x` : the new x vector | |||
- `f(x)` : the function, evaluated before the update | |||
(Clement Farabet, 2012) | |||
]] | |||
function optim.sgd(opfunc, x, config, state) | |||
-- (0) get/update state | |||
local config = config or {} | |||
local state = state or config | |||
local lr = config.learningRate or 1e-3 | |||
local lrd = config.learningRateDecay or 0 | |||
local wd = config.weightDecay or 0 | |||
local mom = config.momentum or 0 | |||
local damp = config.dampening or mom | |||
local nesterov = config.nesterov or false | |||
local lrs = config.learningRates | |||
local wds = config.weightDecays | |||
state.evalCounter = state.evalCounter or 0 | |||
local nevals = state.evalCounter | |||
assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening") | |||
-- (1) evaluate f(x) and df/dx | |||
local fx,dfdx = opfunc(x) | |||
-- (2) weight decay with single or individual parameters | |||
if wd ~= 0 then | |||
dfdx:add(wd, x) | |||
elseif wds then | |||
if not state.decayParameters then | |||
state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.decayParameters:copy(wds):cmul(x) | |||
dfdx:add(state.decayParameters) | |||
end | |||
-- (3) apply momentum | |||
if mom ~= 0 then | |||
if not state.dfdx then | |||
state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx) | |||
else | |||
state.dfdx:mul(mom):add(1-damp, dfdx) | |||
end | |||
if nesterov then | |||
dfdx:add(mom, state.dfdx) | |||
else | |||
dfdx = state.dfdx | |||
end | |||
end | |||
-- (4) learning rate decay (annealing) | |||
local clr = lr / (1 + nevals*lrd) | |||
-- (5) parameter update with single or individual learning rates | |||
if lrs then | |||
if not state.deltaParameters then | |||
state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx) | |||
end | |||
state.deltaParameters:copy(lrs):cmul(dfdx) | |||
x:add(-clr, state.deltaParameters) | |||
else | |||
x:add(-clr, dfdx) | |||
end | |||
-- (6) update evaluation counter | |||
state.evalCounter = state.evalCounter + 1 | |||
-- return x*, f(x) before optimization | |||
return x,{fx} | |||
end |
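--[[ Hypothetical usage sketch (not part of the original file): minimize the
quadratic f(x) = ||x - b||^2 with SGD plus momentum, assuming the package
has been loaded with require 'optim'. All names below (feval, b, state)
are made up for illustration.

   local b = torch.randn(10)
   local x = torch.zeros(10)
   local state = {learningRate = 0.1, momentum = 0.9}
   local function feval(x_)
      local diff = x_ - b
      return diff:dot(diff), diff:mul(2)   -- f(x), df/dx
   end
   for i = 1, 100 do
      optim.sgd(feval, x, state)           -- x is updated in place
   end
]]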