author    Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-05-23 18:14:15 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-05-23 18:14:15 +0100
commit    714eb56e1760fdfb26afccde92664d3a2f1e8435 (patch)
tree      84d1399acbb92f852b4bd64f9ea5412680b0c6ab /contrib/torch/optim
parent    220a51ff68013dd668a45b78c60a7b8bfc10f074 (diff)
download  rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.tar.gz
          rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.zip
[Minor] Move lua contrib libraries to lua- prefix
Diffstat (limited to 'contrib/torch/optim')
-rw-r--r--  contrib/torch/optim/CMakeLists.txt        5
-rw-r--r--  contrib/torch/optim/COPYRIGHT.txt        35
-rw-r--r--  contrib/torch/optim/ConfusionMatrix.lua 361
-rw-r--r--  contrib/torch/optim/Logger.lua          189
-rw-r--r--  contrib/torch/optim/adadelta.lua         55
-rw-r--r--  contrib/torch/optim/adagrad.lua          55
-rw-r--r--  contrib/torch/optim/adam.lua             72
-rw-r--r--  contrib/torch/optim/adamax.lua           66
-rw-r--r--  contrib/torch/optim/asgd.lua             73
-rw-r--r--  contrib/torch/optim/cg.lua              208
-rw-r--r--  contrib/torch/optim/checkgrad.lua        52
-rw-r--r--  contrib/torch/optim/cmaes.lua           270
-rw-r--r--  contrib/torch/optim/de.lua              109
-rw-r--r--  contrib/torch/optim/fista.lua           192
-rw-r--r--  contrib/torch/optim/init.lua             33
-rw-r--r--  contrib/torch/optim/lbfgs.lua           268
-rw-r--r--  contrib/torch/optim/lswolfe.lua         192
-rw-r--r--  contrib/torch/optim/nag.lua              86
-rw-r--r--  contrib/torch/optim/polyinterp.lua      212
-rw-r--r--  contrib/torch/optim/rmsprop.lua          58
-rw-r--r--  contrib/torch/optim/rprop.lua           103
-rw-r--r--  contrib/torch/optim/sgd.lua              90
22 files changed, 0 insertions, 2784 deletions
diff --git a/contrib/torch/optim/CMakeLists.txt b/contrib/torch/optim/CMakeLists.txt
deleted file mode 100644
index b1c13e701..000000000
--- a/contrib/torch/optim/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-
-CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
-SET(src)
-FILE(GLOB luasrc *.lua)
-ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}")
diff --git a/contrib/torch/optim/COPYRIGHT.txt b/contrib/torch/optim/COPYRIGHT.txt
deleted file mode 100644
index 2e4118c0f..000000000
--- a/contrib/torch/optim/COPYRIGHT.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
-Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
-Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-3. Neither the names of NEC Laboratories American and IDIAP Research
- Institute nor the names of its contributors may be used to endorse or
- promote products derived from this software without specific prior
- written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/torch/optim/ConfusionMatrix.lua b/contrib/torch/optim/ConfusionMatrix.lua
deleted file mode 100644
index ec5302c24..000000000
--- a/contrib/torch/optim/ConfusionMatrix.lua
+++ /dev/null
@@ -1,361 +0,0 @@
---[[ A Confusion Matrix class
-
-Example:
-
- conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix
- conf:zero() -- reset matrix
- for i = 1,N do
- conf:add( neuralnet:forward(sample), label ) -- accumulate errors
- end
- print(conf) -- print matrix
- image.display(conf:render()) -- render matrix
-]]
-local ConfusionMatrix = torch.class('optim.ConfusionMatrix')
-
-function ConfusionMatrix:__init(nclasses, classes)
- if type(nclasses) == 'table' then
- classes = nclasses
- nclasses = #classes
- end
- self.mat = torch.LongTensor(nclasses,nclasses):zero()
- self.valids = torch.FloatTensor(nclasses):zero()
- self.unionvalids = torch.FloatTensor(nclasses):zero()
- self.nclasses = nclasses
- self.totalValid = 0
- self.averageValid = 0
- self.classes = classes or {}
- -- buffers
- self._mat_flat = self.mat:view(-1)
- self._target = torch.FloatTensor()
- self._prediction = torch.FloatTensor()
- self._max = torch.FloatTensor()
- self._pred_idx = torch.LongTensor()
- self._targ_idx = torch.LongTensor()
-end
-
--- takes scalar prediction and target as input
-function ConfusionMatrix:_add(p, t)
- assert(p and type(p) == 'number')
- assert(t and type(t) == 'number')
- -- non-positive values are considered missing
- -- and therefore ignored
- if t > 0 then
- self.mat[t][p] = self.mat[t][p] + 1
- end
-end
-
-function ConfusionMatrix:add(prediction, target)
- if type(prediction) == 'number' then
- -- comparing numbers
- self:_add(prediction, target)
- else
- self._prediction:resize(prediction:size()):copy(prediction)
- assert(prediction:dim() == 1)
- if type(target) == 'number' then
- -- prediction is a vector, then target assumed to be an index
- self._max:max(self._pred_idx, self._prediction, 1)
- self:_add(self._pred_idx[1], target)
- else
- -- both prediction and target are vectors
- assert(target:dim() == 1)
- self._target:resize(target:size()):copy(target)
- self._max:max(self._targ_idx, self._target, 1)
- self._max:max(self._pred_idx, self._prediction, 1)
- self:_add(self._pred_idx[1], self._targ_idx[1])
- end
- end
-end
-
-function ConfusionMatrix:batchAdd(predictions, targets)
- local preds, targs, __
- self._prediction:resize(predictions:size()):copy(predictions)
- if predictions:dim() == 1 then
- -- predictions is a vector of classes
- preds = self._prediction
- elseif predictions:dim() == 2 then
- -- prediction is a matrix of class likelihoods
- if predictions:size(2) == 1 then
- -- or prediction just needs flattening
- preds = self._prediction:select(2,1)
- else
- self._max:max(self._pred_idx, self._prediction, 2)
- preds = self._pred_idx:select(2,1)
- end
- else
- error("predictions has invalid number of dimensions")
- end
-
- self._target:resize(targets:size()):copy(targets)
- if targets:dim() == 1 then
- -- targets is a vector of classes
- targs = self._target
- elseif targets:dim() == 2 then
- -- targets is a matrix of one-hot rows
- if targets:size(2) == 1 then
- -- or targets just needs flattening
- targs = self._target:select(2,1)
- else
- self._max:max(self._targ_idx, self._target, 2)
- targs = self._targ_idx:select(2,1)
- end
- else
- error("targets has invalid number of dimensions")
- end
-
- -- non-positive values are considered missing and therefore ignored
- local mask = targs:ge(1)
- targs = targs[mask]
- preds = preds[mask]
-
- self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility
-
- preds = preds:typeAs(targs)
-
- assert(self.mat:isContiguous() and self.mat:stride(2) == 1)
- local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat)
- local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1))
- self._mat_flat:indexAdd(1, indices, ones)
-end
-
-function ConfusionMatrix:zero()
- self.mat:zero()
- self.valids:zero()
- self.unionvalids:zero()
- self.totalValid = 0
- self.averageValid = 0
-end
-
-local function isNaN(number)
- return number ~= number
-end
-
-function ConfusionMatrix:updateValids()
- local total = 0
- for t = 1,self.nclasses do
- self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum()
- self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t])
- total = total + self.mat[t][t]
- end
- self.totalValid = total / self.mat:sum()
- self.averageValid = 0
- self.averageUnionValid = 0
- local nvalids = 0
- local nunionvalids = 0
- for t = 1,self.nclasses do
- if not isNaN(self.valids[t]) then
- self.averageValid = self.averageValid + self.valids[t]
- nvalids = nvalids + 1
- end
- if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then
- self.averageUnionValid = self.averageUnionValid + self.unionvalids[t]
- nunionvalids = nunionvalids + 1
- end
- end
- self.averageValid = self.averageValid / nvalids
- self.averageUnionValid = self.averageUnionValid / nunionvalids
-end
-
--- Calculating FAR/FRR associated with the confusion matrix
-
-function ConfusionMatrix:farFrr()
- local cmat = self.mat
- local noOfClasses = cmat:size()[1]
- self._frrs = self._frrs or torch.zeros(noOfClasses)
- self._frrs:zero()
- self._classFrrs = self._classFrrs or torch.zeros(noOfClasses)
- self._classFrrs:zero()
- self._classFrrs:add(-1)
- self._fars = self._fars or torch.zeros(noOfClasses)
- self._fars:zero()
- self._classFars = self._classFars or torch.zeros(noOfClasses)
- self._classFars:zero()
- self._classFars:add(-1)
- local classSamplesCount = cmat:sum(2)
- local indx = 1
- for i=1,noOfClasses do
- if classSamplesCount[i][1] ~= 0 then
- self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1]
- self._classFrrs[i] = self._frrs[indx]
- -- Calculating FARs
- local farNumerator = 0
- local farDenominator = 0
- for j=1, noOfClasses do
- if i ~= j then
- if classSamplesCount[j][1] ~= 0 then
- farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1]
- farDenominator = farDenominator + 1
- end
- end
- end
- self._fars[indx] = farNumerator/farDenominator
- self._classFars[i] = self._fars[indx]
- indx = indx + 1
- end
- end
- indx = indx - 1
- local returnFrrs = self._frrs[{{1, indx}}]
- local returnFars = self._fars[{{1, indx}}]
- return self._classFrrs, self._classFars, returnFrrs, returnFars
-end
-
-local function log10(n)
- if math.log10 then
- return math.log10(n)
- else
- return math.log(n) / math.log(10)
- end
-end
-
-function ConfusionMatrix:__tostring__()
- self:updateValids()
- local str = {'ConfusionMatrix:\n'}
- local nclasses = self.nclasses
- table.insert(str, '[')
- local maxCnt = self.mat:max()
- local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt)))
- for t = 1,nclasses do
- local pclass = self.valids[t] * 100
- pclass = string.format('%2.3f', pclass)
- if t == 1 then
- table.insert(str, '[')
- else
- table.insert(str, ' [')
- end
- for p = 1,nclasses do
- table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p]))
- end
- if self.classes and self.classes[1] then
- if t == nclasses then
- table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
- else
- table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
- end
- else
- if t == nclasses then
- table.insert(str, ']] ' .. pclass .. '% \n')
- else
- table.insert(str, '] ' .. pclass .. '% \n')
- end
- end
- end
- table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n')
- table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n')
- table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%')
- return table.concat(str)
-end
-
-function ConfusionMatrix:render(sortmode, display, block, legendwidth)
- -- args
- local confusion = self.mat:double()
- local classes = self.classes
- local sortmode = sortmode or 'score' -- 'score' or 'occurrence'
- local block = block or 25
- local legendwidth = legendwidth or 200
- local display = display or false
-
- -- legends
- local legend = {
- ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]',
- ['occurrence'] = 'Confusion matrix [sorted by occurrences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]'
- }
-
- -- parse matrix / normalize / count scores
- local diag = torch.FloatTensor(#classes)
- local freqs = torch.FloatTensor(#classes)
- local unconf = confusion
- local confusion = confusion:clone()
- local corrects = 0
- local total = 0
- for target = 1,#classes do
- freqs[target] = confusion[target]:sum()
- corrects = corrects + confusion[target][target]
- total = total + freqs[target]
- confusion[target]:div( math.max(confusion[target]:sum(),1) )
- diag[target] = confusion[target][target]
- end
-
- -- accuracies
- local accuracy = corrects / total * 100
- local perclass = 0
- local total = 0
- for target = 1,#classes do
- if confusion[target]:sum() > 0 then
- perclass = perclass + diag[target]
- total = total + 1
- end
- end
- perclass = perclass / total * 100
- freqs:div(unconf:sum())
-
- -- sort matrix
- if sortmode == 'score' then
- _,order = torch.sort(diag,1,true)
- elseif sortmode == 'occurrence' then
- _,order = torch.sort(freqs,1,true)
- else
- error('sort mode must be one of: score | occurrence')
- end
-
- -- render matrix
- local render = torch.zeros(#classes*block, #classes*block)
- for target = 1,#classes do
- for prediction = 1,#classes do
- render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]]
- end
- end
-
- -- add grid
- for target = 1,#classes do
- render[{ {target*block},{} }] = 0.1
- render[{ {},{target*block} }] = 0.1
- end
-
- -- create rendering
- require 'image'
- require 'qtwidget'
- require 'qttorch'
- local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] )
- image.display{image=render, win=win1}
-
- -- add legend
- for i in ipairs(classes) do
- -- background cell
- win1:setcolor{r=0,g=0,b=0}
- win1:rectangle((#render)[2],(i-1)*block,legendwidth,block)
- win1:fill()
-
- -- %
- win1:setfont(qt.QFont{serif=false, size=fontsize})
- local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4
- win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2}
- win1:moveto((#render)[2]+10,i*block-block/3)
- win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100))
-
- -- legend
- win1:setfont(qt.QFont{serif=false, size=fontsize})
- local gscale = diag[order[i]]*0.8+0.2
- win1:setcolor{r=gscale,g=gscale,b=gscale}
- win1:moveto(120+(#render)[2]+10,i*block-block/3)
- win1:show(classes[order[i]])
-
- for j in ipairs(classes) do
- -- scores
- local score = confusion[order[j]][order[i]]
- local gscale = (1-score)*(score*0.8+0.2)
- win1:setcolor{r=gscale,g=gscale,b=gscale}
- win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3)
- win1:show(string.format('%02.0f',math.floor(score*100+0.5)))
- end
- end
-
- -- generate tensor
- local t = win1:image():toTensor()
-
- -- display
- if display then
- image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)}
- end
-
- -- return rendering
- return t
-end
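For reference, a minimal usage sketch of the ConfusionMatrix class removed above, using the batch API; the `scores` (batchSize x nClasses likelihoods) and `labels` (vector of class indices) tensors are illustrative assumptions, not part of the diff:

   local conf = optim.ConfusionMatrix({'cat', 'dog', 'person'})
   conf:zero()
   conf:batchAdd(scores, labels)       -- argmax over the rows of scores vs. target indices
   conf:updateValids()
   print(conf.totalValid * 100, conf.averageValid * 100)   -- global and mean per-class accuracy
   print(conf)                         -- pretty-printed matrix via __tostring__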
diff --git a/contrib/torch/optim/Logger.lua b/contrib/torch/optim/Logger.lua
deleted file mode 100644
index 3c1da54d3..000000000
--- a/contrib/torch/optim/Logger.lua
+++ /dev/null
@@ -1,189 +0,0 @@
---[[ Logger: a simple class to log symbols during training,
- and automate plot generation
-
-Example:
- logger = optim.Logger('somefile.log') -- file to save stuff
-
- for i = 1,N do -- log some symbols during
- train_error = ... -- training/testing
- test_error = ...
- logger:add{['training error'] = train_error,
- ['test error'] = test_error}
- end
-
- logger:style{['training error'] = '-', -- define styles for plots
- ['test error'] = '-'}
- logger:plot() -- and plot
-
----- OR ---
-
- logger = optim.Logger('somefile.log') -- file to save stuff
- logger:setNames{'training error', 'test error'}
-
- for i = 1,N do -- log some symbols during
- train_error = ... -- training/testing
- test_error = ...
- logger:add{train_error, test_error}
- end
-
- logger:style{'-', '-'} -- define styles for plots
- logger:plot() -- and plot
-
------------
-
- logger:setlogscale(true) -- enable logscale on Y-axis
- logger:plot() -- and plot
-]]
-local Logger = torch.class('optim.Logger')
-
-function Logger:__init(filename, timestamp)
- if filename then
- self.name = filename
- os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"')
- if timestamp then
- -- append timestamp to create unique log file
- filename = filename .. '-'..os.date("%Y_%m_%d_%X")
- end
- self.file = io.open(filename,'w')
- self.epsfile = self.name .. '.eps'
- else
- self.file = io.stdout
- self.name = 'stdout'
- print('<Logger> warning: no path provided, logging to std out')
- end
- self.empty = true
- self.symbols = {}
- self.styles = {}
- self.names = {}
- self.idx = {}
- self.figure = nil
- self.showPlot = true
- self.plotRawCmd = nil
- self.defaultStyle = '+'
- self.logscale = false
-end
-
-function Logger:setNames(names)
- self.names = names
- self.empty = false
- self.nsymbols = #names
- for k,key in pairs(names) do
- self.file:write(key .. '\t')
- self.symbols[k] = {}
- self.styles[k] = {self.defaultStyle}
- self.idx[key] = k
- end
- self.file:write('\n')
- self.file:flush()
- return self
-end
-
-function Logger:add(symbols)
- -- (1) first time ? print symbols' names on first row
- if self.empty then
- self.empty = false
- self.nsymbols = #symbols
- for k,val in pairs(symbols) do
- self.file:write(k .. '\t')
- self.symbols[k] = {}
- self.styles[k] = {self.defaultStyle}
- self.names[k] = k
- end
- self.idx = self.names
- self.file:write('\n')
- end
- -- (2) print all symbols on one row
- for k,val in pairs(symbols) do
- if type(val) == 'number' then
- self.file:write(string.format('%11.4e',val) .. '\t')
- elseif type(val) == 'string' then
- self.file:write(val .. '\t')
- else
- xlua.error('can only log numbers and strings', 'Logger')
- end
- end
- self.file:write('\n')
- self.file:flush()
- -- (3) save symbols in internal table
- for k,val in pairs(symbols) do
- table.insert(self.symbols[k], val)
- end
-end
-
-function Logger:style(symbols)
- for name,style in pairs(symbols) do
- if type(style) == 'string' then
- self.styles[name] = {style}
- elseif type(style) == 'table' then
- self.styles[name] = style
- else
- xlua.error('style should be a string or a table of strings','Logger')
- end
- end
- return self
-end
-
-function Logger:setlogscale(state)
- self.logscale = state
-end
-
-function Logger:display(state)
- self.showPlot = state
-end
-
-function Logger:plot(...)
- if not xlua.require('gnuplot') then
- if not self.warned then
- print('<Logger> warning: cannot plot with this version of Torch')
- self.warned = true
- end
- return
- end
- local plotit = false
- local plots = {}
- local plotsymbol =
- function(name,list)
- if #list > 1 then
- local nelts = #list
- local plot_y = torch.Tensor(nelts)
- for i = 1,nelts do
- plot_y[i] = list[i]
- end
- for _,style in ipairs(self.styles[name]) do
- table.insert(plots, {self.names[name], plot_y, style})
- end
- plotit = true
- end
- end
- local args = {...}
- if not args[1] then -- plot all symbols
- for name,list in pairs(self.symbols) do
- plotsymbol(name,list)
- end
- else -- plot given symbols
- for _,name in ipairs(args) do
- plotsymbol(self.idx[name], self.symbols[self.idx[name]])
- end
- end
- if plotit then
- if self.showPlot then
- self.figure = gnuplot.figure(self.figure)
- if self.logscale then gnuplot.logscale('on') end
- gnuplot.plot(plots)
- if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
- gnuplot.grid('on')
- gnuplot.title('<Logger::' .. self.name .. '>')
- end
- if self.epsfile then
- os.execute('rm -f "' .. self.epsfile .. '"')
- local epsfig = gnuplot.epsfigure(self.epsfile)
- if self.logscale then gnuplot.logscale('on') end
- gnuplot.plot(plots)
- if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
- gnuplot.grid('on')
- gnuplot.title('<Logger::' .. self.name .. '>')
- gnuplot.plotflush()
- gnuplot.close(epsfig)
- end
- end
-end
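A hedged sketch of how the Logger removed above is typically combined with a ConfusionMatrix; `trainConf`, `testConf` and `nEpochs` are assumptions supplied by the caller, and plotting needs the gnuplot package:

   local logger = optim.Logger('accuracy.log')
   logger:setNames{'train acc', 'test acc'}
   logger:style{'-', '-'}
   for epoch = 1, nEpochs do
      -- ... run training and evaluation, filling trainConf and testConf ...
      trainConf:updateValids()
      testConf:updateValids()
      logger:add{trainConf.totalValid * 100, testConf.totalValid * 100}
   end
   logger:plot()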
diff --git a/contrib/torch/optim/adadelta.lua b/contrib/torch/optim/adadelta.lua
deleted file mode 100644
index 7cc058d29..000000000
--- a/contrib/torch/optim/adadelta.lua
+++ /dev/null
@@ -1,55 +0,0 @@
---[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701
-
-ARGS:
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `config` : a table of hyper-parameters
-- `config.rho` : interpolation parameter
-- `config.eps` : for numerical stability
-- `config.weightDecay` : weight decay
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.paramVariance` : vector of temporal variances of parameters
-- `state.accDelta` : vector of accumulated delta of gradients
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-]]
-function optim.adadelta(opfunc, x, config, state)
- -- (0) get/update state
- if config == nil and state == nil then
- print('no state table, ADADELTA initializing')
- end
- local config = config or {}
- local state = state or config
- local rho = config.rho or 0.9
- local eps = config.eps or 1e-6
- local wd = config.weightDecay or 0
- state.evalCounter = state.evalCounter or 0
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- (3) parameter update
- if not state.paramVariance then
- state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
- state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
- state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
- state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
- end
- state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx)
- state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt()
- state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx)
- x:add(-1, state.delta)
- state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta)
- -- (4) update evaluation counter
- state.evalCounter = state.evalCounter + 1
-
- -- return x*, f(x) before optimization
- return x,{fx}
-end
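As a usage sketch for the adadelta routine above (assumptions: `params` is the flattened parameter tensor, `feval` is a closure returning the loss and its gradient at a point, and `nSteps` is the caller's step budget; none of these are defined in this diff):

   local config = {rho = 0.9, eps = 1e-6}
   local state = {}
   for step = 1, nSteps do
      -- feval(x) must return f(x) and df/dx evaluated at x
      local _, fs = optim.adadelta(feval, params, config, state)
      print(('step %d, loss %.4f'):format(step, fs[1]))
   end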
diff --git a/contrib/torch/optim/adagrad.lua b/contrib/torch/optim/adagrad.lua
deleted file mode 100644
index 6860c4317..000000000
--- a/contrib/torch/optim/adagrad.lua
+++ /dev/null
@@ -1,55 +0,0 @@
---[[ ADAGRAD implementation for SGD
-
-ARGS:
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.learningRate` : learning rate
-- `state.paramVariance` : vector of temporal variances of parameters
-- `state.weightDecay` : scalar that controls weight decay
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-]]
-function optim.adagrad(opfunc, x, config, state)
- -- (0) get/update state
- if config == nil and state == nil then
- print('no state table, ADAGRAD initializing')
- end
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 1e-3
- local lrd = config.learningRateDecay or 0
- local wd = config.weightDecay or 0
- state.evalCounter = state.evalCounter or 0
- local nevals = state.evalCounter
-
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- (2) weight decay with a single parameter
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- (3) learning rate decay (annealing)
- local clr = lr / (1 + nevals*lrd)
-
- -- (4) parameter update with single or individual learning rates
- if not state.paramVariance then
- state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
- state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
- state.paramVariance:addcmul(1,dfdx,dfdx)
- state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt()
- x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10))
-
- -- (5) update evaluation counter
- state.evalCounter = state.evalCounter + 1
-
- -- return x*, f(x) before optimization
- return x,{fx}
-end
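A brief sketch of the single-table calling convention used by adagrad (and most optimizers here): because of `state = state or config`, one table can both configure the run and accumulate `paramVariance`/`evalCounter` across calls; `feval`, `params` and `nSteps` are assumed as above:

   local adagradState = {learningRate = 1e-2, weightDecay = 1e-5}
   for step = 1, nSteps do
      optim.adagrad(feval, params, adagradState)   -- state defaults to the config table
   end
   print('evaluations so far:', adagradState.evalCounter)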
diff --git a/contrib/torch/optim/adam.lua b/contrib/torch/optim/adam.lua
deleted file mode 100644
index 2e127e96a..000000000
--- a/contrib/torch/optim/adam.lua
+++ /dev/null
@@ -1,72 +0,0 @@
---[[ An implementation of Adam https://arxiv.org/abs/1412.6980
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point
-  of evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `config` : a table with configuration parameters for the optimizer
-- `config.learningRate` : learning rate
-- `config.learningRateDecay` : learning rate decay
-- `config.beta1` : first moment coefficient
-- `config.beta2` : second moment coefficient
-- `config.epsilon` : for numerical stability
-- `config.weightDecay` : weight decay
-- `state` : a table describing the state of the optimizer; after each
-    call the state is modified
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-]]
-
-function optim.adam(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 0.001
- local lrd = config.learningRateDecay or 0
-
- local beta1 = config.beta1 or 0.9
- local beta2 = config.beta2 or 0.999
- local epsilon = config.epsilon or 1e-8
- local wd = config.weightDecay or 0
-
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- Initialization
- state.t = state.t or 0
- -- Exponential moving average of gradient values
- state.m = state.m or x.new(dfdx:size()):zero()
- -- Exponential moving average of squared gradient values
- state.v = state.v or x.new(dfdx:size()):zero()
- -- A tmp tensor to hold the sqrt(v) + epsilon
- state.denom = state.denom or x.new(dfdx:size()):zero()
-
- -- (3) learning rate decay (annealing)
- local clr = lr / (1 + state.t*lrd)
-
- state.t = state.t + 1
-
- -- Decay the first and second moment running average coefficient
- state.m:mul(beta1):add(1-beta1, dfdx)
- state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
-
- state.denom:copy(state.v):sqrt():add(epsilon)
-
- local biasCorrection1 = 1 - beta1^state.t
- local biasCorrection2 = 1 - beta2^state.t
- local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
- -- (4) update x
- x:addcdiv(-stepSize, state.m, state.denom)
-
- -- return x*, f(x) before optimization
- return x, {fx}
-end
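A hedged example of a typical Adam configuration (the numeric values are common defaults chosen for illustration, not taken from rspamd; `feval` and `params` are assumed as above):

   local adamConfig = {
      learningRate = 1e-3,
      beta1 = 0.9,
      beta2 = 0.999,
      epsilon = 1e-8,
      weightDecay = 1e-4,
   }
   local adamState = {}
   local _, loss = optim.adam(feval, params, adamConfig, adamState)
   -- adamState.m and adamState.v now hold the first and second moment estimates
   print('loss before the step:', loss[1])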
diff --git a/contrib/torch/optim/adamax.lua b/contrib/torch/optim/adamax.lua
deleted file mode 100644
index 2b6487720..000000000
--- a/contrib/torch/optim/adamax.lua
+++ /dev/null
@@ -1,66 +0,0 @@
---[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point
-  of evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `config` : a table with configuration parameters for the optimizer
-- `config.learningRate` : learning rate
-- `config.beta1` : first moment coefficient
-- `config.beta2` : second moment coefficient
-- `config.epsilon` : for numerical stability
-- `state` : a table describing the state of the optimizer;
- after each call the state is modified.
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-]]
-
-function optim.adamax(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 0.002
-
- local beta1 = config.beta1 or 0.9
- local beta2 = config.beta2 or 0.999
- local epsilon = config.epsilon or 1e-38
- local wd = config.weightDecay or 0
-
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- Initialization
- state.t = state.t or 0
- -- Exponential moving average of gradient values
- state.m = state.m or x.new(dfdx:size()):zero()
- -- Exponential moving average of the infinity norm
- state.u = state.u or x.new(dfdx:size()):zero()
- -- A tmp tensor to hold the input to max()
- state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
-
- state.t = state.t + 1
-
- -- Update biased first moment estimate.
- state.m:mul(beta1):add(1-beta1, dfdx)
- -- Update the exponentially weighted infinity norm.
- state.max[1]:copy(state.u):mul(beta2)
- state.max[2]:copy(dfdx):abs():add(epsilon)
- state.u:max(state.max, 1)
-
- local biasCorrection1 = 1 - beta1^state.t
- local stepSize = lr/biasCorrection1
- -- (2) update x
- x:addcdiv(-stepSize, state.m, state.u)
-
- -- return x*, f(x) before optimization
- return x, {fx}
-end
diff --git a/contrib/torch/optim/asgd.lua b/contrib/torch/optim/asgd.lua
deleted file mode 100644
index cc1c459f3..000000000
--- a/contrib/torch/optim/asgd.lua
+++ /dev/null
@@ -1,73 +0,0 @@
---[[ An implementation of ASGD
-
-ASGD:
-
- x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
- a := a + mu_t [ x - a ]
-
- eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
- mu_t = 1/max(1,t-t0)
-
-implements the ASGD algorithm as in L. Bottou's sgd-2.0
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.eta0` : learning rate
-- `state.lambda` : decay term
-- `state.alpha` : power for eta update
-- `state.t0` : point at which to start averaging
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-- `ax` : the averaged x vector
-
-(Clement Farabet, 2012)
---]]
-function optim.asgd(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- config.eta0 = config.eta0 or 1e-4
- config.lambda = config.lambda or 1e-4
- config.alpha = config.alpha or 0.75
- config.t0 = config.t0 or 1e6
-
- -- (hidden state)
- state.eta_t = state.eta_t or config.eta0
- state.mu_t = state.mu_t or 1
- state.t = state.t or 0
-
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- (2) decay term
- x:mul(1 - config.lambda*state.eta_t)
-
- -- (3) update x
- x:add(-state.eta_t, dfdx)
-
- -- (4) averaging
- state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero()
- state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax)
- if state.mu_t ~= 1 then
- state.tmp:copy(x)
- state.tmp:add(-1,state.ax):mul(state.mu_t)
- state.ax:add(state.tmp)
- else
- state.ax:copy(x)
- end
-
- -- (5) update eta_t and mu_t
- state.t = state.t + 1
- state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha)
- state.mu_t = 1 / math.max(1, state.t - config.t0)
-
- -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...)
- return x,{fx},state.ax
-end
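A sketch showing asgd's extra return value: once averaging kicks in (after `t0` evaluations), the averaged iterate `ax` is usually what gets evaluated; the hyper-parameter values below and `feval`/`params`/`nSteps` are illustrative assumptions:

   local asgdState = {eta0 = 1e-3, t0 = 1000}
   local x, fs, ax
   for step = 1, nSteps do
      x, fs, ax = optim.asgd(feval, params, asgdState)
   end
   -- test with the averaged parameters rather than the last iterate
   params:copy(ax)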
diff --git a/contrib/torch/optim/cg.lua b/contrib/torch/optim/cg.lua
deleted file mode 100644
index 842a7d569..000000000
--- a/contrib/torch/optim/cg.lua
+++ /dev/null
@@ -1,208 +0,0 @@
---[[
-
-This cg implementation is a rewrite of minimize.m written by Carl
-E. Rasmussen. It is supposed to produce exactly the same results (give
-or take numerical accuracy due to a changed order of
-operations). You can compare the result on rosenbrock with minimize.m.
-http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html
-
- [x fx c] = minimize([0 0]', 'rosenbrock', -25)
-
-Note that we limit only the number of function evaluations, as this seems much
-more important in practical use.
-
-ARGS:
-
-- `opfunc` : a function that takes a single input, the point of evaluation.
-- `x` : the initial point
-- `state` : a table of parameters and temporary allocations.
-- `state.maxEval` : max number of function evaluations
-- `state.maxIter` : max number of iterations
-- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage
-- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage
-
-RETURN:
-
-- `x*` : the new x vector, at the optimal point
-- `f` : a table of all function values where
- `f[1]` is the value of the function before any optimization and
- `f[#f]` is the final fully optimized value, at x*
-
-(Koray Kavukcuoglu, 2012)
---]]
-function optim.cg(opfunc, x, config, state)
- -- parameters
- local config = config or {}
- local state = state or config
- local rho = config.rho or 0.01
- local sig = config.sig or 0.5
- local int = config.int or 0.1
- local ext = config.ext or 3.0
- local maxIter = config.maxIter or 20
- local ratio = config.ratio or 100
- local maxEval = config.maxEval or maxIter*1.25
- local red = 1
-
- local verbose = config.verbose or 0
-
- local i = 0
- local ls_failed = 0
- local fx = {}
-
- -- we need three points for the interpolation/extrapolation stuff
- local z1,z2,z3 = 0,0,0
- local d1,d2,d3 = 0,0,0
- local f1,f2,f3 = 0,0,0
-
- local df1 = state.df1 or x.new()
- local df2 = state.df2 or x.new()
- local df3 = state.df3 or x.new()
- local tdf
-
- df1:resizeAs(x)
- df2:resizeAs(x)
- df3:resizeAs(x)
-
- -- search direction
- local s = state.s or x.new()
- s:resizeAs(x)
-
- -- we need a temp storage for X
- local x0 = state.x0 or x.new()
- local f0 = 0
- local df0 = state.df0 or x.new()
- x0:resizeAs(x)
- df0:resizeAs(x)
-
- -- evaluate at initial point
- f1,tdf = opfunc(x)
- fx[#fx+1] = f1
- df1:copy(tdf)
- i=i+1
-
- -- initial search direction
- s:copy(df1):mul(-1)
-
- d1 = -s:dot(s ) -- slope
- z1 = red/(1-d1) -- initial step
-
- while i < math.abs(maxEval) do
-
- x0:copy(x)
- f0 = f1
- df0:copy(df1)
-
- x:add(z1,s)
- f2,tdf = opfunc(x)
- df2:copy(tdf)
- i=i+1
- d2 = df2:dot(s)
- f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1
- local m = math.min(maxIter,maxEval-i)
- local success = 0
- local limit = -1
-
- while true do
- while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
- limit = z1
- if f2 > f1 then
- z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
- else
- local A = 6*(f2-f3)/z3+3*(d2+d3)
- local B = 3*(f3-f2)-z3*(d3+2*d2)
- z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
- end
- if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
- z2 = z3/2;
- end
- z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
- z1 = z1 + z2;
- x:add(z2,s)
- f2,tdf = opfunc(x)
- df2:copy(tdf)
- i=i+1
- m = m - 1
- d2 = df2:dot(s)
- z3 = z3-z2;
- end
- if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
- break
- elseif d2 > sig*d1 then
- success = 1;
- break;
- elseif m == 0 then
- break;
- end
- local A = 6*(f2-f3)/z3+3*(d2+d3);
- local B = 3*(f3-f2)-z3*(d3+2*d2);
- z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
-
- if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
- if limit < -0.5 then
- z2 = z1 * (ext -1)
- else
- z2 = (limit-z1)/2
- end
- elseif (limit > -0.5) and (z2+z1) > limit then
- z2 = (limit-z1)/2
- elseif limit < -0.5 and (z2+z1) > z1*ext then
- z2 = z1*(ext-1)
- elseif z2 < -z3*int then
- z2 = -z3*int
- elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
- z2 = (limit-z1)*(1-int)
- end
- f3=f2; d3=d2; z3=-z2;
- z1 = z1+z2;
- x:add(z2,s)
-
- f2,tdf = opfunc(x)
- df2:copy(tdf)
- i=i+1
- m = m - 1
- d2 = df2:dot(s)
- end
- if success == 1 then
- f1 = f2
- fx[#fx+1] = f1;
- local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
- s:mul(ss)
- s:add(-1,df2)
- local tmp = df1:clone()
- df1:copy(df2)
- df2:copy(tmp)
- d2 = df1:dot(s)
- if d2> 0 then
- s:copy(df1)
- s:mul(-1)
- d2 = -s:dot(s)
- end
-
- z1 = z1 * math.min(ratio, d1/(d2-1e-320))
- d1 = d2
- ls_failed = 0
- else
- x:copy(x0)
- f1 = f0
- df1:copy(df0)
- if ls_failed or i>maxEval then
- break
- end
- local tmp = df1:clone()
- df1:copy(df2)
- df2:copy(tmp)
- s:copy(df1)
- s:mul(-1)
- d1 = -s:dot(s)
- z1 = 1/(1-d1)
- ls_failed = 1
- end
- end
- state.df0 = df0
- state.df1 = df1
- state.df2 = df2
- state.df3 = df3
- state.x0 = x0
- state.s = s
- return x,fx,i
-end
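The minimize.m comparison from the docstring, rewritten as a hedged Lua sketch; `rosenbrock` stands for any opfunc returning f(x) and df/dx (the optim test suite ships one, but it is not part of this diff):

   local x0 = torch.Tensor(2):zero()
   local xstar, fx, nEvals = optim.cg(rosenbrock, x0, {maxEval = 25})
   print('initial f:', fx[1], 'final f:', fx[#fx], 'evaluations:', nEvals)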
diff --git a/contrib/torch/optim/checkgrad.lua b/contrib/torch/optim/checkgrad.lua
deleted file mode 100644
index 0382b2132..000000000
--- a/contrib/torch/optim/checkgrad.lua
+++ /dev/null
@@ -1,52 +0,0 @@
---[[ An implementation of a simple numerical gradient checker.
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `eps` : the epsilon to use for the numerical check (default is 1e-7)
-
-RETURN:
-
-- `diff` : relative error in the gradient; should be close to zero
-- `dC` : exact gradient at the point
-- `dC_est` : numerically estimated gradient at the point
-
-]]--
-
-
--- function that numerically checks the gradient of an objective function:
-function optim.checkgrad(opfunc, x, eps)
-
- -- compute true gradient:
- local Corg,dC = opfunc(x)
- dC:resize(x:size())
-
- local Ctmp -- temporary value
- local isTensor = torch.isTensor(Corg)
- if isTensor then
- Ctmp = Corg.new(Corg:size())
- end
-
- -- compute numeric approximations to gradient:
- local eps = eps or 1e-7
- local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC)
- for i = 1,dC:size(1) do
- local tmp = x[i]
- x[i] = x[i] + eps
- local C1 = opfunc(x)
- if isTensor then
- Ctmp:copy(C1)
- C1 = Ctmp
- end
- x[i] = x[i] - 2 * eps
- local C2 = opfunc(x)
- x[i] = tmp
- dC_est[i] = (C1 - C2) / (2 * eps)
- end
-
- -- estimate error of gradient:
- local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est)
- return diff,dC,dC_est
-end
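A small sketch of checking an analytic gradient with the helper above; `feval` is the same loss/gradient closure assumed earlier, and the tolerance is illustrative:

   local diff, dC, dC_est = optim.checkgrad(feval, params:clone(), 1e-7)
   -- the relative error should be tiny; a large value usually means a buggy gradient
   assert(diff < 1e-4, ('gradient check failed: relative error %g'):format(diff))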
diff --git a/contrib/torch/optim/cmaes.lua b/contrib/torch/optim/cmaes.lua
deleted file mode 100644
index 74cd58a0c..000000000
--- a/contrib/torch/optim/cmaes.lua
+++ /dev/null
@@ -1,270 +0,0 @@
-require 'torch'
-require 'math'
-
-local BestSolution = {}
---[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
-ported from https://www.lri.fr/~hansen/barecmaes2.html.
-
-Parameters
-----------
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX. Note that df/dX is not used
-- `x` : the initial point
-- `state.sigma`
- float, initial step-size (standard deviation in each
- coordinate)
-- `state.maxEval`
- int, maximal number of function evaluations
-- `state.ftarget`
-    float, target function value; stop if fitness < ftarget
-- `state.popsize`
-    population size. If this is left empty,
-    4 + int(3 * log(|x|)) will be used
-- `state.verb_disp`
- int, display on console every verb_disp iteration, 0 for never
-
-RETURN:
-- `x*` : the new `x` vector, at the optimal point
-- `f` : a table of all function values:
- `f[1]` is the value of the function before any optimization and
- `f[#f]` is the final fully optimized value, at `x*`
---]]
-function optim.cmaes(opfunc, x, config, state)
- if (x.triu == nil or x.diag == nil) then
- error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x")
- end
- -- process input parameters
- local config = config or {}
- local state = state or config
- local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
- local N = xmean:size(1) -- number of objective variables/problem dimension
- local sigma = state.sigma -- coordinate wise standard deviation (step size)
- local ftarget = state.ftarget -- stop if fitness < ftarget
- local maxEval = tonumber(state.maxEval) or 1e3*N^2
- local objfunc = opfunc
- local verb_disp = state.verb_disp -- display step size
- local min_iterations = state.min_iterations or 1
-
- local lambda = state.popsize -- population size, offspring number
- -- Strategy parameter setting: Selection
- if state.popsize == nil then
- lambda = 4 + math.floor(3 * math.log(N))
- end
-
- local mu = lambda / 2 -- number of parents/points for recombination
- local weights = torch.range(0,mu-1):apply(function(i)
- return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
- weights:div(weights:sum()) -- normalize recombination weights array
- local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
- weights = weights:typeAs(x)
-
- -- Strategy parameter setting: Adaptation
- local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C
- local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control
- local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C
- local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
- local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1
-
- -- Initialize dynamic (internal) state variables
- local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
- local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
- local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
- local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
- local C = torch.eye(N):typeAs(x) -- covariance matrix
- if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
- error('torch.symeig not available for ' .. x:type() ..
- " please use Float- or DoubleTensor for x")
- end
- local candidates = torch.Tensor(lambda,N):typeAs(x)
- local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
- local eigeneval = 0 -- tracking the update of B and D
- local counteval = 0
- local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
- local fitvals = torch.Tensor(lambda)-- fitness values
- local best = BestSolution.new(nil,nil,counteval)
- local iteration = 0 -- iteration of the optimize loop
-
-
- local function ask()
- --[[return a list of lambda candidate solutions according to
- m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
- --]]
- -- Eigendecomposition: first update B, D and invsqrtC from C
- -- postpone in case to achieve O(N^2)
- if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then
- eigeneval = counteval
- C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry
- D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3)
- D = torch.sqrt(D) -- D contains standard deviations now
- invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t())
- end
- for k=1,lambda do --repeat lambda times
- local z = D:clone():normal(0,1):cmul(D)
- candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma)
- end
-
- return candidates
- end
-
-
- local function tell(arx)
- --[[update the evolution paths and the distribution parameters m,
- sigma, and C within CMA-ES.
-
- Parameters
- ----------
- `arx`
- a list of solutions, presumably from `ask()`
- `fitvals`
- the corresponding objective function values --]]
- -- bookkeeping, preparation
- counteval = counteval + lambda -- slightly artificial to do here
- local xold = xmean:clone()
-
- -- Sort by fitness and compute weighted mean into xmean
- local arindex = nil --sorted indices
- fitvals, arindex = torch.sort(fitvals)
- arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions
-
- table.insert(f_hist, fitvals[1]) --append best fitness to history
- best:update(arx[1], fitvals[1], counteval)
-
- xmean:zero()
- xmean:addmv(arx:t(), weights) --dot product
-
- -- Cumulation: update evolution paths
- local y = xmean - xold
- local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold)
-
- local c = (cs * (2-cs) * mueff)^0.5 / sigma
- ps = ps - ps * cs + z * c -- exponential decay on ps
- local hsig = (torch.sum(torch.pow(ps,2)) /
- (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
- hsig = hsig and 1.0 or 0.0 --use binary numbers
-
- c = (cc * (2-cc) * mueff)^0.5 / sigma
- pc = pc - pc * cc + y * c * hsig -- exponential decay on pc
-
- -- Adapt covariance matrix C
- local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc)
- -- for a minor adjustment to the variance loss by hsig
- for i=1,N do
- for j=1,N do
- local r = torch.range(1,mu)
- r:apply(function(k)
- return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
- local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
- C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
- c1 * pc[i]*pc[j] + cmu * Cmuij)
- end
- end
-
- -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
- sigma = sigma * math.exp(math.min(0.6,
- (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
- end
-
- local function stop()
- --[[return satisfied termination conditions in a table like
- {'termination reason':value, ...}, for example {'tolfun':1e-12},
- or the empty table {}--]]
- local res = {}
- if counteval > 0 then
- if counteval >= maxEval then
- res['evals'] = maxEval
- end
- if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then
- res['ftarget'] = ftarget
- end
- if torch.max(D) > 1e7 * torch.min(D) then
- res['condition'] = 1e7
- end
- if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
- res['tolfun'] = 1e-12
- end
- if sigma * torch.max(D) < 1e-11 then
- -- remark: max(D) >= max(diag(C))^0.5
- res['tolx'] = 1e-11
- end
- end
- return res
- end
-
- local function disp(verb_modulo)
- --[[display some iteration info--]]
- if verb_disp == 0 then
- return nil
- end
- local iteration = counteval / lambda
-
- if iteration == 1 or iteration % (10*verb_modulo) == 0 then
- print('evals:\t ax-ratio max(std) f-value')
- end
- if iteration <= 2 or iteration % verb_modulo == 0 then
- local max_std = math.sqrt(torch.max(torch.diag(C)))
- print(tostring(counteval).. ': ' ..
- string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
- .. tostring(fitvals[1]))
- end
-
- return nil
- end
-
- while next(stop()) == nil or iteration < min_iterations do
- iteration = iteration + 1
-
- local X = ask() -- deliver candidate solutions
- for i=1, lambda do
- -- put candidate tensor back in input shape and evaluate in opfunc
- local candidate = X[i]:viewAs(x)
- fitvals[i] = objfunc(candidate)
- end
-
- tell(X)
- disp(verb_disp)
- end
-
- local bestmu, f, c = best:get()
- if verb_disp > 0 then
- for k, v in pairs(stop()) do
- print('termination by', k, '=', v)
- end
- print('best f-value =', f)
- print('solution = ')
- print(bestmu)
- print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration)
- end
- table.insert(f_hist, f)
-
- return bestmu, f_hist, counteval
-end
-
-
-
-BestSolution.__index = BestSolution
-function BestSolution.new(x, f, evals)
- local self = setmetatable({}, BestSolution)
- self.x = x
- self.f = f
- self.evals = evals
- return self
-end
-
-function BestSolution.update(self, arx, arf, evals)
- --[[initialize the best solution with `x`, `f`, and `evals`.
- Better solutions have smaller `f`-values.--]]
- if self.f == nil or arf < self.f then
- self.x = arx:clone()
- self.f = arf
- self.evals = evals
- end
- return self
-end
-
-function BestSolution.get(self)
- return self.x, self.f, self.evals
-end
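A hedged sketch of derivative-free use of cmaes (per the docstring, df/dX is never used, so the objective only needs to return the loss); the toy sphere objective and the dimensionality are assumptions:

   local x0 = torch.DoubleTensor(10):fill(0.5)
   local cmaesState = {sigma = 0.5, maxEval = 2000, verb_disp = 0}
   local function objective(x)
      return torch.sum(torch.pow(x, 2))   -- toy sphere function, minimum at 0
   end
   local xbest, fhist = optim.cmaes(objective, x0, cmaesState)
   print('best f found:', fhist[#fhist])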
diff --git a/contrib/torch/optim/de.lua b/contrib/torch/optim/de.lua
deleted file mode 100644
index 1e8e8001d..000000000
--- a/contrib/torch/optim/de.lua
+++ /dev/null
@@ -1,109 +0,0 @@
---[[ An implementation of `DE` (Differential Evolution),
-
-ARGS:
-
- -`opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX. Note that df/dX is not used
- -`x` : the initial point
- -`state.popsize`: population size. If this is left empty, 10*d will be used
- -`state.scaleFactor`: float, usually between 0.4 and 1
- -`state.crossoverRate`: float, usually between 0.1 and 0.9
- -`state.maxFEs`: int, maximal number of function evaluations
-
-RETURN:
- - `x*` : the new `x` vector, at the optimal point
- - `f` : a table of all function values:
- `f[1]` is the value of the function before any optimization and
- `f[#f]` is the final fully optimized value, at `x*`
-]]
-
-require 'torch'
-
-function optim.de(opfunc, x, config, state)
- -- process input parameters
- local config = config or {}
- local state = state
- local popsize = config.popsize -- population size
- local scaleFactor = config.scaleFactor -- scale factor
- local crossoverRate = config.crossoverRate -- crossover rate
- local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations
- local maxRegion = config.maxRegion -- upper bound of search region
- local minRegion = config.minRegion -- lower bound of search region
- local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
- local D = xmean:size(1) -- number of objective variables/problem dimension
-
- if config.popsize == nil then
- popsize = 10 * D
- end
- if config.maxRegion == nil then
- maxRegion = 30
- end
- if config.minRegion == nil then
- minRegion = -30
- end
-
- -- Initialize population
- local fx = x.new(maxFEs)
- local pop = x.new(popsize, D)
- local children = x.new(popsize, D)
- local fitness = x.new(popsize)
- local children_fitness = x.new(popsize)
- local fes = 1 -- number of function evaluations
- local best_fitness
- local best_solution = x.new(D)
-
-   -- Initialize the population and evaluate its fitness values
- local gen = torch.Generator()
- torch.manualSeed(gen, 1)
-
- pop:uniform(gen, minRegion, maxRegion)
- for i = 1, popsize do
- fitness[i] = opfunc(pop[i])
- fx[fes] = fitness[i]
- fes = fes + 1
- end
-
- -- Find the best solution
- local index
- best_fitness, index = fitness:max(1)
- best_fitness = best_fitness[1]
- index = index[1]
- best_solution:copy(pop[index])
-
- -- Main loop
- while fes < maxFEs do
- local r1, r2
- for i = 1, popsize do
- repeat
- r1 = torch.random(gen, 1, popsize)
- until(r1 ~= i)
- repeat
- r2 = torch.random(gen, 1, popsize)
- until(r2 ~= r1 and r2 ~= i)
-
- local jrand = torch.random(gen, 1, D)
- for j = 1, D do
- if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then
- children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j])
- else
- children[i][j] = pop[i][j]
- end
- end
- children_fitness[i] = opfunc(children[i])
- fx[fes] = children_fitness[i]
- fes = fes + 1
- end
-
- for i = 1, popsize do
- if children_fitness[i] <= fitness[i] then
- pop[i]:copy(children[i])
- fitness[i] = children_fitness[i]
- if fitness[i] < best_fitness then
- best_fitness = fitness[i]
- best_solution:copy(children[i])
- end
- end
- end
- end
- return best_solution, fx
-end
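A hedged sketch for the DE interface above, reusing the toy `objective` from the CMA-ES sketch; note that the implementation reads `maxFEs` and defaults the search region to [-30, 30]:

   local deConfig = {popsize = 50, scaleFactor = 0.6, crossoverRate = 0.9, maxFEs = 5000}
   local x0 = torch.DoubleTensor(10):zero()   -- only the size/type of x matters for initialisation
   local xbest = optim.de(objective, x0, deConfig)
   print('best f found:', objective(xbest))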
diff --git a/contrib/torch/optim/fista.lua b/contrib/torch/optim/fista.lua
deleted file mode 100644
index c8c6f5e43..000000000
--- a/contrib/torch/optim/fista.lua
+++ /dev/null
@@ -1,192 +0,0 @@
---[[ FISTA with backtracking line search
-
-- `f` : smooth function
-- `g` : non-smooth function
-- `pl` : minimizer of intermediate problem Q(x,y)
-- `xinit` : initial point
-- `params` : table of parameters (**optional**)
-- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1)
-- `params.Lstep` : step size multiplier at each iteration (1.5)
-- `params.maxiter` : max number of iterations (50)
-- `params.maxline` : max number of line search iterations per iteration (20)
-- `params.errthres`: Error threshold for convergence check (1e-4)
-- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true)
-- `params.verbose` : store each iteration solution and print detailed info (false)
-
-On output, `params` will contain these additional fields that can be reused.
-
-- `params.L` : last used L value will be written.
-
-These are temporary storages needed by the algorithm; if the same params object is
-passed a second time, the same storages will be reused without new allocation.
-
-- `params.xkm` : previous iteration point
-- `params.y` : fista iteration
-- `params.ply` : ply = pl(y - 1/L grad(f))
-
-Returns the solution x and a history of {function evals, number of line search steps, ...}
-
-Algorithm is published in
-
- @article{beck-fista-09,
- Author = {Beck, Amir and Teboulle, Marc},
- Journal = {SIAM J. Img. Sci.},
- Number = {1},
- Pages = {183--202},
- Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
- Volume = {2},
- Year = {2009}}
-]]
-function optim.FistaLS(f, g, pl, xinit, params)
-
- local params = params or {}
- local L = params.L or 0.1
- local Lstep = params.Lstep or 1.5
- local maxiter = params.maxiter or 50
- local maxline = params.maxline or 20
- local errthres = params.errthres or 1e-4
- local doFistaUpdate = params.doFistaUpdate
- local verbose = params.verbose
-
- -- temporary allocations
- params.xkm = params.xkm or torch.Tensor()
- params.y = params.y or torch.Tensor()
- params.ply = params.ply or torch.Tensor()
- local xkm = params.xkm -- previous iteration
- local y = params.y -- fista iteration
- local ply = params.ply -- soft shrinked y
-
- -- we start from all zeros
- local xk = xinit
- xkm:resizeAs(xk):zero()
- ply:resizeAs(xk):zero()
- y:resizeAs(xk):zero()
-
- local history = {} -- keep track of stuff
- local niter = 0 -- number of iterations done
- local converged = false -- are we done?
- local tk = 1 -- momentum param for FISTA
- local tkp = 0
-
-
- local gy = g(y)
- local fval = math.huge -- fval = f+g
- while not converged and niter < maxiter do
-
-      -- run through the smooth function
- -- get derivatives from smooth function
- local fy,gfy = f(y,'dx')
- --local gfy = f(y)
-
- local fply = 0
- local gply = 0
- local Q = 0
-
- ----------------------------------------------
- -- do line search to find new current location starting from fista loc
- local nline = 0
- local linesearchdone = false
- while not linesearchdone do
- -- take a step in gradient direction of smooth function
- ply:copy(y)
- ply:add(-1/L,gfy)
-
- -- and solve for minimum of auxiliary problem
- pl(ply,L)
- -- this is candidate for new current iteration
- xk:copy(ply)
-
- -- evaluate this point F(ply)
- fply = f(ply)
-
- -- ply - y
- ply:add(-1, y)
- -- <ply-y , \Grad(f(y))>
- local Q2 = gfy:dot(ply)
- -- L/2 ||beta-y||^2
- local Q3 = L/2 * ply:dot(ply)
- -- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta)
- Q = fy + Q2 + Q3
-
- if verbose then
- print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
- end
- -- check if F(beta) < Q(pl(y),\t)
- if fply <= Q then --and Fply + Gply <= F then
- -- now evaluate G here
- linesearchdone = true
- elseif nline >= maxline then
- linesearchdone = true
- xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
- --print('oops')
- else
- L = L * Lstep
- end
- nline = nline + 1
- end
- -- end line search
- ---------------------------------------------
-
- ---------------------------------------------
- -- FISTA
- ---------------------------------------------
- if doFistaUpdate then
- -- do the FISTA step
- tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2
- -- x(k-1) = x(k-1) - x(k)
- xkm:add(-1,xk)
- -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k))
- y:copy(xk)
- y:add( (1-tk)/tkp , xkm)
- -- store for next iterations
- -- x(k-1) = x(k)
- xkm:copy(xk)
- else
- y:copy(xk)
- end
- -- t(k) = t(k+1)
- tk = tkp
- fply = f(y)
- gply = g(y)
- if verbose then
- print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply))
- end
-
- niter = niter + 1
-
-      -- bookkeeping
- fval = fply + gply
- history[niter] = {}
- history[niter].nline = nline
- history[niter].L = L
- history[niter].F = fval
- history[niter].Fply = fply
- history[niter].Gply = gply
- history[niter].Q = Q
- params.L = L
- if verbose then
- history[niter].xk = xk:clone()
- history[niter].y = y:clone()
- end
-
- -- are we done?
- if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then
- converged = true
- xinit:copy(y)
- return y,history
- end
-
- if niter >= maxiter then
- xinit:copy(y)
- return y,history
- end
-
- --if niter > 1 and history[niter].F > history[niter-1].F then
- --print(niter, 'This was supposed to be a convex function, we are going up')
- --converged = true
- --return xk,history
- --end
- end
- error('not supposed to be here')
-end
-
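A hedged end-to-end sketch of the FistaLS contract above on a toy LASSO problem; the matrix `A`, vector `b`, weight `lambda` and the soft-thresholding prox are all illustrative assumptions:

   -- smooth part f(x) = 0.5*||Ax - b||^2, non-smooth part g(x) = lambda*||x||_1
   local A = torch.randn(20, 50)
   local b = torch.randn(20)
   local lambda = 0.1

   local function f(x, mode)
      local resid = torch.mv(A, x) - b
      local fval = 0.5 * resid:dot(resid)
      if mode == 'dx' then
         return fval, torch.mv(A:t(), resid)   -- value and gradient of the smooth part
      end
      return fval
   end

   local function g(x)
      return lambda * torch.abs(x):sum()
   end

   -- pl(y, L) must overwrite y with the minimizer of the auxiliary problem:
   -- soft-thresholding at lambda/L
   local function pl(y, L)
      local thresh = lambda / L
      y:apply(function(v)
         if v > thresh then return v - thresh
         elseif v < -thresh then return v + thresh
         else return 0 end
      end)
   end

   local x0 = torch.zeros(50)
   local xstar, history = optim.FistaLS(f, g, pl, x0, {maxiter = 100, doFistaUpdate = true})
   print('iterations:', #history, 'final objective:', history[#history].F)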
diff --git a/contrib/torch/optim/init.lua b/contrib/torch/optim/init.lua
deleted file mode 100644
index a045bd8a2..000000000
--- a/contrib/torch/optim/init.lua
+++ /dev/null
@@ -1,33 +0,0 @@
-
-require 'torch'
-
-optim = {}
-
--- optimizations
-require('optim.sgd')
-require('optim.cg')
-require('optim.asgd')
-require('optim.nag')
-require('optim.fista')
-require('optim.lbfgs')
-require('optim.adagrad')
-require('optim.rprop')
-require('optim.adam')
-require('optim.adamax')
-require('optim.rmsprop')
-require('optim.adadelta')
-require('optim.cmaes')
-require('optim.de')
-
--- line search functions
-require('optim.lswolfe')
-
--- helpers
-require('optim.polyinterp')
-require('optim.checkgrad')
-
--- tools
-require('optim.ConfusionMatrix')
-require('optim.Logger')
-
-return optim
diff --git a/contrib/torch/optim/lbfgs.lua b/contrib/torch/optim/lbfgs.lua
deleted file mode 100644
index d850fcbb3..000000000
--- a/contrib/torch/optim/lbfgs.lua
+++ /dev/null
@@ -1,268 +0,0 @@
---[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt)
-
-This implementation of L-BFGS relies on a user-provided line
-search function (state.lineSearch). If this function is not
-provided, then a simple learningRate is used to produce fixed
-size steps. Fixed size steps are much less costly than line
-searches, and can be useful for stochastic problems.
-
-The learning rate is used even when a line search is provided.
-This is also useful for large-scale stochastic problems, where
-opfunc is a noisy approximation of f(x). In that case, the learning
-rate allows a reduction of confidence in the step size.
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.maxIter` : Maximum number of iterations allowed
-- `state.maxEval` : Maximum number of function evaluations
-- `state.tolFun` : Termination tolerance on the first-order optimality
-- `state.tolX` : Termination tol on progress in terms of func/param changes
-- `state.lineSearch` : A line search function
-- `state.learningRate` : If no line search provided, then a fixed step size is used
-
-RETURN:
-- `x*` : the new `x` vector, at the optimal point
-- `f` : a table of all function values:
- `f[1]` is the value of the function before any optimization and
- `f[#f]` is the final fully optimized value, at `x*`
-
-(Clement Farabet, 2012)
-]]
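-
---[[ Usage sketch (illustrative, not part of the original file): `feval` and
-`params` below are assumed names for the objective closure and the flat
-parameter tensor.
-
-   local params = torch.randn(10)
-   local function feval(x)
-      return 0.5 * x:dot(x), x:clone()   -- quadratic bowl and its gradient
-   end
-   local state = {maxIter = 100, lineSearch = optim.lswolfe}
-   local xStar, fHist = optim.lbfgs(feval, params, state)
-   -- fHist[1] is f before optimization, fHist[#fHist] the final value
-]]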
-function optim.lbfgs(opfunc, x, config, state)
- -- get/update state
- local config = config or {}
- local state = state or config
- local maxIter = tonumber(config.maxIter) or 20
- local maxEval = tonumber(config.maxEval) or maxIter*1.25
- local tolFun = config.tolFun or 1e-5
- local tolX = config.tolX or 1e-9
- local nCorrection = config.nCorrection or 100
- local lineSearch = config.lineSearch
- local lineSearchOpts = config.lineSearchOptions
- local learningRate = config.learningRate or 1
- local isverbose = config.verbose or false
-
- state.funcEval = state.funcEval or 0
- state.nIter = state.nIter or 0
-
- -- verbose function
- local verbose
- if isverbose then
- verbose = function(...) print('<optim.lbfgs> ', ...) end
- else
- verbose = function() end
- end
-
- -- import some functions
- local abs = math.abs
- local min = math.min
-
- -- evaluate initial f(x) and df/dx
- local f,g = opfunc(x)
- local f_hist = {f}
- local currentFuncEval = 1
- state.funcEval = state.funcEval + 1
- local p = g:size(1)
-
- -- check optimality of initial point
- state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1
- tmp1:copy(g):abs()
- if tmp1:sum() <= tolFun then
- -- optimality condition below tolFun
- verbose('optimality condition below tolFun')
- return x,f_hist
- end
-
- if not state.dir_bufs then
- -- reusable buffers for y's and s's, and their histories
- verbose('creating recyclable direction/step/history buffers')
- state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1)
- state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1)
- for i=1,#state.dir_bufs do
- state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1)
- state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1)
- end
- end
-
- -- variables cached in state (for tracing)
- local d = state.d
- local t = state.t
- local old_dirs = state.old_dirs
- local old_stps = state.old_stps
- local Hdiag = state.Hdiag
- local g_old = state.g_old
- local f_old = state.f_old
-
- -- optimize for a max of maxIter iterations
- local nIter = 0
- while nIter < maxIter do
- -- keep track of nb of iterations
- nIter = nIter + 1
- state.nIter = state.nIter + 1
-
- ------------------------------------------------------------
- -- compute gradient descent direction
- ------------------------------------------------------------
- if state.nIter == 1 then
- d = g:clone():mul(-1) -- -g
- old_dirs = {}
- old_stps = {}
- Hdiag = 1
- else
- -- do lbfgs update (update memory)
- local y = table.remove(state.dir_bufs) -- pop
- local s = table.remove(state.stp_bufs)
- y:add(g, -1, g_old) -- g - g_old
- s:mul(d, t) -- d*t
- local ys = y:dot(s) -- y*s
- if ys > 1e-10 then
- -- updating memory
- if #old_dirs == nCorrection then
- -- shift history by one (limited-memory)
- local removed1 = table.remove(old_dirs, 1)
- local removed2 = table.remove(old_stps, 1)
- table.insert(state.dir_bufs, removed1)
- table.insert(state.stp_bufs, removed2)
- end
-
- -- store new direction/step
- table.insert(old_dirs, s)
- table.insert(old_stps, y)
-
- -- update scale of initial Hessian approximation
- Hdiag = ys / y:dot(y) -- (y*y)
- else
- -- put y and s back into the buffer pool
- table.insert(state.dir_bufs, y)
- table.insert(state.stp_bufs, s)
- end
-
- -- compute the approximate (L-BFGS) inverse Hessian
- -- multiplied by the gradient
- local k = #old_dirs
-
- -- need to be accessed element-by-element, so don't re-type tensor:
- state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro
- for i = 1,k do
- ro[i] = 1 / old_stps[i]:dot(old_dirs[i])
- end
-
- -- iteration in L-BFGS loop collapsed to use just one buffer
- local q = tmp1 -- reuse tmp1 for the q buffer
- -- need to be accessed element-by-element, so don't re-type tensor:
- state.al = state.al or torch.zeros(nCorrection); local al = state.al
-
- q:mul(g, -1) -- -g
- for i = k,1,-1 do
- al[i] = old_dirs[i]:dot(q) * ro[i]
- q:add(-al[i], old_stps[i])
- end
-
- -- multiply by initial Hessian
- local r = d -- share the same buffer, since we don't need the old d
- r:mul(q, Hdiag) -- q[1] * Hdiag
- for i = 1,k do
- local be_i = old_stps[i]:dot(r) * ro[i]
- r:add(al[i]-be_i, old_dirs[i])
- end
- -- final direction is in r/d (same object)
- end
- g_old = g_old or g:clone()
- g_old:copy(g)
- f_old = f
-
- ------------------------------------------------------------
- -- compute step length
- ------------------------------------------------------------
- -- directional derivative
- local gtd = g:dot(d) -- g * d
-
- -- check that progress can be made along that direction
- if gtd > -tolX then
- break
- end
-
- -- reset initial guess for step size
- if state.nIter == 1 then
- tmp1:copy(g):abs()
- t = min(1,1/tmp1:sum()) * learningRate
- else
- t = learningRate
- end
-
- -- optional line search: user function
- local lsFuncEval = 0
- if lineSearch and type(lineSearch) == 'function' then
- -- perform line search, using user function
- f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
- table.insert(f_hist, f)
- else
- -- no line search, simply move with fixed-step
- x:add(t,d)
- if nIter ~= maxIter then
- -- re-evaluate function only if not in last iteration
- -- the reason we do this: in a stochastic setting,
- -- no use to re-evaluate that function here
- f,g = opfunc(x)
- lsFuncEval = 1
- table.insert(f_hist, f)
- end
- end
-
- -- update func eval
- currentFuncEval = currentFuncEval + lsFuncEval
- state.funcEval = state.funcEval + lsFuncEval
-
- ------------------------------------------------------------
- -- check conditions
- ------------------------------------------------------------
- if nIter == maxIter then
- -- no use to run tests
- verbose('reached max number of iterations')
- break
- end
-
- if currentFuncEval >= maxEval then
- -- max nb of function evals
- verbose('max nb of function evals')
- break
- end
-
- tmp1:copy(g):abs()
- if tmp1:sum() <= tolFun then
- -- check optimality
- verbose('optimality condition below tolFun')
- break
- end
-
- tmp1:copy(d):mul(t):abs()
- if tmp1:sum() <= tolX then
- -- step size below tolX
- verbose('step size below tolX')
- break
- end
-
- if abs(f-f_old) < tolX then
- -- function value changing less than tolX
- verbose('function value changing less than tolX')
- break
- end
- end
-
- -- save state
- state.old_dirs = old_dirs
- state.old_stps = old_stps
- state.Hdiag = Hdiag
- state.g_old = g_old
- state.f_old = f_old
- state.t = t
- state.d = d
-
- -- return optimal x, and history of f(x)
- return x,f_hist,currentFuncEval
-end
diff --git a/contrib/torch/optim/lswolfe.lua b/contrib/torch/optim/lswolfe.lua
deleted file mode 100644
index 0afbdbe8b..000000000
--- a/contrib/torch/optim/lswolfe.lua
+++ /dev/null
@@ -1,192 +0,0 @@
---[[ A Line Search satisfying the Wolfe conditions
-
-ARGS:
-- `opfunc` : a function (the objective) that takes a single input (X),
- the point of evaluation, and returns f(X) and df/dX
-- `x` : initial point / starting location
-- `t` : initial step size
-- `d` : descent direction
-- `f` : initial function value
-- `g` : gradient at initial location
-- `gtd` : directional derivative at starting location
-- `options.c1` : sufficient decrease parameter
-- `options.c2` : curvature parameter
-- `options.tolX` : minimum allowable step length
-- `options.maxIter` : maximum nb of iterations
-
-RETURN:
-- `f` : function value at x+t*d
-- `g` : gradient value at x+t*d
-- `x` : the next x (=x+t*d)
-- `t` : the step length
-- `lsFuncEval` : the number of function evaluations
-]]
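-
---[[ Usage sketch (illustrative, not part of the original file): this routine is
-normally plugged into optim.lbfgs as its line search rather than called
-directly; `feval` and `params` are assumed names.
-
-   local state = {maxIter = 100, lineSearch = optim.lswolfe,
-                  lineSearchOptions = {c1 = 1e-4, c2 = 0.9}}
-   local xStar, fHist = optim.lbfgs(feval, params, state)
-]]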
-function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options)
- -- options
- options = options or {}
- local c1 = options.c1 or 1e-4
- local c2 = options.c2 or 0.9
- local tolX = options.tolX or 1e-9
- local maxIter = options.maxIter or 20
- local isverbose = options.verbose or false
-
- -- some shortcuts
- local abs = torch.abs
- local min = math.min
- local max = math.max
-
- -- verbose function
- local function verbose(...)
- if isverbose then print('<optim.lswolfe> ', ...) end
- end
-
- -- evaluate objective and gradient using initial step
- local x_init = x:clone()
- x:add(t,d)
- local f_new,g_new = opfunc(x)
- local lsFuncEval = 1
- local gtd_new = g_new * d
-
- -- bracket an interval containing a point satisfying the Wolfe
- -- criteria
- local LSiter,t_prev,done = 0,0,false
- local f_prev,g_prev,gtd_prev = f,g:clone(),gtd
- local bracket,bracketFval,bracketGval
- while LSiter < maxIter do
- -- check conditions:
- if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then
- bracket = x.new{t_prev,t}
- bracketFval = x.new{f_prev,f_new}
- bracketGval = x.new(2,g_new:size(1))
- bracketGval[1] = g_prev
- bracketGval[2] = g_new
- break
-
- elseif abs(gtd_new) <= -c2*gtd then
- bracket = x.new{t}
- bracketFval = x.new{f_new}
- bracketGval = x.new(1,g_new:size(1))
- bracketGval[1] = g_new
- done = true
- break
-
- elseif gtd_new >= 0 then
- bracket = x.new{t_prev,t}
- bracketFval = x.new{f_prev,f_new}
- bracketGval = x.new(2,g_new:size(1))
- bracketGval[1] = g_prev
- bracketGval[2] = g_new
- break
-
- end
-
- -- interpolate:
- local tmp = t_prev
- t_prev = t
- local minStep = t + 0.01*(t-tmp)
- local maxStep = t*10
- t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev},
- {t,f_new,gtd_new}},
- minStep, maxStep)
-
- -- next step:
- f_prev = f_new
- g_prev = g_new:clone()
- gtd_prev = gtd_new
- x[{}] = x_init
- x:add(t,d)
- f_new,g_new = opfunc(x)
- lsFuncEval = lsFuncEval + 1
- gtd_new = g_new * d
- LSiter = LSiter + 1
- end
-
- -- reached max nb of iterations?
- if LSiter == maxIter then
- bracket = x.new{0,t}
- bracketFval = x.new{f,f_new}
- bracketGval = x.new(2,g_new:size(1))
- bracketGval[1] = g
- bracketGval[2] = g_new
- end
-
- -- zoom phase: we now have a point satisfying the criteria, or
- -- a bracket around it. We refine the bracket until we find the
- -- exact point satisfying the criteria
- local insufProgress = false
- local LOposRemoved = 0
- while not done and LSiter < maxIter do
- -- find high and low points in bracket
- local f_LO,LOpos = bracketFval:min(1)
- LOpos = LOpos[1]; f_LO = f_LO[1]
- local HIpos = -LOpos+3
-
- -- compute new trial value
- t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d},
- {bracket[2],bracketFval[2],bracketGval[2]*d}})
-
- -- test whether we are making sufficient progress
- if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then
- if insufProgress or t>=bracket:max() or t <= bracket:min() then
- if abs(t-bracket:max()) < abs(t-bracket:min()) then
- t = bracket:max()-0.1*(bracket:max()-bracket:min())
- else
- t = bracket:min()+0.1*(bracket:max()-bracket:min())
- end
- insufProgress = false
- else
- insufProgress = true
- end
- else
- insufProgress = false
- end
-
- -- Evaluate new point
- x[{}] = x_init
- x:add(t,d)
- f_new,g_new = opfunc(x)
- lsFuncEval = lsFuncEval + 1
- gtd_new = g_new * d
- LSiter = LSiter + 1
- if f_new > f + c1*t*gtd or f_new >= f_LO then
- -- Armijo condition not satisfied or not lower than lowest point
- bracket[HIpos] = t
- bracketFval[HIpos] = f_new
- bracketGval[HIpos] = g_new
- else
- if abs(gtd_new) <= - c2*gtd then
- -- Wolfe conditions satisfied
- done = true
- elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then
- -- Old HI becomes new LO
- bracket[HIpos] = bracket[LOpos]
- bracketFval[HIpos] = bracketFval[LOpos]
- bracketGval[HIpos] = bracketGval[LOpos]
- end
- -- New point becomes new LO
- bracket[LOpos] = t
- bracketFval[LOpos] = f_new
- bracketGval[LOpos] = g_new
- end
-
- -- done?
- if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then
- break
- end
- end
-
- -- be verbose
- if LSiter == maxIter then
- verbose('reached max number of iterations')
- end
-
- -- return stuff
- local _,LOpos = bracketFval:min(1)
- LOpos = LOpos[1]
- t = bracket[LOpos]
- f_new = bracketFval[LOpos]
- g_new = bracketGval[LOpos]
- x[{}] = x_init
- x:add(t,d)
- return f_new,g_new,x,t,lsFuncEval
-end
diff --git a/contrib/torch/optim/nag.lua b/contrib/torch/optim/nag.lua
deleted file mode 100644
index 875d81e4c..000000000
--- a/contrib/torch/optim/nag.lua
+++ /dev/null
@@ -1,86 +0,0 @@
-----------------------------------------------------------------------
--- An implementation of SGD adapted with features of Nesterov's
--- Accelerated Gradient method, based on the paper
--- On the Importance of Initialization and Momentum in Deep Learning
--- Sutskever et al., ICML 2013
---
--- ARGS:
--- opfunc : a function that takes a single input (X), the point of
--- evaluation, and returns f(X) and df/dX
--- x : the initial point
--- state : a table describing the state of the optimizer; after each
--- call the state is modified
--- state.learningRate : learning rate
--- state.learningRateDecay : learning rate decay
--- state.weightDecay : weight decay
--- state.momentum : momentum
--- state.learningRates : vector of individual learning rates
---
--- RETURN:
--- x : the new x vector
--- f(x) : the function, evaluated before the update
---
--- (Dilip Krishnan, 2013)
---
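-
---[[ Usage sketch (illustrative, not part of the original file): `feval` and
-`params` are assumed names for the objective closure and the parameter tensor.
-Note that `momentum` must be strictly positive for this method.
-
-   local config = {learningRate = 1e-2, momentum = 0.9}
-   for i = 1, 100 do
-      optim.nag(feval, params, config)
-   end
-]]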
-
-function optim.nag(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 1e-3
- local lrd = config.learningRateDecay or 0
- local wd = config.weightDecay or 0
- local mom = config.momentum or 0.9
- local damp = config.dampening or mom
- local lrs = config.learningRates
- state.evalCounter = state.evalCounter or 0
- local nevals = state.evalCounter
-
- if mom <= 0 then
- error('Momentum must be positive for Nesterov Accelerated Gradient')
- end
-
- -- (1) evaluate f(x) and df/dx
- -- first step in the direction of the momentum vector
-
- if state.dfdx then
- x:add(mom, state.dfdx)
- end
- -- then compute gradient at that point
- -- comment out the momentum step above to recover the original SGD
- local fx,dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- (3) learning rate decay (annealing)
- local clr = lr / (1 + nevals*lrd)
-
- -- (4) apply momentum
- if not state.dfdx then
- state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0)
- else
- state.dfdx:mul(mom)
- end
-
- -- (5) parameter update with single or individual learning rates
- if lrs then
- if not state.deltaParameters then
- state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
- state.deltaParameters:copy(lrs):cmul(dfdx)
- x:add(-clr, state.deltaParameters)
- state.dfdx:add(-clr, state.deltaParameters)
- else
- x:add(-clr, dfdx)
- state.dfdx:add(-clr, dfdx)
- end
-
- -- (6) update evaluation counter
- state.evalCounter = state.evalCounter + 1
-
- -- return x, f(x) before optimization
- return x,{fx}
-end
diff --git a/contrib/torch/optim/polyinterp.lua b/contrib/torch/optim/polyinterp.lua
deleted file mode 100644
index c5026bf04..000000000
--- a/contrib/torch/optim/polyinterp.lua
+++ /dev/null
@@ -1,212 +0,0 @@
-local function isreal(x)
- return x == x
-end
-
-local function isnan(x)
- return x ~= x
-end
-
-local function roots(c)
- local tol=1e-12
- c[torch.lt(torch.abs(c),tol)]=0
-
- local nonzero = torch.ne(c,0)
- if nonzero:max() == 0 then
- return 0
- end
-
- -- first non-zero
- local _,pos = torch.max(nonzero,1)
- pos = pos[1]
- c=c[{ {pos,-1} }]
-
- local nz = 0
- for i=c:size(1),1,-1 do
- if c[i] ~= 0 then
- break
- else
- nz = nz + 1
- end
- end
- c=c[{ {1,c:size(1)-nz} }]
-
- local n = c:size(1)-1
- if n == 1 then
- local e = c.new({{-c[2]/c[1], 0}})
- if nz > 0 then
- return torch.cat(e, c.new(nz, 2):zero(), 1)
- else
- return e
- end
- elseif n > 1 then
- local A = torch.diag(c.new(n-1):fill(1),-1)
- A[1] = -c[{ {2,n+1} }]/c[1];
- local e = torch.eig(A,'N')
- if nz > 0 then
- return torch.cat(e, c.new(nz,2):zero(), 1)
- else
- return e
- end
- else
- return c.new(nz,2):zero()
- end
-end
-
-local function real(x)
- if type(x) == 'number' then return x end
- return x[{ {} , 1}]
-end
-
-local function imag(x)
- if type(x) == 'number' then return 0 end
- if x:nDimension() == 1 then
- return x.new(x:size(1)):zero()
- else
- return x[{ {}, 2}]
- end
-end
-
-local function polyval(p,x)
- local pwr = p:size(1)
- if type(x) == 'number' then
- local val = 0
- p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end)
- return val
- else
- local val = x.new(x:size(1))
- p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end)
- return val
- end
-end
-
-----------------------------------------------------------------------
--- Minimum of interpolating polynomial based on function and
--- derivative values
---
--- ARGS:
--- points : N triplets (x,f,g), must be a Tensor
--- xminBound : min value that brackets the minimum (default: min of points)
--- xmaxBound : max value that brackets the minimum (default: max of points)
---
--- RETURN:
--- minPos : position of minimum
---
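-
---[[ Usage sketch (illustrative, not part of the original file): interpolate two
-(x, f, g) triplets and return the minimizer of the fitted cubic.
-
-   -- point 1: x=0, f=1.0, g=-2.0 ; point 2: x=1, f=0.5, g=0.1
-   local pts = torch.Tensor{{0, 1.0, -2.0}, {1, 0.5, 0.1}}
-   local minPos = optim.polyinterp(pts)
-]]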
-function optim.polyinterp(points,xminBound,xmaxBound)
- -- locals
- local sqrt = torch.sqrt
- local mean = torch.mean
- local max = math.max
- local min = math.min
-
- -- nb of points / order of polynomial
- local nPoints = points:size(1)
- local order = nPoints*2-1
-
- -- returned values
- local minPos
-
- -- Code for most common case:
- -- + cubic interpolation of 2 points w/ function and derivative values for both
- -- + no xminBound/xmaxBound
- if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then
- -- Solution in this case (where x2 is the farthest point):
- -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
- -- d2 = sqrt(d1^2 - g1*g2);
- -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
- -- t_new = min(max(minPos,x1),x2);
- local minVal,minPos = points[{ {},1 }]:min(1)
- minVal = minVal[1]; minPos = minPos[1]
- local notMinPos = -minPos+3;
-
- local d1 = points[{minPos,3}] + points[{notMinPos,3}]
- - 3*(points[{minPos,2}]-points[{notMinPos,2}])
- / (points[{minPos,1}]-points[{notMinPos,1}]);
- local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]);
-
- if isreal(d2) then -- isreal()
- local t = points[{notMinPos,1}] - (points[{notMinPos,1}]
- - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1)
- / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2))
-
- minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}])
- else
- minPos = mean(points[{{},1}])
- end
- return minPos
- end
-
- -- TODO: get the code below to work!
- --error('<optim.polyinterp> extrapolation not implemented yet...')
-
- -- Compute Bounds of Interpolation Area
- local xmin = points[{{},1}]:min()
- local xmax = points[{{},1}]:max()
- xminBound = xminBound or xmin
- xmaxBound = xmaxBound or xmax
-
- -- Add constraints on function values
- local A = points.new(nPoints*2,order+1):zero()
- local b = points.new(nPoints*2,1):zero()
- for i = 1,nPoints do
- local constraint = points.new(order+1):zero()
- for j = order,0,-1 do
- constraint[order-j+1] = points[{i,1}]^j
- end
- A[i] = constraint
- b[i] = points[{i,2}]
- end
-
- -- Add constraints based on derivatives
- for i = 1,nPoints do
- local constraint = points.new(order+1):zero()
- for j = 1,order do
- constraint[j] = (order-j+1)*points[{i,1}]^(order-j)
- end
- A[nPoints+i] = constraint
- b[nPoints+i] = points[{i,3}]
- end
-
- -- Find interpolating polynomial
- local res = torch.gels(b,A)
- local params = res[{ {1,nPoints*2} }]:squeeze()
-
- params[torch.le(torch.abs(params),1e-12)]=0
-
- -- Compute Critical Points
- local dParams = points.new(order):zero();
- for i = 1,params:size(1)-1 do
- dParams[i] = params[i]*(order-i+1)
- end
-
- -- nan/inf?
- local nans = false
- if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then
- nans = true
- end
-
- local cp = torch.cat(points.new{xminBound,xmaxBound},points[{{},1}])
- if not nans then
- local cproots = roots(dParams)
- local cpi = points.new(cp:size(1),2):zero()
- cpi[{ {1,cp:size(1)} , 1 }] = cp
- cp = torch.cat(cpi,cproots,1)
- end
-
- -- Test Critical Points
- local fmin = math.huge
- -- Default to Bisection if no critical points valid:
- minPos = (xminBound+xmaxBound)/2
- for i = 1,cp:size(1) do
- local xCP = cp[{ {i,i} , {} }]
- local ixCP = imag(xCP)[1]
- local rxCP = real(xCP)[1]
- if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then
- local fCP = polyval(params,rxCP)
- if fCP < fmin then
- minPos = rxCP
- fmin = fCP
- end
- end
- end
- return minPos,fmin
-end
diff --git a/contrib/torch/optim/rmsprop.lua b/contrib/torch/optim/rmsprop.lua
deleted file mode 100644
index aa562006a..000000000
--- a/contrib/torch/optim/rmsprop.lua
+++ /dev/null
@@ -1,58 +0,0 @@
---[[ An implementation of RMSprop
-
-ARGS:
-
-- 'opfunc' : a function that takes a single input (X), the point
- of evaluation, and returns f(X) and df/dX
-- 'x' : the initial point
-- 'config' : a table with configuration parameters for the optimizer
-- 'config.learningRate' : learning rate
-- 'config.alpha' : smoothing constant
-- 'config.epsilon' : small constant added to the RMS denominator for numerical stability
-- 'config.weightDecay' : weight decay
-- 'state' : a table describing the state of the optimizer;
- after each call the state is modified
-- 'state.m' : leaky sum of squares of parameter gradients
-- 'state.tmp' : square root of 'state.m' (with epsilon smoothing)
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-]]
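-
---[[ Usage sketch (illustrative, not part of the original file): `feval` and
-`params` are assumed names for the objective closure and the parameter tensor.
-
-   local config = {learningRate = 1e-3, alpha = 0.99, epsilon = 1e-8}
-   for i = 1, 1000 do
-      optim.rmsprop(feval, params, config)
-   end
-]]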
-
-function optim.rmsprop(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 1e-2
- local alpha = config.alpha or 0.99
- local epsilon = config.epsilon or 1e-8
- local wd = config.weightDecay or 0
- local mfill = config.initialMean or 0
-
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
-
- -- (3) initialize mean square values and square gradient storage
- if not state.m then
- state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(mfill)
- state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
-
- -- (4) calculate new (leaky) mean squared values
- state.m:mul(alpha)
- state.m:addcmul(1.0-alpha, dfdx, dfdx)
-
- -- (5) perform update
- state.tmp:sqrt(state.m):add(epsilon)
- x:addcdiv(-lr, dfdx, state.tmp)
-
- -- return x*, f(x) before optimization
- return x, {fx}
-end
diff --git a/contrib/torch/optim/rprop.lua b/contrib/torch/optim/rprop.lua
deleted file mode 100644
index d7af16429..000000000
--- a/contrib/torch/optim/rprop.lua
+++ /dev/null
@@ -1,103 +0,0 @@
---[[ A plain implementation of RPROP
-
-ARGS:
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.stepsize` : initial step size, common to all components
-- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2)
-- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5)
-- `state.stepsizemax` : maximum stepsize allowed (default 50)
-- `state.stepsizemin` : minimum stepsize allowed (default 1e-6)
-- `state.niter` : number of iterations (default 1)
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-(Martin Riedmiller, Koray Kavukcuoglu 2013)
---]]
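-
---[[ Usage sketch (illustrative, not part of the original file): `feval` and
-`params` are assumed names; `niter` > 1 runs several RPROP iterations per call
-and returns the corresponding list of function values.
-
-   local config = {stepsize = 0.1, etaplus = 1.2, etaminus = 0.5, niter = 10}
-   local xStar, fHist = optim.rprop(feval, params, config)
-]]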
-function optim.rprop(opfunc, x, config, state)
- if config == nil and state == nil then
- print('no state table, RPROP initializing')
- end
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local stepsize = config.stepsize or 0.1
- local etaplus = config.etaplus or 1.2
- local etaminus = config.etaminus or 0.5
- local stepsizemax = config.stepsizemax or 50.0
- local stepsizemin = config.stepsizemin or 1E-06
- local niter = config.niter or 1
-
- local hfx = {}
-
- for i=1,niter do
-
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- init temp storage
- if not state.delta then
- state.delta = dfdx.new(dfdx:size()):zero()
- state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
- state.sign = dfdx.new(dfdx:size())
- state.psign = torch.ByteTensor(dfdx:size())
- state.nsign = torch.ByteTensor(dfdx:size())
- state.zsign = torch.ByteTensor(dfdx:size())
- state.dminmax = torch.ByteTensor(dfdx:size())
- if torch.type(x)=='torch.CudaTensor' then
- -- Push to GPU
- state.psign = state.psign:cuda()
- state.nsign = state.nsign:cuda()
- state.zsign = state.zsign:cuda()
- state.dminmax = state.dminmax:cuda()
- end
- end
-
- -- sign of derivative from last step to this one
- torch.cmul(state.sign, dfdx, state.delta)
- torch.sign(state.sign, state.sign)
-
- -- get indices of >0, <0 and ==0 entries
- state.sign.gt(state.psign, state.sign, 0)
- state.sign.lt(state.nsign, state.sign, 0)
- state.sign.eq(state.zsign, state.sign, 0)
-
- -- get step size updates
- state.sign[state.psign] = etaplus
- state.sign[state.nsign] = etaminus
- state.sign[state.zsign] = 1
-
- -- update stepsizes with step size updates
- state.stepsize:cmul(state.sign)
-
- -- threshold step sizes
- -- >50 => 50
- state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
- state.stepsize[state.dminmax] = stepsizemax
- -- <1e-6 ==> 1e-6
- state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
- state.stepsize[state.dminmax] = stepsizemin
-
- -- for dir<0, dfdx=0
- -- for dir>=0 dfdx=dfdx
- dfdx[state.nsign] = 0
- -- state.sign = sign(dfdx)
- torch.sign(state.sign,dfdx)
-
- -- update weights
- x:addcmul(-1,state.sign,state.stepsize)
-
- -- update state.dfdx with current dfdx
- state.delta:copy(dfdx)
-
- table.insert(hfx,fx)
- end
-
- -- return x*, f(x) before optimization
- return x,hfx
-end
diff --git a/contrib/torch/optim/sgd.lua b/contrib/torch/optim/sgd.lua
deleted file mode 100644
index e21c696a6..000000000
--- a/contrib/torch/optim/sgd.lua
+++ /dev/null
@@ -1,90 +0,0 @@
---[[ A plain implementation of SGD
-
-ARGS:
-
-- `opfunc` : a function that takes a single input (X), the point
- of evaluation, and returns f(X) and df/dX
-- `x` : the initial point
-- `config` : a table with configuration parameters for the optimizer
-- `config.learningRate` : learning rate
-- `config.learningRateDecay` : learning rate decay
-- `config.weightDecay` : weight decay
-- `config.weightDecays` : vector of individual weight decays
-- `config.momentum` : momentum
-- `config.dampening` : dampening for momentum
-- `config.nesterov` : enables Nesterov momentum
-- `config.learningRates` : vector of individual learning rates
-- `state` : a table describing the state of the optimizer; after each
- call the state is modified
-- `state.evalCounter` : evaluation counter (optional: 0, by default)
-
-RETURN:
-- `x` : the new x vector
-- `f(x)` : the function, evaluated before the update
-
-(Clement Farabet, 2012)
-]]
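-
---[[ Usage sketch (illustrative, not part of the original file): `feval` and
-`params` are assumed names for the objective closure and the parameter tensor.
-
-   local config = {learningRate = 1e-2, momentum = 0.9, weightDecay = 1e-4}
-   for i = 1, 1000 do
-      local _, fs = optim.sgd(feval, params, config)
-      -- fs[1] is f(params) evaluated before this update
-   end
-]]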
-function optim.sgd(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 1e-3
- local lrd = config.learningRateDecay or 0
- local wd = config.weightDecay or 0
- local mom = config.momentum or 0
- local damp = config.dampening or mom
- local nesterov = config.nesterov or false
- local lrs = config.learningRates
- local wds = config.weightDecays
- state.evalCounter = state.evalCounter or 0
- local nevals = state.evalCounter
- assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening")
-
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- (2) weight decay with single or individual parameters
- if wd ~= 0 then
- dfdx:add(wd, x)
- elseif wds then
- if not state.decayParameters then
- state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
- state.decayParameters:copy(wds):cmul(x)
- dfdx:add(state.decayParameters)
- end
-
- -- (3) apply momentum
- if mom ~= 0 then
- if not state.dfdx then
- state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx)
- else
- state.dfdx:mul(mom):add(1-damp, dfdx)
- end
- if nesterov then
- dfdx:add(mom, state.dfdx)
- else
- dfdx = state.dfdx
- end
- end
-
- -- (4) learning rate decay (annealing)
- local clr = lr / (1 + nevals*lrd)
-
- -- (5) parameter update with single or individual learning rates
- if lrs then
- if not state.deltaParameters then
- state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
- state.deltaParameters:copy(lrs):cmul(dfdx)
- x:add(-clr, state.deltaParameters)
- else
- x:add(-clr, dfdx)
- end
-
- -- (6) update evaluation counter
- state.evalCounter = state.evalCounter + 1
-
- -- return x*, f(x) before optimization
- return x,{fx}
-end